X86ISelLowering.cpp revision c25ccf85e55137d9d5cc6f607317d841ff5ae347
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the interfaces that X86 uses to lower LLVM code into a
11// selection DAG.
12//
13//===----------------------------------------------------------------------===//
14
15#define DEBUG_TYPE "x86-isel"
16#include "X86.h"
17#include "X86InstrBuilder.h"
18#include "X86ISelLowering.h"
19#include "X86TargetMachine.h"
20#include "X86TargetObjectFile.h"
21#include "llvm/CallingConv.h"
22#include "llvm/Constants.h"
23#include "llvm/DerivedTypes.h"
24#include "llvm/GlobalAlias.h"
25#include "llvm/GlobalVariable.h"
26#include "llvm/Function.h"
27#include "llvm/Instructions.h"
28#include "llvm/Intrinsics.h"
29#include "llvm/LLVMContext.h"
30#include "llvm/CodeGen/MachineFrameInfo.h"
31#include "llvm/CodeGen/MachineFunction.h"
32#include "llvm/CodeGen/MachineInstrBuilder.h"
33#include "llvm/CodeGen/MachineJumpTableInfo.h"
34#include "llvm/CodeGen/MachineModuleInfo.h"
35#include "llvm/CodeGen/MachineRegisterInfo.h"
36#include "llvm/CodeGen/PseudoSourceValue.h"
37#include "llvm/MC/MCAsmInfo.h"
38#include "llvm/MC/MCContext.h"
39#include "llvm/MC/MCExpr.h"
40#include "llvm/MC/MCSymbol.h"
41#include "llvm/ADT/BitVector.h"
42#include "llvm/ADT/SmallSet.h"
43#include "llvm/ADT/Statistic.h"
44#include "llvm/ADT/StringExtras.h"
45#include "llvm/ADT/VectorExtras.h"
46#include "llvm/Support/CommandLine.h"
47#include "llvm/Support/Debug.h"
48#include "llvm/Support/Dwarf.h"
49#include "llvm/Support/ErrorHandling.h"
50#include "llvm/Support/MathExtras.h"
51#include "llvm/Support/raw_ostream.h"
52using namespace llvm;
53using namespace dwarf;
54
55STATISTIC(NumTailCalls, "Number of tail calls");
56
57static cl::opt<bool>
58DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
59
60// Forward declarations.
61static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
62                       SDValue V2);
63
64static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
65  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
66  default: llvm_unreachable("unknown subtarget type");
67  case X86Subtarget::isDarwin:
68    if (TM.getSubtarget<X86Subtarget>().is64Bit())
69      return new X8664_MachoTargetObjectFile();
70    return new TargetLoweringObjectFileMachO();
71  case X86Subtarget::isELF:
72    if (TM.getSubtarget<X86Subtarget>().is64Bit())
73      return new X8664_ELFTargetObjectFile(TM);
74    return new X8632_ELFTargetObjectFile(TM);
75  case X86Subtarget::isMingw:
76  case X86Subtarget::isCygwin:
77  case X86Subtarget::isWindows:
78    return new TargetLoweringObjectFileCOFF();
79  }
80}
81
82X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
83  : TargetLowering(TM, createTLOF(TM)) {
84  Subtarget = &TM.getSubtarget<X86Subtarget>();
85  X86ScalarSSEf64 = Subtarget->hasSSE2();
86  X86ScalarSSEf32 = Subtarget->hasSSE1();
87  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
88
89  RegInfo = TM.getRegisterInfo();
90  TD = getTargetData();
91
92  // Set up the TargetLowering object.
93
94  // X86 is weird; it always uses i8 for shift amounts and setcc results.
95  setShiftAmountType(MVT::i8);
96  setBooleanContents(ZeroOrOneBooleanContent);
97  setSchedulingPreference(Sched::RegPressure);
98  setStackPointerRegisterToSaveRestore(X86StackPtr);
99
100  if (Subtarget->isTargetDarwin()) {
101    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
102    setUseUnderscoreSetJmp(false);
103    setUseUnderscoreLongJmp(false);
104  } else if (Subtarget->isTargetMingw()) {
105    // MS runtime is weird: it exports _setjmp, but plain longjmp.
106    setUseUnderscoreSetJmp(true);
107    setUseUnderscoreLongJmp(false);
108  } else {
109    setUseUnderscoreSetJmp(true);
110    setUseUnderscoreLongJmp(true);
111  }
112
113  // Set up the register classes.
114  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
115  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
116  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
117  if (Subtarget->is64Bit())
118    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
119
120  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
121
122  // We don't accept any truncstore of integer registers.
123  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
124  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
125  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
126  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
127  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
128  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
129
130  // SETOEQ and SETUNE require checking two conditions.
131  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
132  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
133  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
134  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
135  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
136  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
137
138  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
139  // operation.
140  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
141  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
142  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
143
144  if (Subtarget->is64Bit()) {
145    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
146    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
147  } else if (!UseSoftFloat) {
148    // With SSE2 we have a custom algorithm for i64 -> f64; other targets use a
149    // 64-bit FILD followed by a conditional FADD.
150    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
151    // With SSE2 we also have a custom algorithm for i32; other targets use a
152    // 64-bit FILD.
153    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
154  }
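  // Rough illustration: "Promote" widens the unsigned operand first, e.g. a
  // u16 -> f32 convert is done by zero-extending to i32 and then using the
  // signed i32 -> f32 path, which x86 handles directly.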
155
156  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
157  // this operation.
158  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
159  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
160
161  if (!UseSoftFloat) {
162    // SSE has no i16 to fp conversion, only i32
163    if (X86ScalarSSEf32) {
164      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
165      // f32 and f64 cases are Legal, f80 case is not
166      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
167    } else {
168      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
169      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
170    }
171  } else {
172    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
173    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
174  }
175
176  // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
177  // are Legal, f80 is custom lowered.
178  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
179  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
180
181  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
182  // this operation.
183  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
184  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
185
186  if (X86ScalarSSEf32) {
187    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
188    // f32 and f64 cases are Legal, f80 case is not
189    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
190  } else {
191    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
192    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
193  }
194
195  // Handle FP_TO_UINT by promoting the destination to a larger signed
196  // conversion.
197  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
198  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
199  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
200
201  if (Subtarget->is64Bit()) {
202    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
203    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
204  } else if (!UseSoftFloat) {
205    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
206      // Expand FP_TO_UINT into a select.
207      // FIXME: We would like to use a Custom expander here eventually to do
208      // the optimal thing for SSE vs. the default expansion in the legalizer.
209      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
210    else
211      // With SSE3 we can use fisttpll to convert to a signed i64; without
212      // SSE, we're stuck with a fistpll.
213      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
214  }
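  // Roughly, the Expand of FP_TO_UINT i32 above selects between a plain signed
  // convert and a signed convert of (x - 2^31) with the sign bit added back,
  // since x86 only provides FP-to-signed-int instructions.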
215
216  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
217  if (!X86ScalarSSEf64) {
218    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
219    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
220    if (Subtarget->is64Bit()) {
221      setOperationAction(ISD::BIT_CONVERT    , MVT::f64  , Expand);
222      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
223      if (Subtarget->hasMMX() && !DisableMMX)
224        setOperationAction(ISD::BIT_CONVERT    , MVT::i64  , Custom);
225      else
226        setOperationAction(ISD::BIT_CONVERT    , MVT::i64  , Expand);
227    }
228  }
229
230  // Scalar integer divide and remainder are lowered to use operations that
231  // produce two results, to match the available instructions. This exposes
232  // the two-result form to trivial CSE, which is able to combine x/y and x%y
233  // into a single instruction.
234  //
235  // Scalar integer multiply-high is also lowered to use two-result
236  // operations, to match the available instructions. However, plain multiply
237  // (low) operations are left as Legal, as there are single-result
238  // instructions for this in x86. Using the two-result multiply instructions
239  // when both high and low results are needed must be arranged by dagcombine.
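  // For example, computing both x / y and x % y yields a single two-result
  // divide node after CSE, so only one hardware div/idiv is executed.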
240  setOperationAction(ISD::MULHS           , MVT::i8    , Expand);
241  setOperationAction(ISD::MULHU           , MVT::i8    , Expand);
242  setOperationAction(ISD::SDIV            , MVT::i8    , Expand);
243  setOperationAction(ISD::UDIV            , MVT::i8    , Expand);
244  setOperationAction(ISD::SREM            , MVT::i8    , Expand);
245  setOperationAction(ISD::UREM            , MVT::i8    , Expand);
246  setOperationAction(ISD::MULHS           , MVT::i16   , Expand);
247  setOperationAction(ISD::MULHU           , MVT::i16   , Expand);
248  setOperationAction(ISD::SDIV            , MVT::i16   , Expand);
249  setOperationAction(ISD::UDIV            , MVT::i16   , Expand);
250  setOperationAction(ISD::SREM            , MVT::i16   , Expand);
251  setOperationAction(ISD::UREM            , MVT::i16   , Expand);
252  setOperationAction(ISD::MULHS           , MVT::i32   , Expand);
253  setOperationAction(ISD::MULHU           , MVT::i32   , Expand);
254  setOperationAction(ISD::SDIV            , MVT::i32   , Expand);
255  setOperationAction(ISD::UDIV            , MVT::i32   , Expand);
256  setOperationAction(ISD::SREM            , MVT::i32   , Expand);
257  setOperationAction(ISD::UREM            , MVT::i32   , Expand);
258  setOperationAction(ISD::MULHS           , MVT::i64   , Expand);
259  setOperationAction(ISD::MULHU           , MVT::i64   , Expand);
260  setOperationAction(ISD::SDIV            , MVT::i64   , Expand);
261  setOperationAction(ISD::UDIV            , MVT::i64   , Expand);
262  setOperationAction(ISD::SREM            , MVT::i64   , Expand);
263  setOperationAction(ISD::UREM            , MVT::i64   , Expand);
264
265  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
266  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
267  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
268  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
269  if (Subtarget->is64Bit())
270    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
271  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
272  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
273  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
274  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
275  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
276  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
277  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
278  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
279
280  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
281  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
282  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
283  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
284  setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
285  setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
286  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
287  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
288  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
289  if (Subtarget->is64Bit()) {
290    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
291    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
292    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
293  }
294
295  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
296  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
297
298  // These should be promoted to a larger select which is supported.
299  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
300  // X86 wants to expand cmov itself.
301  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
302  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
303  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
304  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
305  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
306  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
307  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
308  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
309  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
310  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
311  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
312  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
313  if (Subtarget->is64Bit()) {
314    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
315    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
316  }
317  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
318
319  // Darwin ABI issue.
320  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
321  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
322  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
323  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
324  if (Subtarget->is64Bit())
325    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
326  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
327  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
328  if (Subtarget->is64Bit()) {
329    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
330    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
331    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
332    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
333    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
334  }
335  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
336  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
337  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
338  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
339  if (Subtarget->is64Bit()) {
340    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
341    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
342    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
343  }
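  // The *_PARTS nodes let a 64-bit shift on 32-bit x86 be expressed in terms
  // of the value's two 32-bit register halves instead of a runtime libcall.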
344
345  if (Subtarget->hasSSE1())
346    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
347
348  if (!Subtarget->hasSSE2())
349    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
350  // On X86 and X86-64, atomic operations are lowered to locked instructions.
351  // Locked instructions, in turn, have implicit fence semantics (all memory
352  // operations are flushed before issuing the locked instruction, and they
353  // are not buffered), so we can fold away the common pattern of
354  // fence-atomic-fence.
355  setShouldFoldAtomicFences(true);
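  // For example, a fence/atomic-add/fence sequence collapses to a single
  // LOCK-prefixed add; the explicit fences are redundant with its ordering.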
356
357  // Expand certain atomics
358  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
359  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
360  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
361  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
362
363  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
364  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
365  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
366  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
367
368  if (!Subtarget->is64Bit()) {
369    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
370    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
371    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
372    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
373    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
374    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
375    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
376  }
377
378  // FIXME - use subtarget debug flags
379  if (!Subtarget->isTargetDarwin() &&
380      !Subtarget->isTargetELF() &&
381      !Subtarget->isTargetCygMing()) {
382    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
383  }
384
385  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
386  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
387  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
388  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
389  if (Subtarget->is64Bit()) {
390    setExceptionPointerRegister(X86::RAX);
391    setExceptionSelectorRegister(X86::RDX);
392  } else {
393    setExceptionPointerRegister(X86::EAX);
394    setExceptionSelectorRegister(X86::EDX);
395  }
396  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
397  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
398
399  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
400
401  setOperationAction(ISD::TRAP, MVT::Other, Legal);
402
403  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
404  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
405  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
406  if (Subtarget->is64Bit()) {
407    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
408    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
409  } else {
410    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
411    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
412  }
413
414  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
415  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
416  if (Subtarget->is64Bit())
417    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
418  if (Subtarget->isTargetCygMing())
419    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
420  else
421    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
422
423  if (!UseSoftFloat && X86ScalarSSEf64) {
424    // f32 and f64 use SSE.
425    // Set up the FP register classes.
426    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
427    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
428
429    // Use ANDPD to simulate FABS.
430    setOperationAction(ISD::FABS , MVT::f64, Custom);
431    setOperationAction(ISD::FABS , MVT::f32, Custom);
432
433    // Use XORP to simulate FNEG.
434    setOperationAction(ISD::FNEG , MVT::f64, Custom);
435    setOperationAction(ISD::FNEG , MVT::f32, Custom);
436
437    // Use ANDPD and ORPD to simulate FCOPYSIGN.
438    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
439    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
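    // All three work by bit-twiddling the IEEE sign bit: FABS masks it off,
    // FNEG flips it, and FCOPYSIGN combines one operand's magnitude with the
    // other's sign, using packed logical ops on XMM registers.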
440
441    // We don't support sin/cos/fmod
442    setOperationAction(ISD::FSIN , MVT::f64, Expand);
443    setOperationAction(ISD::FCOS , MVT::f64, Expand);
444    setOperationAction(ISD::FSIN , MVT::f32, Expand);
445    setOperationAction(ISD::FCOS , MVT::f32, Expand);
446
447    // Expand FP immediates into loads from the stack, except for the special
448    // cases we handle.
449    addLegalFPImmediate(APFloat(+0.0)); // xorpd
450    addLegalFPImmediate(APFloat(+0.0f)); // xorps
451  } else if (!UseSoftFloat && X86ScalarSSEf32) {
452    // Use SSE for f32, x87 for f64.
453    // Set up the FP register classes.
454    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
455    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
456
457    // Use ANDPS to simulate FABS.
458    setOperationAction(ISD::FABS , MVT::f32, Custom);
459
460    // Use XORP to simulate FNEG.
461    setOperationAction(ISD::FNEG , MVT::f32, Custom);
462
463    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
464
465    // Use ANDPS and ORPS to simulate FCOPYSIGN.
466    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
467    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
468
469    // We don't support sin/cos/fmod
470    setOperationAction(ISD::FSIN , MVT::f32, Expand);
471    setOperationAction(ISD::FCOS , MVT::f32, Expand);
472
473    // Special cases we handle for FP constants.
474    addLegalFPImmediate(APFloat(+0.0f)); // xorps
475    addLegalFPImmediate(APFloat(+0.0)); // FLD0
476    addLegalFPImmediate(APFloat(+1.0)); // FLD1
477    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
478    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
479
480    if (!UnsafeFPMath) {
481      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
482      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
483    }
484  } else if (!UseSoftFloat) {
485    // f32 and f64 in x87.
486    // Set up the FP register classes.
487    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
488    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
489
490    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
491    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
492    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
493    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
494
495    if (!UnsafeFPMath) {
496      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
497      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
498    }
499    addLegalFPImmediate(APFloat(+0.0)); // FLD0
500    addLegalFPImmediate(APFloat(+1.0)); // FLD1
501    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
502    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
503    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
504    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
505    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
506    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
507  }
508
509  // Long double always uses X87.
510  if (!UseSoftFloat) {
511    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
512    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
513    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
514    {
515      bool ignored;
516      APFloat TmpFlt(+0.0);
517      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
518                     &ignored);
519      addLegalFPImmediate(TmpFlt);  // FLD0
520      TmpFlt.changeSign();
521      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
522      APFloat TmpFlt2(+1.0);
523      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
524                      &ignored);
525      addLegalFPImmediate(TmpFlt2);  // FLD1
526      TmpFlt2.changeSign();
527      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
528    }
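    // Net effect: +/-0.0 and +/-1.0 in x87 extended precision are legal
    // immediates, materialized with FLD0/FLD1 plus an FCHS for the negatives.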
529
530    if (!UnsafeFPMath) {
531      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
532      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
533    }
534  }
535
536  // Always use a library call for pow.
537  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
538  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
539  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
540
541  setOperationAction(ISD::FLOG, MVT::f80, Expand);
542  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
543  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
544  setOperationAction(ISD::FEXP, MVT::f80, Expand);
545  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
546
547  // First set operation action for all vector types to either promote
548  // (for widening) or expand (for scalarization). Then we will selectively
549  // turn on ones that can be effectively codegen'd.
550  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
551       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
552    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
553    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
554    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
555    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
556    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
557    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
558    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
559    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
560    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
561    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
562    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
563    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
564    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
565    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
566    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
567    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
568    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
569    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
570    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
571    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
572    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
573    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
574    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
575    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
576    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
577    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
578    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
579    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
580    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
581    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
582    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
583    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
584    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
585    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
586    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
587    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
588    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
589    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
590    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
591    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
592    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
593    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
594    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
595    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
596    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
597    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
598    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
599    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
600    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
601    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
602    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
603    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
604    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
605    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
606         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
607      setTruncStoreAction((MVT::SimpleValueType)VT,
608                          (MVT::SimpleValueType)InnerVT, Expand);
609    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
610    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
611    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
612  }
613
614  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
615  // with -msoft-float, disable use of MMX as well.
616  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
617    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass, false);
618    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
619    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);
620    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass, false);
621    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);
622
623    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
624    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
625    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
626    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
627
628    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
629    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
630    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
631    setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
632
633    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
634    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
635
636    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
637    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
638    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
639    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
640    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
641    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
642    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
643
644    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
645    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
646    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
647    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
648    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
649    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
650    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
651
652    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
653    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
654    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
655    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
656    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
657    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
658    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
659
660    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
661    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
662    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
663    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
664    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
665    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
666    setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
667    AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
668    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
669
670    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
671    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
672    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
673    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
674    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
675
676    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
677    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
678    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
679    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
680
681    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2f32, Custom);
682    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
683    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
684    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
685
686    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
687
688    setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
689    setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
690    setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
691    setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
692    setOperationAction(ISD::VSETCC,             MVT::v8i8, Custom);
693    setOperationAction(ISD::VSETCC,             MVT::v4i16, Custom);
694    setOperationAction(ISD::VSETCC,             MVT::v2i32, Custom);
695
696    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
697      setOperationAction(ISD::BIT_CONVERT,        MVT::v8i8,  Custom);
698      setOperationAction(ISD::BIT_CONVERT,        MVT::v4i16, Custom);
699      setOperationAction(ISD::BIT_CONVERT,        MVT::v2i32, Custom);
700      setOperationAction(ISD::BIT_CONVERT,        MVT::v2f32, Custom);
701      setOperationAction(ISD::BIT_CONVERT,        MVT::v1i64, Custom);
702    }
703  }
704
705  if (!UseSoftFloat && Subtarget->hasSSE1()) {
706    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
707
708    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
709    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
710    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
711    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
712    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
713    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
714    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
715    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
716    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
717    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
718    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
719    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
720  }
721
722  if (!UseSoftFloat && Subtarget->hasSSE2()) {
723    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
724
725    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
726    // registers cannot be used even for integer operations.
727    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
728    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
729    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
730    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
731
732    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
733    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
734    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
735    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
736    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
737    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
738    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
739    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
740    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
741    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
742    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
743    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
744    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
745    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
746    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
747    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
748
749    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
750    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
751    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
752    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
753
754    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
755    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
756    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
757    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
758    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
759
760    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
761    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
762    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
763    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
764    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
765
766    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
767    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
768      EVT VT = (MVT::SimpleValueType)i;
769      // Do not attempt to custom lower non-power-of-2 vectors
770      if (!isPowerOf2_32(VT.getVectorNumElements()))
771        continue;
772      // Do not attempt to custom lower non-128-bit vectors
773      if (!VT.is128BitVector())
774        continue;
775      setOperationAction(ISD::BUILD_VECTOR,
776                         VT.getSimpleVT().SimpleTy, Custom);
777      setOperationAction(ISD::VECTOR_SHUFFLE,
778                         VT.getSimpleVT().SimpleTy, Custom);
779      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
780                         VT.getSimpleVT().SimpleTy, Custom);
781    }
782
783    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
784    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
785    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
786    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
787    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
788    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
789
790    if (Subtarget->is64Bit()) {
791      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
792      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
793    }
794
795    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
796    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
797      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
798      EVT VT = SVT;
799
800      // Do not attempt to promote non-128-bit vectors
801      if (!VT.is128BitVector()) {
802        continue;
803      }
804
805      setOperationAction(ISD::AND,    SVT, Promote);
806      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
807      setOperationAction(ISD::OR,     SVT, Promote);
808      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
809      setOperationAction(ISD::XOR,    SVT, Promote);
810      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
811      setOperationAction(ISD::LOAD,   SVT, Promote);
812      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
813      setOperationAction(ISD::SELECT, SVT, Promote);
814      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
815    }
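    // Bitwise ops, whole-register loads, and selects behave identically for
    // every 128-bit integer vector type, so promoting them to v2i64 lets one
    // set of patterns (e.g. PAND/POR/PXOR) cover v16i8/v8i16/v4i32 as well.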
816
817    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
818
819    // Custom lower v2i64 and v2f64 selects.
820    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
821    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
822    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
823    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
824
825    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
826    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
827    if (!DisableMMX && Subtarget->hasMMX()) {
828      setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
829      setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
830    }
831  }
832
833  if (Subtarget->hasSSE41()) {
834    setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
835    setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
836    setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
837    setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
838    setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
839    setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
840    setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
841    setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
842    setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
843    setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
844
845    // FIXME: Do we need to handle scalar-to-vector here?
846    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
847
848    // i8 and i16 vectors are custom, because the source register and source
849    // memory operand types are not the same width.  f32 vectors are
850    // custom since the immediate controlling the insert encodes additional
851    // information.
852    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
853    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
854    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
855    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
856
857    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
858    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
859    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
860    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
861
862    if (Subtarget->is64Bit()) {
863      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
864      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
865    }
866  }
867
868  if (Subtarget->hasSSE42()) {
869    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
870  }
871
872  if (!UseSoftFloat && Subtarget->hasAVX()) {
873    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
874    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
875    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
876    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
877
878    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
879    setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
880    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
881    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
882    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
883    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
884    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
885    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
886    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
887    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
888    //setOperationAction(ISD::BUILD_VECTOR,       MVT::v8f32, Custom);
889    //setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8f32, Custom);
890    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
891    //setOperationAction(ISD::SELECT,             MVT::v8f32, Custom);
892    //setOperationAction(ISD::VSETCC,             MVT::v8f32, Custom);
893
894    // Operations to consider (currently commented out): v16i16, v32i8
895    //setOperationAction(ISD::ADD,                MVT::v16i16, Legal);
896    setOperationAction(ISD::ADD,                MVT::v8i32, Custom);
897    setOperationAction(ISD::ADD,                MVT::v4i64, Custom);
898    //setOperationAction(ISD::SUB,                MVT::v32i8, Legal);
899    //setOperationAction(ISD::SUB,                MVT::v16i16, Legal);
900    setOperationAction(ISD::SUB,                MVT::v8i32, Custom);
901    setOperationAction(ISD::SUB,                MVT::v4i64, Custom);
902    //setOperationAction(ISD::MUL,                MVT::v16i16, Legal);
903    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
904    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
905    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
906    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
907    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
908    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
909
910    setOperationAction(ISD::VSETCC,             MVT::v4f64, Custom);
911    // setOperationAction(ISD::VSETCC,             MVT::v32i8, Custom);
912    // setOperationAction(ISD::VSETCC,             MVT::v16i16, Custom);
913    setOperationAction(ISD::VSETCC,             MVT::v8i32, Custom);
914
915    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i8, Custom);
916    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i16, Custom);
917    // setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i16, Custom);
918    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i32, Custom);
919    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8f32, Custom);
920
921    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f64, Custom);
922    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i64, Custom);
923    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f64, Custom);
924    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i64, Custom);
925    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f64, Custom);
926    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
927
928#if 0
929    // Not sure we want to do this since there are no 256-bit integer
930    // operations in AVX
931
932    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
933    // This includes 256-bit vectors
934    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
935      EVT VT = (MVT::SimpleValueType)i;
936
937      // Do not attempt to custom lower non-power-of-2 vectors
938      if (!isPowerOf2_32(VT.getVectorNumElements()))
939        continue;
940
941      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
942      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
943      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
944    }
945
946    if (Subtarget->is64Bit()) {
947      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i64, Custom);
948      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
949    }
950#endif
951
952#if 0
953    // Not sure we want to do this since there are no 256-bit integer
954    // operations in AVX
955
956    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
957    // Including 256-bit vectors
958    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
959      EVT VT = (MVT::SimpleValueType)i;
960
961      if (!VT.is256BitVector()) {
962        continue;
963      }
964      setOperationAction(ISD::AND,    VT, Promote);
965      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
966      setOperationAction(ISD::OR,     VT, Promote);
967      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
968      setOperationAction(ISD::XOR,    VT, Promote);
969      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
970      setOperationAction(ISD::LOAD,   VT, Promote);
971      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
972      setOperationAction(ISD::SELECT, VT, Promote);
973      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
974    }
975
976    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
977#endif
978  }
979
980  // We want to custom lower some of our intrinsics.
981  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
982
983  // Add/Sub/Mul with overflow operations are custom lowered.
984  setOperationAction(ISD::SADDO, MVT::i32, Custom);
985  setOperationAction(ISD::UADDO, MVT::i32, Custom);
986  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
987  setOperationAction(ISD::USUBO, MVT::i32, Custom);
988  setOperationAction(ISD::SMULO, MVT::i32, Custom);
989
990  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
991  // handle type legalization for these operations here.
992  //
993  // FIXME: We really should do custom legalization for addition and
994  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
995  // than generic legalization for 64-bit multiplication-with-overflow, though.
996  if (Subtarget->is64Bit()) {
997    setOperationAction(ISD::SADDO, MVT::i64, Custom);
998    setOperationAction(ISD::UADDO, MVT::i64, Custom);
999    setOperationAction(ISD::SSUBO, MVT::i64, Custom);
1000    setOperationAction(ISD::USUBO, MVT::i64, Custom);
1001    setOperationAction(ISD::SMULO, MVT::i64, Custom);
1002  }
1003
1004  if (!Subtarget->is64Bit()) {
1005    // These libcalls are not available in 32-bit.
1006    setLibcallName(RTLIB::SHL_I128, 0);
1007    setLibcallName(RTLIB::SRL_I128, 0);
1008    setLibcallName(RTLIB::SRA_I128, 0);
1009  }
1010
1011  // We have target-specific dag combine patterns for the following nodes:
1012  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1013  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1014  setTargetDAGCombine(ISD::BUILD_VECTOR);
1015  setTargetDAGCombine(ISD::SELECT);
1016  setTargetDAGCombine(ISD::SHL);
1017  setTargetDAGCombine(ISD::SRA);
1018  setTargetDAGCombine(ISD::SRL);
1019  setTargetDAGCombine(ISD::OR);
1020  setTargetDAGCombine(ISD::STORE);
1021  setTargetDAGCombine(ISD::ZERO_EXTEND);
1022  if (Subtarget->is64Bit())
1023    setTargetDAGCombine(ISD::MUL);
1024
1025  computeRegisterProperties();
1026
1027  // FIXME: These should be based on subtarget info. Plus, the values should
1028  // be smaller when we are optimizing for size.
1029  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1030  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1031  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
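  // e.g. a small constant-length @llvm.memset is expanded inline into at most
  // 16 stores before falling back to a call to the library routine.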
1032  setPrefLoopAlignment(16);
1033  benefitFromCodePlacementOpt = true;
1034}
1035
1036
1037MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
1038  return MVT::i8;
1039}
1040
1041
1042/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1043/// the desired ByVal argument alignment.
1044static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
1045  if (MaxAlign == 16)
1046    return;
1047  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1048    if (VTy->getBitWidth() == 128)
1049      MaxAlign = 16;
1050  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1051    unsigned EltAlign = 0;
1052    getMaxByValAlign(ATy->getElementType(), EltAlign);
1053    if (EltAlign > MaxAlign)
1054      MaxAlign = EltAlign;
1055  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
1056    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
1057      unsigned EltAlign = 0;
1058      getMaxByValAlign(STy->getElementType(i), EltAlign);
1059      if (EltAlign > MaxAlign)
1060        MaxAlign = EltAlign;
1061      if (MaxAlign == 16)
1062        break;
1063    }
1064  }
1065  return;
1066}
1067
1068/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1069/// function arguments in the caller parameter area. For X86, aggregates
1070/// that contain SSE vectors are placed at 16-byte boundaries while the rest
1071/// are at 4-byte boundaries.
1072unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
1073  if (Subtarget->is64Bit()) {
1074    // Max of 8 and alignment of type.
1075    unsigned TyAlign = TD->getABITypeAlignment(Ty);
1076    if (TyAlign > 8)
1077      return TyAlign;
1078    return 8;
1079  }
1080
1081  unsigned Align = 4;
1082  if (Subtarget->hasSSE1())
1083    getMaxByValAlign(Ty, Align);
1084  return Align;
1085}
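// For example, a byval struct containing a 128-bit vector member is aligned to
// 16 bytes on 32-bit SSE targets, while an aggregate of plain ints keeps the
// default 4-byte boundary.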
1086
1087/// getOptimalMemOpType - Returns the target specific optimal type for load
1088/// and store operations as a result of memset, memcpy, and memmove
1089/// lowering. If DstAlign is zero, that means the destination's alignment can
1090/// satisfy any constraint. Similarly, if SrcAlign is zero it means there is
1091/// no need to check it against the alignment requirement,
1092/// probably because the source does not need to be loaded. If
1093/// 'NonScalarIntSafe' is true, that means it's safe to return a
1094/// non-scalar-integer type, e.g. empty string source, constant, or loaded
1095/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
1096/// constant so it does not need to be loaded.
1097/// It returns EVT::Other if the type should be determined using generic
1098/// target-independent logic.
1099EVT
1100X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1101                                       unsigned DstAlign, unsigned SrcAlign,
1102                                       bool NonScalarIntSafe,
1103                                       bool MemcpyStrSrc,
1104                                       MachineFunction &MF) const {
1105  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
1106  // linux.  This is because the stack realignment code can't handle certain
1107  // cases like PR2962.  This should be removed when PR2962 is fixed.
1108  const Function *F = MF.getFunction();
1109  if (NonScalarIntSafe &&
1110      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
1111    if (Size >= 16 &&
1112        (Subtarget->isUnalignedMemAccessFast() ||
1113         ((DstAlign == 0 || DstAlign >= 16) &&
1114          (SrcAlign == 0 || SrcAlign >= 16))) &&
1115        Subtarget->getStackAlignment() >= 16) {
1116      if (Subtarget->hasSSE2())
1117        return MVT::v4i32;
1118      if (Subtarget->hasSSE1())
1119        return MVT::v4f32;
1120    } else if (!MemcpyStrSrc && Size >= 8 &&
1121               !Subtarget->is64Bit() &&
1122               Subtarget->getStackAlignment() >= 8 &&
1123               Subtarget->hasSSE2()) {
1124      // Do not use f64 to lower memcpy if the source is a string constant. It's
1125      // better to use i32 to avoid the loads.
1126      return MVT::f64;
1127    }
1128  }
1129  if (Subtarget->is64Bit() && Size >= 8)
1130    return MVT::i64;
1131  return MVT::i32;
1132}
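// For instance, a memcpy of 16+ bytes whose source and destination are 16-byte
// aligned (or on a subtarget with fast unaligned access) is lowered with
// 128-bit XMM moves: v4i32 with SSE2, v4f32 with only SSE1.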
1133
1134/// getJumpTableEncoding - Return the entry encoding for a jump table in the
1135/// current function.  The returned value is a member of the
1136/// MachineJumpTableInfo::JTEntryKind enum.
1137unsigned X86TargetLowering::getJumpTableEncoding() const {
1138  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1139  // symbol.
1140  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1141      Subtarget->isPICStyleGOT())
1142    return MachineJumpTableInfo::EK_Custom32;
1143
1144  // Otherwise, use the normal jump table encoding heuristics.
1145  return TargetLowering::getJumpTableEncoding();
1146}
1147
1148/// getPICBaseSymbol - Return the X86-32 PIC base.
1149MCSymbol *
1150X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
1151                                    MCContext &Ctx) const {
1152  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
1153  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
1154                               Twine(MF->getFunctionNumber())+"$pb");
1155}
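// For illustration, if the private global prefix is "L" (as on Darwin) and
// this is function number 3 in the module, getPICBaseSymbol returns the symbol
// "L3$pb", i.e. the label defined by the function's PIC-base set-up sequence.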
1156
1157
1158const MCExpr *
1159X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1160                                             const MachineBasicBlock *MBB,
1161                                             unsigned uid, MCContext &Ctx) const {
1162  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1163         Subtarget->isPICStyleGOT());
1164  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1165  // entries.
1166  return MCSymbolRefExpr::Create(MBB->getSymbol(),
1167                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1168}
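// In the emitted assembly these custom entries show up as 32-bit GOT-relative
// offsets of the target blocks, e.g. (label name purely illustrative):
//   .long .LBB7_2@GOTOFF
// which is what the EK_Custom32 encoding selected above asks for.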
1169
1170/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
1171/// jumptable.
1172SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1173                                                    SelectionDAG &DAG) const {
1174  if (!Subtarget->is64Bit())
1175    // This doesn't have a DebugLoc associated with it, but it is not really
1176    // the same as a Register.
1177    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
1178  return Table;
1179}
1180
1181/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
1182/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
1183/// MCExpr.
1184const MCExpr *X86TargetLowering::
1185getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1186                             MCContext &Ctx) const {
1187  // X86-64 uses RIP relative addressing based on the jump table label.
1188  if (Subtarget->isPICStyleRIPRel())
1189    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1190
1191  // Otherwise, the reference is relative to the PIC base.
1192  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
1193}
1194
1195/// getFunctionAlignment - Return the Log2 alignment of this function.
1196unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
1197  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
1198}
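// The returned value is a log2 amount: 4 means functions get aligned to
// 2^4 = 16 bytes, while functions marked optsize get 0, i.e. no extra
// alignment is requested for them.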
1199
1200//===----------------------------------------------------------------------===//
1201//               Return Value Calling Convention Implementation
1202//===----------------------------------------------------------------------===//
1203
1204#include "X86GenCallingConv.inc"
1205
1206bool
1207X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
1208                        const SmallVectorImpl<EVT> &OutTys,
1209                        const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
1210                        SelectionDAG &DAG) const {
1211  SmallVector<CCValAssign, 16> RVLocs;
1212  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1213                 RVLocs, *DAG.getContext());
1214  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
1215}
1216
1217SDValue
1218X86TargetLowering::LowerReturn(SDValue Chain,
1219                               CallingConv::ID CallConv, bool isVarArg,
1220                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1221                               DebugLoc dl, SelectionDAG &DAG) const {
1222  MachineFunction &MF = DAG.getMachineFunction();
1223  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1224
1225  SmallVector<CCValAssign, 16> RVLocs;
1226  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1227                 RVLocs, *DAG.getContext());
1228  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
1229
1230  // Add the regs to the liveout set for the function.
1231  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1232  for (unsigned i = 0; i != RVLocs.size(); ++i)
1233    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
1234      MRI.addLiveOut(RVLocs[i].getLocReg());
1235
1236  SDValue Flag;
1237
1238  SmallVector<SDValue, 6> RetOps;
1239  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1240  // Operand #1 = Bytes To Pop
1241  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
1242                   MVT::i16));
1243
1244  // Copy the result values into the output registers.
1245  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1246    CCValAssign &VA = RVLocs[i];
1247    assert(VA.isRegLoc() && "Can only return in registers!");
1248    SDValue ValToCopy = Outs[i].Val;
1249
1250    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
1251    // the RET instruction and handled by the FP Stackifier.
1252    if (VA.getLocReg() == X86::ST0 ||
1253        VA.getLocReg() == X86::ST1) {
1254      // If this is a copy from an xmm register to ST(0), use an FPExtend to
1255      // change the value to the FP stack register class.
1256      if (isScalarFPTypeInSSEReg(VA.getValVT()))
1257        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
1258      RetOps.push_back(ValToCopy);
1259      // Don't emit a copytoreg.
1260      continue;
1261    }
1262
1263    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
1264    // which is returned in RAX / RDX.
1265    if (Subtarget->is64Bit()) {
1266      EVT ValVT = ValToCopy.getValueType();
1267      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
1268        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
1269        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
1270          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
1271      }
1272    }
1273
1274    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
1275    Flag = Chain.getValue(1);
1276  }
1277
1278  // The x86-64 ABI for returning structs by value requires that we copy
1279  // the sret argument into %rax for the return. We saved the argument into
1280  // a virtual register in the entry block, so now we copy the value out
1281  // and into %rax.
1282  if (Subtarget->is64Bit() &&
1283      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
1284    MachineFunction &MF = DAG.getMachineFunction();
1285    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1286    unsigned Reg = FuncInfo->getSRetReturnReg();
1287    assert(Reg &&
1288           "SRetReturnReg should have been set in LowerFormalArguments().");
1289    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
1290
1291    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
1292    Flag = Chain.getValue(1);
1293
1294    // RAX now acts like a return value.
1295    MRI.addLiveOut(X86::RAX);
1296  }
1297
1298  RetOps[0] = Chain;  // Update chain.
1299
1300  // Add the flag if we have it.
1301  if (Flag.getNode())
1302    RetOps.push_back(Flag);
1303
1304  return DAG.getNode(X86ISD::RET_FLAG, dl,
1305                     MVT::Other, &RetOps[0], RetOps.size());
1306}
1307
1308/// LowerCallResult - Lower the result values of a call into the
1309/// appropriate copies out of appropriate physical registers.
1310///
1311SDValue
1312X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
1313                                   CallingConv::ID CallConv, bool isVarArg,
1314                                   const SmallVectorImpl<ISD::InputArg> &Ins,
1315                                   DebugLoc dl, SelectionDAG &DAG,
1316                                   SmallVectorImpl<SDValue> &InVals) const {
1317
1318  // Assign locations to each value returned by this call.
1319  SmallVector<CCValAssign, 16> RVLocs;
1320  bool Is64Bit = Subtarget->is64Bit();
1321  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1322                 RVLocs, *DAG.getContext());
1323  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
1324
1325  // Copy all of the result registers out of their specified physreg.
1326  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1327    CCValAssign &VA = RVLocs[i];
1328    EVT CopyVT = VA.getValVT();
1329
1330    // If this is x86-64, and we disabled SSE, we can't return FP values
1331    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1332        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1333      report_fatal_error("SSE register return with SSE disabled");
1334    }
1335
1336    // If this is a call to a function that returns an fp value on the floating
1337    // point stack, but where we prefer to use the value in xmm registers, copy
1338    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
1339    if ((VA.getLocReg() == X86::ST0 ||
1340         VA.getLocReg() == X86::ST1) &&
1341        isScalarFPTypeInSSEReg(VA.getValVT())) {
1342      CopyVT = MVT::f80;
1343    }
1344
1345    SDValue Val;
1346    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1347      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1348      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1349        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1350                                   MVT::v2i64, InFlag).getValue(1);
1351        Val = Chain.getValue(0);
1352        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1353                          Val, DAG.getConstant(0, MVT::i64));
1354      } else {
1355        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1356                                   MVT::i64, InFlag).getValue(1);
1357        Val = Chain.getValue(0);
1358      }
1359      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1360    } else {
1361      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1362                                 CopyVT, InFlag).getValue(1);
1363      Val = Chain.getValue(0);
1364    }
1365    InFlag = Chain.getValue(2);
1366
1367    if (CopyVT != VA.getValVT()) {
1368      // Round the F80 to the right size, which also moves it to the appropriate
1369      // xmm register.
1370      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1371                        // This truncation won't change the value.
1372                        DAG.getIntPtrConstant(1));
1373    }
1374
1375    InVals.push_back(Val);
1376  }
1377
1378  return Chain;
1379}
1380
1381
1382//===----------------------------------------------------------------------===//
1383//                C & StdCall & Fast Calling Convention implementation
1384//===----------------------------------------------------------------------===//
1385//  The StdCall calling convention is used by many Windows API routines. It
1386//  differs from the C calling convention only slightly: the callee cleans up
1387//  the stack instead of the caller, and symbols are decorated (name-mangled)
1388//  accordingly. It doesn't support any vector arguments.
1389//  For info on the fast calling convention, see the Fast Calling Convention
1390//  (tail call) implementation in LowerX86_32FastCCCallTo.
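//  As a rough illustration, for a declaration like
//    declare x86_stdcallcc void @f(i32)
//  the callee pops its own argument and returns with "ret 4", whereas a
//  C-convention callee returns with a plain "ret" and the caller adjusts ESP
//  after the call.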
1391
1392/// CallIsStructReturn - Determines whether a call uses struct return
1393/// semantics.
1394static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1395  if (Outs.empty())
1396    return false;
1397
1398  return Outs[0].Flags.isSRet();
1399}
1400
1401/// ArgsAreStructReturn - Determines whether a function uses struct
1402/// return semantics.
1403static bool
1404ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1405  if (Ins.empty())
1406    return false;
1407
1408  return Ins[0].Flags.isSRet();
1409}
1410
1411/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1412/// given calling convention.
1413CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1414  if (Subtarget->is64Bit()) {
1415    if (CC == CallingConv::GHC)
1416      return CC_X86_64_GHC;
1417    else if (Subtarget->isTargetWin64())
1418      return CC_X86_Win64_C;
1419    else
1420      return CC_X86_64_C;
1421  }
1422
1423  if (CC == CallingConv::X86_FastCall)
1424    return CC_X86_32_FastCall;
1425  else if (CC == CallingConv::X86_ThisCall)
1426    return CC_X86_32_ThisCall;
1427  else if (CC == CallingConv::Fast)
1428    return CC_X86_32_FastCC;
1429  else if (CC == CallingConv::GHC)
1430    return CC_X86_32_GHC;
1431  else
1432    return CC_X86_32_C;
1433}
1434
1435/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
1436/// specified by "Src" to the address "Dst", with size and alignment given by
1437/// the specific parameter attribute. The copy will be passed as a byval
1438/// function parameter.
1439static SDValue
1440CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1441                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1442                          DebugLoc dl) {
1443  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1444  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1445                       /*isVolatile*/false, /*AlwaysInline=*/true,
1446                       NULL, 0, NULL, 0);
1447}
1448
1449/// IsTailCallConvention - Return true if the calling convention is one that
1450/// supports tail call optimization.
1451static bool IsTailCallConvention(CallingConv::ID CC) {
1452  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1453}
1454
1455/// FuncIsMadeTailCallSafe - Return true if the function is being made into
1456/// a tailcall target by changing its ABI.
1457static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
1458  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1459}
1460
1461SDValue
1462X86TargetLowering::LowerMemArgument(SDValue Chain,
1463                                    CallingConv::ID CallConv,
1464                                    const SmallVectorImpl<ISD::InputArg> &Ins,
1465                                    DebugLoc dl, SelectionDAG &DAG,
1466                                    const CCValAssign &VA,
1467                                    MachineFrameInfo *MFI,
1468                                    unsigned i) const {
1469  // Create the nodes corresponding to a load from this parameter slot.
1470  ISD::ArgFlagsTy Flags = Ins[i].Flags;
1471  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
1472  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1473  EVT ValVT;
1474
1475  // If the value is passed by pointer, we have the address passed instead of
1476  // the value itself.
1477  if (VA.getLocInfo() == CCValAssign::Indirect)
1478    ValVT = VA.getLocVT();
1479  else
1480    ValVT = VA.getValVT();
1481
1482  // FIXME: For now, all byval parameter objects are marked mutable. This can be
1483  // changed with more analysis.
1484  // In case of tail call optimization, mark all arguments mutable, since they
1485  // could be overwritten by the lowering of arguments during a tail call.
1486  if (Flags.isByVal()) {
1487    int FI = MFI->CreateFixedObject(Flags.getByValSize(),
1488                                    VA.getLocMemOffset(), isImmutable, false);
1489    return DAG.getFrameIndex(FI, getPointerTy());
1490  } else {
1491    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1492                                    VA.getLocMemOffset(), isImmutable, false);
1493    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1494    return DAG.getLoad(ValVT, dl, Chain, FIN,
1495                       PseudoSourceValue::getFixedStack(FI), 0,
1496                       false, false, 0);
1497  }
1498}
1499
1500SDValue
1501X86TargetLowering::LowerFormalArguments(SDValue Chain,
1502                                        CallingConv::ID CallConv,
1503                                        bool isVarArg,
1504                                      const SmallVectorImpl<ISD::InputArg> &Ins,
1505                                        DebugLoc dl,
1506                                        SelectionDAG &DAG,
1507                                        SmallVectorImpl<SDValue> &InVals)
1508                                          const {
1509  MachineFunction &MF = DAG.getMachineFunction();
1510  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1511
1512  const Function* Fn = MF.getFunction();
1513  if (Fn->hasExternalLinkage() &&
1514      Subtarget->isTargetCygMing() &&
1515      Fn->getName() == "main")
1516    FuncInfo->setForceFramePointer(true);
1517
1518  MachineFrameInfo *MFI = MF.getFrameInfo();
1519  bool Is64Bit = Subtarget->is64Bit();
1520  bool IsWin64 = Subtarget->isTargetWin64();
1521
1522  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1523         "Var args not supported with calling convention fastcc or ghc");
1524
1525  // Assign locations to all of the incoming arguments.
1526  SmallVector<CCValAssign, 16> ArgLocs;
1527  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1528                 ArgLocs, *DAG.getContext());
1529  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
1530
1531  unsigned LastVal = ~0U;
1532  SDValue ArgValue;
1533  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1534    CCValAssign &VA = ArgLocs[i];
1535    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
1536    // places.
1537    assert(VA.getValNo() != LastVal &&
1538           "Don't support value assigned to multiple locs yet");
1539    LastVal = VA.getValNo();
1540
1541    if (VA.isRegLoc()) {
1542      EVT RegVT = VA.getLocVT();
1543      TargetRegisterClass *RC = NULL;
1544      if (RegVT == MVT::i32)
1545        RC = X86::GR32RegisterClass;
1546      else if (Is64Bit && RegVT == MVT::i64)
1547        RC = X86::GR64RegisterClass;
1548      else if (RegVT == MVT::f32)
1549        RC = X86::FR32RegisterClass;
1550      else if (RegVT == MVT::f64)
1551        RC = X86::FR64RegisterClass;
1552      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
1553        RC = X86::VR128RegisterClass;
1554      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1555        RC = X86::VR64RegisterClass;
1556      else
1557        llvm_unreachable("Unknown argument type!");
1558
1559      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
1560      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
1561
1562      // If this is an 8 or 16-bit value, it is really passed promoted to 32
1563      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
1564      // right size.
1565      if (VA.getLocInfo() == CCValAssign::SExt)
1566        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
1567                               DAG.getValueType(VA.getValVT()));
1568      else if (VA.getLocInfo() == CCValAssign::ZExt)
1569        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
1570                               DAG.getValueType(VA.getValVT()));
1571      else if (VA.getLocInfo() == CCValAssign::BCvt)
1572        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1573
1574      if (VA.isExtInLoc()) {
1575        // Handle MMX values passed in XMM regs.
1576        if (RegVT.isVector()) {
1577          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1578                                 ArgValue, DAG.getConstant(0, MVT::i64));
1579          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
1580        } else
1581          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
1582      }
1583    } else {
1584      assert(VA.isMemLoc());
1585      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
1586    }
1587
1588    // If the value is passed via a pointer, do a load.
1589    if (VA.getLocInfo() == CCValAssign::Indirect)
1590      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
1591                             false, false, 0);
1592
1593    InVals.push_back(ArgValue);
1594  }
1595
1596  // The x86-64 ABI for returning structs by value requires that we copy
1597  // the sret argument into %rax for the return. Save the argument into
1598  // a virtual register so that we can access it from the return points.
1599  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1600    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1601    unsigned Reg = FuncInfo->getSRetReturnReg();
1602    if (!Reg) {
1603      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1604      FuncInfo->setSRetReturnReg(Reg);
1605    }
1606    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1607    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1608  }
1609
1610  unsigned StackSize = CCInfo.getNextStackOffset();
1611  // Align stack specially for tail calls.
1612  if (FuncIsMadeTailCallSafe(CallConv))
1613    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1614
1615  // If the function takes a variable number of arguments, make a frame index
1616  // for the start of the first vararg value... for expansion of llvm.va_start.
1617  if (isVarArg) {
1618    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1619                    CallConv != CallingConv::X86_ThisCall)) {
1620      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,
1621                                                            true, false));
1622    }
1623    if (Is64Bit) {
1624      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1625
1626      // FIXME: We should really autogenerate these arrays
1627      static const unsigned GPR64ArgRegsWin64[] = {
1628        X86::RCX, X86::RDX, X86::R8,  X86::R9
1629      };
1630      static const unsigned XMMArgRegsWin64[] = {
1631        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1632      };
1633      static const unsigned GPR64ArgRegs64Bit[] = {
1634        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1635      };
1636      static const unsigned XMMArgRegs64Bit[] = {
1637        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1638        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1639      };
1640      const unsigned *GPR64ArgRegs, *XMMArgRegs;
1641
1642      if (IsWin64) {
1643        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1644        GPR64ArgRegs = GPR64ArgRegsWin64;
1645        XMMArgRegs = XMMArgRegsWin64;
1646      } else {
1647        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1648        GPR64ArgRegs = GPR64ArgRegs64Bit;
1649        XMMArgRegs = XMMArgRegs64Bit;
1650      }
1651      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1652                                                       TotalNumIntRegs);
1653      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1654                                                       TotalNumXMMRegs);
1655
1656      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1657      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1658             "SSE register cannot be used when SSE is disabled!");
1659      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1660             "SSE register cannot be used when SSE is disabled!");
1661      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1662        // Kernel mode asks for SSE to be disabled, so don't push the XMM
1663        // registers on the stack.
1664        TotalNumXMMRegs = 0;
1665
1666      // For X86-64, if there are vararg parameters that are passed via
1667      // registers, then we must store them to their spots on the stack so they
1668      // may be loaded by dereferencing the result of va_next.
1669      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1670      FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
1671      FuncInfo->setRegSaveFrameIndex(
1672        MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
1673                               false));
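      // As a worked example for the SysV case (6 GPRs, 8 XMM registers): if two
      // integer registers and one XMM register were already consumed by fixed
      // arguments, then VarArgsGPOffset = 2*8 = 16, VarArgsFPOffset =
      // 6*8 + 1*16 = 64, and the register save area created here is
      // 6*8 + 8*16 = 176 bytes.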
1674
1675      // Store the integer parameter registers.
1676      SmallVector<SDValue, 8> MemOps;
1677      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1678                                        getPointerTy());
1679      unsigned Offset = FuncInfo->getVarArgsGPOffset();
1680      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1681        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1682                                  DAG.getIntPtrConstant(Offset));
1683        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1684                                     X86::GR64RegisterClass);
1685        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1686        SDValue Store =
1687          DAG.getStore(Val.getValue(1), dl, Val, FIN,
1688                       PseudoSourceValue::getFixedStack(
1689                         FuncInfo->getRegSaveFrameIndex()),
1690                       Offset, false, false, 0);
1691        MemOps.push_back(Store);
1692        Offset += 8;
1693      }
1694
1695      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1696        // Now store the XMM (fp + vector) parameter registers.
1697        SmallVector<SDValue, 11> SaveXMMOps;
1698        SaveXMMOps.push_back(Chain);
1699
1700        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
1701        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
1702        SaveXMMOps.push_back(ALVal);
1703
1704        SaveXMMOps.push_back(DAG.getIntPtrConstant(
1705                               FuncInfo->getRegSaveFrameIndex()));
1706        SaveXMMOps.push_back(DAG.getIntPtrConstant(
1707                               FuncInfo->getVarArgsFPOffset()));
1708
1709        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1710          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1711                                       X86::VR128RegisterClass);
1712          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1713          SaveXMMOps.push_back(Val);
1714        }
1715        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
1716                                     MVT::Other,
1717                                     &SaveXMMOps[0], SaveXMMOps.size()));
1718      }
1719
1720      if (!MemOps.empty())
1721        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1722                            &MemOps[0], MemOps.size());
1723    }
1724  }
1725
1726  // Some CCs need callee pop.
1727  if (Subtarget->IsCalleePop(isVarArg, CallConv)) {
1728    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
1729  } else {
1730    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
1731    // If this is an sret function, the return should pop the hidden pointer.
1732    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
1733      FuncInfo->setBytesToPopOnReturn(4);
1734  }
1735
1736  if (!Is64Bit) {
1737    // RegSaveFrameIndex is X86-64 only.
1738    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
1739    if (CallConv == CallingConv::X86_FastCall ||
1740        CallConv == CallingConv::X86_ThisCall)
1741      // fastcall and thiscall functions can't have varargs.
1742      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
1743  }
1744
1745  return Chain;
1746}
1747
1748SDValue
1749X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1750                                    SDValue StackPtr, SDValue Arg,
1751                                    DebugLoc dl, SelectionDAG &DAG,
1752                                    const CCValAssign &VA,
1753                                    ISD::ArgFlagsTy Flags) const {
1754  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1755  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1756  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1757  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1758  if (Flags.isByVal()) {
1759    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1760  }
1761  return DAG.getStore(Chain, dl, Arg, PtrOff,
1762                      PseudoSourceValue::getStack(), LocMemOffset,
1763                      false, false, 0);
1764}
1765
1766/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
1767/// optimization is performed and it is required.
1768SDValue
1769X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1770                                           SDValue &OutRetAddr, SDValue Chain,
1771                                           bool IsTailCall, bool Is64Bit,
1772                                           int FPDiff, DebugLoc dl) const {
1773  // Adjust the Return address stack slot.
1774  EVT VT = getPointerTy();
1775  OutRetAddr = getReturnAddressFrameIndex(DAG);
1776
1777  // Load the "old" Return address.
1778  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
1779  return SDValue(OutRetAddr.getNode(), 1);
1780}
1781
1782/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1783/// optimization is performed and it is required (FPDiff!=0).
1784static SDValue
1785EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1786                         SDValue Chain, SDValue RetAddrFrIdx,
1787                         bool Is64Bit, int FPDiff, DebugLoc dl) {
1788  // Store the return address to the appropriate stack slot.
1789  if (!FPDiff) return Chain;
1790  // Calculate the new stack slot for the return address.
1791  int SlotSize = Is64Bit ? 8 : 4;
1792  int NewReturnAddrFI =
1793    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false);
1794  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1795  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1796  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1797                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
1798                       false, false, 0);
1799  return Chain;
1800}
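// As an illustration on 32-bit (SlotSize == 4): if the callee needs 8 bytes
// more argument space than the caller's frame provides, FPDiff is -8 and the
// return address is re-stored to a fixed slot at offset FPDiff - SlotSize =
// -12, i.e. 8 bytes below its original slot, leaving room for the extra
// outgoing arguments.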
1801
1802SDValue
1803X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1804                             CallingConv::ID CallConv, bool isVarArg,
1805                             bool &isTailCall,
1806                             const SmallVectorImpl<ISD::OutputArg> &Outs,
1807                             const SmallVectorImpl<ISD::InputArg> &Ins,
1808                             DebugLoc dl, SelectionDAG &DAG,
1809                             SmallVectorImpl<SDValue> &InVals) const {
1810  MachineFunction &MF = DAG.getMachineFunction();
1811  bool Is64Bit        = Subtarget->is64Bit();
1812  bool IsStructRet    = CallIsStructReturn(Outs);
1813  bool IsSibcall      = false;
1814
1815  if (isTailCall) {
1816    // Check if it's really possible to do a tail call.
1817    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1818                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1819                                                   Outs, Ins, DAG);
1820
1821    // Sibcalls are automatically detected tailcalls which do not require
1822    // ABI changes.
1823    if (!GuaranteedTailCallOpt && isTailCall)
1824      IsSibcall = true;
1825
1826    if (isTailCall)
1827      ++NumTailCalls;
1828  }
1829
1830  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1831         "Var args not supported with calling convention fastcc or ghc");
1832
1833  // Analyze operands of the call, assigning locations to each operand.
1834  SmallVector<CCValAssign, 16> ArgLocs;
1835  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1836                 ArgLocs, *DAG.getContext());
1837  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1838
1839  // Get a count of how many bytes are to be pushed on the stack.
1840  unsigned NumBytes = CCInfo.getNextStackOffset();
1841  if (IsSibcall)
1842    // This is a sibcall. The memory operands are already in place in the
1843    // caller's incoming argument area (its own caller's stack frame).
1844    NumBytes = 0;
1845  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
1846    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1847
1848  int FPDiff = 0;
1849  if (isTailCall && !IsSibcall) {
1850    // Lower arguments at fp - stackoffset + fpdiff.
1851    unsigned NumBytesCallerPushed =
1852      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1853    FPDiff = NumBytesCallerPushed - NumBytes;
1854
1855    // Set the delta of movement of the returnaddr stackslot.
1856    // But only set if delta is greater than previous delta.
1857    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1858      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1859  }
1860
1861  if (!IsSibcall)
1862    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1863
1864  SDValue RetAddrFrIdx;
1865  // Load the return address for tail calls.
1866  if (isTailCall && FPDiff)
1867    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1868                                    Is64Bit, FPDiff, dl);
1869
1870  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1871  SmallVector<SDValue, 8> MemOpChains;
1872  SDValue StackPtr;
1873
1874  // Walk the register/memloc assignments, inserting copies/loads.  In the case
1875  // of tail call optimization, arguments are handled later.
1876  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1877    CCValAssign &VA = ArgLocs[i];
1878    EVT RegVT = VA.getLocVT();
1879    SDValue Arg = Outs[i].Val;
1880    ISD::ArgFlagsTy Flags = Outs[i].Flags;
1881    bool isByVal = Flags.isByVal();
1882
1883    // Promote the value if needed.
1884    switch (VA.getLocInfo()) {
1885    default: llvm_unreachable("Unknown loc info!");
1886    case CCValAssign::Full: break;
1887    case CCValAssign::SExt:
1888      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1889      break;
1890    case CCValAssign::ZExt:
1891      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1892      break;
1893    case CCValAssign::AExt:
1894      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1895        // Special case: passing MMX values in XMM registers.
1896        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1897        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1898        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1899      } else
1900        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1901      break;
1902    case CCValAssign::BCvt:
1903      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1904      break;
1905    case CCValAssign::Indirect: {
1906      // Store the argument.
1907      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1908      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1909      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1910                           PseudoSourceValue::getFixedStack(FI), 0,
1911                           false, false, 0);
1912      Arg = SpillSlot;
1913      break;
1914    }
1915    }
1916
1917    if (VA.isRegLoc()) {
1918      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1919    } else if (!IsSibcall && (!isTailCall || isByVal)) {
1920      assert(VA.isMemLoc());
1921      if (StackPtr.getNode() == 0)
1922        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1923      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1924                                             dl, DAG, VA, Flags));
1925    }
1926  }
1927
1928  if (!MemOpChains.empty())
1929    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1930                        &MemOpChains[0], MemOpChains.size());
1931
1932  // Build a sequence of copy-to-reg nodes chained together with token chain
1933  // and flag operands which copy the outgoing args into registers.
1934  SDValue InFlag;
1935  // Tail call byval lowering might overwrite argument registers so in case of
1936  // tail call optimization the copies to registers are lowered later.
1937  if (!isTailCall)
1938    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1939      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1940                               RegsToPass[i].second, InFlag);
1941      InFlag = Chain.getValue(1);
1942    }
1943
1944  if (Subtarget->isPICStyleGOT()) {
1945    // ELF / PIC requires the GOT pointer to be in the EBX register before
1946    // function calls made via the PLT.
1947    if (!isTailCall) {
1948      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1949                               DAG.getNode(X86ISD::GlobalBaseReg,
1950                                           DebugLoc(), getPointerTy()),
1951                               InFlag);
1952      InFlag = Chain.getValue(1);
1953    } else {
1954      // If we are tail calling and generating PIC/GOT style code, load the
1955      // address of the callee into ECX. The value in ECX is used as the target
1956      // of the tail jump. This is done to circumvent the EBX/callee-saved
1957      // problem for tail calls on PIC/GOT architectures. Normally we would just
1958      // put the address of the GOT into EBX and then call target@PLT. But for
1959      // tail calls EBX would be restored (since EBX is callee-saved) before
1960      // jumping to the target@PLT.
1961
1962      // Note: The actual moving to ECX is done further down.
1963      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1964      if (G && !G->getGlobal()->hasHiddenVisibility() &&
1965          !G->getGlobal()->hasProtectedVisibility())
1966        Callee = LowerGlobalAddress(Callee, DAG);
1967      else if (isa<ExternalSymbolSDNode>(Callee))
1968        Callee = LowerExternalSymbol(Callee, DAG);
1969    }
1970  }
1971
1972  if (Is64Bit && isVarArg) {
1973    // From AMD64 ABI document:
1974    // For calls that may call functions that use varargs or stdargs
1975    // (prototype-less calls or calls to functions containing ellipsis (...) in
1976    // the declaration) %al is used as a hidden argument to specify the number
1977    // of SSE registers used. The contents of %al do not need to match exactly
1978    // the number of registers, but must be an upper bound on the number of SSE
1979    // registers used and is in the range 0 - 8 inclusive.
1980
1981    // FIXME: Verify this on Win64
1982    // Count the number of XMM registers allocated.
1983    static const unsigned XMMArgRegs[] = {
1984      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1985      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1986    };
1987    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1988    assert((Subtarget->hasSSE1() || !NumXMMRegs)
1989           && "SSE registers cannot be used when SSE is disabled");
1990
1991    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1992                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1993    InFlag = Chain.getValue(1);
1994  }
1995
1996
1997  // For tail calls lower the arguments to the 'real' stack slot.
1998  if (isTailCall) {
1999    // Force all the incoming stack arguments to be loaded from the stack
2000    // before any new outgoing arguments are stored to the stack, because the
2001    // outgoing stack slots may alias the incoming argument stack slots, and
2002    // the alias isn't otherwise explicit. This is slightly more conservative
2003    // than necessary, because it means that each store effectively depends
2004    // on every argument instead of just those arguments it would clobber.
2005    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2006
2007    SmallVector<SDValue, 8> MemOpChains2;
2008    SDValue FIN;
2009    int FI = 0;
2010    // Do not glue the preceding CopyToReg nodes together with the following stores.
2011    InFlag = SDValue();
2012    if (GuaranteedTailCallOpt) {
2013      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2014        CCValAssign &VA = ArgLocs[i];
2015        if (VA.isRegLoc())
2016          continue;
2017        assert(VA.isMemLoc());
2018        SDValue Arg = Outs[i].Val;
2019        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2020        // Create frame index.
2021        int32_t Offset = VA.getLocMemOffset()+FPDiff;
2022        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2023        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
2024        FIN = DAG.getFrameIndex(FI, getPointerTy());
2025
2026        if (Flags.isByVal()) {
2027          // Copy relative to framepointer.
2028          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2029          if (StackPtr.getNode() == 0)
2030            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2031                                          getPointerTy());
2032          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2033
2034          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2035                                                           ArgChain,
2036                                                           Flags, DAG, dl));
2037        } else {
2038          // Store relative to framepointer.
2039          MemOpChains2.push_back(
2040            DAG.getStore(ArgChain, dl, Arg, FIN,
2041                         PseudoSourceValue::getFixedStack(FI), 0,
2042                         false, false, 0));
2043        }
2044      }
2045    }
2046
2047    if (!MemOpChains2.empty())
2048      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2049                          &MemOpChains2[0], MemOpChains2.size());
2050
2051    // Copy arguments to their registers.
2052    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2053      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2054                               RegsToPass[i].second, InFlag);
2055      InFlag = Chain.getValue(1);
2056    }
2057    InFlag = SDValue();
2058
2059    // Store the return address to the appropriate stack slot.
2060    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2061                                     FPDiff, dl);
2062  }
2063
2064  bool WasGlobalOrExternal = false;
2065  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2066    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2067    // In the 64-bit large code model, we have to make all calls
2068    // through a register, since the call instruction's 32-bit
2069    // pc-relative offset may not be large enough to hold the whole
2070    // address.
2071  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2072    WasGlobalOrExternal = true;
2073    // If the callee is a GlobalAddress node (quite common, every direct call
2074    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2075    // it.
2076
2077    // We should use extra load for direct calls to dllimported functions in
2078    // non-JIT mode.
2079    const GlobalValue *GV = G->getGlobal();
2080    if (!GV->hasDLLImportLinkage()) {
2081      unsigned char OpFlags = 0;
2082
2083      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2084      // external symbols must go through the PLT in PIC mode.  If the symbol
2085      // has hidden or protected visibility, or if it is static or local, then
2086      // we don't need to use the PLT - we can directly call it.
2087      if (Subtarget->isTargetELF() &&
2088          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2089          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2090        OpFlags = X86II::MO_PLT;
2091      } else if (Subtarget->isPICStyleStubAny() &&
2092               (GV->isDeclaration() || GV->isWeakForLinker()) &&
2093               Subtarget->getDarwinVers() < 9) {
2094        // PC-relative references to external symbols should go through $stub,
2095        // unless we're building with the leopard linker or later, which
2096        // automatically synthesizes these stubs.
2097        OpFlags = X86II::MO_DARWIN_STUB;
2098      }
2099
2100      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
2101                                          G->getOffset(), OpFlags);
2102    }
2103  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2104    WasGlobalOrExternal = true;
2105    unsigned char OpFlags = 0;
2106
2107    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
2108    // symbols should go through the PLT.
2109    if (Subtarget->isTargetELF() &&
2110        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2111      OpFlags = X86II::MO_PLT;
2112    } else if (Subtarget->isPICStyleStubAny() &&
2113             Subtarget->getDarwinVers() < 9) {
2114      // PC-relative references to external symbols should go through $stub,
2115      // unless we're building with the leopard linker or later, which
2116      // automatically synthesizes these stubs.
2117      OpFlags = X86II::MO_DARWIN_STUB;
2118    }
2119
2120    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2121                                         OpFlags);
2122  }
2123
2124  // Returns a chain & a flag for retval copy to use.
2125  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2126  SmallVector<SDValue, 8> Ops;
2127
2128  if (!IsSibcall && isTailCall) {
2129    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2130                           DAG.getIntPtrConstant(0, true), InFlag);
2131    InFlag = Chain.getValue(1);
2132  }
2133
2134  Ops.push_back(Chain);
2135  Ops.push_back(Callee);
2136
2137  if (isTailCall)
2138    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2139
2140  // Add argument registers to the end of the list so that they are known live
2141  // into the call.
2142  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2143    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2144                                  RegsToPass[i].second.getValueType()));
2145
2146  // Add an implicit use GOT pointer in EBX.
2147  if (!isTailCall && Subtarget->isPICStyleGOT())
2148    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2149
2150  // Add an implicit use of AL for x86 vararg functions.
2151  if (Is64Bit && isVarArg)
2152    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2153
2154  if (InFlag.getNode())
2155    Ops.push_back(InFlag);
2156
2157  if (isTailCall) {
2158    // We used to do:
2159    //// If this is the first return lowered for this function, add the regs
2160    //// to the liveout set for the function.
2161    // This isn't right, although it's probably harmless on x86; liveouts
2162    // should be computed from returns not tail calls.  Consider a void
2163    // function making a tail call to a function returning int.
2164    return DAG.getNode(X86ISD::TC_RETURN, dl,
2165                       NodeTys, &Ops[0], Ops.size());
2166  }
2167
2168  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2169  InFlag = Chain.getValue(1);
2170
2171  // Create the CALLSEQ_END node.
2172  unsigned NumBytesForCalleeToPush;
2173  if (Subtarget->IsCalleePop(isVarArg, CallConv))
2174    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2175  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2176    // If this is a call to a struct-return function, the callee
2177    // pops the hidden struct pointer, so we have to push it back.
2178    // This is common for Darwin/X86, Linux & Mingw32 targets.
2179    NumBytesForCalleeToPush = 4;
2180  else
2181    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2182
2183  // Returns a flag for retval copy to use.
2184  if (!IsSibcall) {
2185    Chain = DAG.getCALLSEQ_END(Chain,
2186                               DAG.getIntPtrConstant(NumBytes, true),
2187                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2188                                                     true),
2189                               InFlag);
2190    InFlag = Chain.getValue(1);
2191  }
2192
2193  // Handle result values, copying them out of physregs into vregs that we
2194  // return.
2195  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2196                         Ins, dl, DAG, InVals);
2197}
2198
2199
2200//===----------------------------------------------------------------------===//
2201//                Fast Calling Convention (tail call) implementation
2202//===----------------------------------------------------------------------===//
2203
2204//  Like StdCall, the callee cleans up the arguments, except that ECX is
2205//  reserved for storing the address of the tail-called function. Only 2
2206//  registers are free for argument passing (inreg). Tail call optimization is
2207//  performed provided:
2208//                * tailcallopt is enabled
2209//                * caller/callee are fastcc
2210//  On the X86_64 architecture with GOT-style position-independent code, only
2211//  local (within-module) calls are supported at the moment.
2212//  To keep the stack aligned according to the platform ABI, the function
2213//  GetAlignedArgumentStackSize ensures that the argument delta is always a
2214//  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
2215//  for example.) If the tail-called callee has more arguments than the caller,
2216//  the caller needs to make sure that there is room to move the RETADDR to.
2217//  This is achieved by reserving an area the size of the argument delta right
2218//  after the original RETADDR, but before the saved frame pointer or the
2219//  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2220//  stack layout:
2221//    arg1
2222//    arg2
2223//    RETADDR
2224//    [ new RETADDR
2225//      move area ]
2226//    (possible EBP)
2227//    ESI
2228//    EDI
2229//    local1 ..
2230
2231/// GetAlignedArgumentStackSize - Round the stack size up to, e.g., 16n + 12
2232/// bytes for a 16-byte alignment requirement (leaving room for the return address slot).
2233unsigned
2234X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2235                                               SelectionDAG& DAG) const {
2236  MachineFunction &MF = DAG.getMachineFunction();
2237  const TargetMachine &TM = MF.getTarget();
2238  const TargetFrameInfo &TFI = *TM.getFrameInfo();
2239  unsigned StackAlignment = TFI.getStackAlignment();
2240  uint64_t AlignMask = StackAlignment - 1;
2241  int64_t Offset = StackSize;
2242  uint64_t SlotSize = TD->getPointerSize();
2243  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
2244    // Offset is already at or below StackAlignment - SlotSize; just add the difference.
2245    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2246  } else {
2247    // Mask out the lower bits and add StackAlignment once plus (StackAlignment - SlotSize).
2248    Offset = ((~AlignMask) & Offset) + StackAlignment +
2249      (StackAlignment-SlotSize);
2250  }
2251  return Offset;
2252}
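// A quick sanity check with StackAlignment = 16 and SlotSize = 4: a StackSize
// of 20 has (20 & 15) = 4 <= 12, so it becomes 20 + (12 - 4) = 28 = 16 + 12;
// a StackSize of 30 has (30 & 15) = 14 > 12, so it becomes
// (30 & ~15) + 16 + 12 = 44 = 2*16 + 12. Either way the stack is 16-byte
// aligned again once the 4-byte return address has been pushed.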
2253
2254/// MatchingStackOffset - Return true if the given stack call argument is
2255/// already available in the same position (relatively) of the caller's
2256/// incoming argument stack.
2257static
2258bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2259                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2260                         const X86InstrInfo *TII) {
2261  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2262  int FI = INT_MAX;
2263  if (Arg.getOpcode() == ISD::CopyFromReg) {
2264    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2265    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2266      return false;
2267    MachineInstr *Def = MRI->getVRegDef(VR);
2268    if (!Def)
2269      return false;
2270    if (!Flags.isByVal()) {
2271      if (!TII->isLoadFromStackSlot(Def, FI))
2272        return false;
2273    } else {
2274      unsigned Opcode = Def->getOpcode();
2275      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2276          Def->getOperand(1).isFI()) {
2277        FI = Def->getOperand(1).getIndex();
2278        Bytes = Flags.getByValSize();
2279      } else
2280        return false;
2281    }
2282  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2283    if (Flags.isByVal())
2284      // ByVal argument is passed in as a pointer but it's now being
2285      // dereferenced. e.g.
2286      // define @foo(%struct.X* %A) {
2287      //   tail call @bar(%struct.X* byval %A)
2288      // }
2289      return false;
2290    SDValue Ptr = Ld->getBasePtr();
2291    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2292    if (!FINode)
2293      return false;
2294    FI = FINode->getIndex();
2295  } else
2296    return false;
2297
2298  assert(FI != INT_MAX);
2299  if (!MFI->isFixedObjectIndex(FI))
2300    return false;
2301  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2302}
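// For example, if an outgoing i32 argument is simply the caller's own incoming
// argument, loaded from a fixed stack object of size 4 at offset 8, and the
// callee also expects it at offset 8, MatchingStackOffset returns true: the
// value is already where the callee will look for it, so a sibcall remains
// possible without re-storing it.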
2303
2304/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2305/// for tail call optimization. Targets which want to do tail call
2306/// optimization should implement this function.
2307bool
2308X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2309                                                     CallingConv::ID CalleeCC,
2310                                                     bool isVarArg,
2311                                                     bool isCalleeStructRet,
2312                                                     bool isCallerStructRet,
2313                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2314                                    const SmallVectorImpl<ISD::InputArg> &Ins,
2315                                                     SelectionDAG& DAG) const {
2316  if (!IsTailCallConvention(CalleeCC) &&
2317      CalleeCC != CallingConv::C)
2318    return false;
2319
2320  // If -tailcallopt is specified, make fastcc functions tail-callable.
2321  const MachineFunction &MF = DAG.getMachineFunction();
2322  const Function *CallerF = DAG.getMachineFunction().getFunction();
2323  CallingConv::ID CallerCC = CallerF->getCallingConv();
2324  bool CCMatch = CallerCC == CalleeCC;
2325
2326  if (GuaranteedTailCallOpt) {
2327    if (IsTailCallConvention(CalleeCC) && CCMatch)
2328      return true;
2329    return false;
2330  }
2331
2332  // Look for obvious safe cases to perform tail call optimization that do not
2333  // require ABI changes. This is what gcc calls sibcall.
2334
2335  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
2336  // emit a special epilogue.
2337  if (RegInfo->needsStackRealignment(MF))
2338    return false;
2339
2340  // Do not sibcall optimize vararg calls unless the call site passes no
2341  // arguments.
2342  if (isVarArg && !Outs.empty())
2343    return false;
2344
2345  // Also avoid sibcall optimization if either caller or callee uses struct
2346  // return semantics.
2347  if (isCalleeStructRet || isCallerStructRet)
2348    return false;
2349
2350  // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack.
2351  // Therefore, if the result is not used at the call site, it is not safe to
2352  // optimize this into a sibcall.
2353  bool Unused = false;
2354  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2355    if (!Ins[i].Used) {
2356      Unused = true;
2357      break;
2358    }
2359  }
2360  if (Unused) {
2361    SmallVector<CCValAssign, 16> RVLocs;
2362    CCState CCInfo(CalleeCC, false, getTargetMachine(),
2363                   RVLocs, *DAG.getContext());
2364    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2365    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
2366      CCValAssign &VA = RVLocs[i];
2367      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2368        return false;
2369    }
2370  }
2371
2372  // If the calling conventions do not match, then we'd better make sure the
2373  // results are returned in the same way as what the caller expects.
2374  if (!CCMatch) {
2375    SmallVector<CCValAssign, 16> RVLocs1;
2376    CCState CCInfo1(CalleeCC, false, getTargetMachine(),
2377                    RVLocs1, *DAG.getContext());
2378    CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
2379
2380    SmallVector<CCValAssign, 16> RVLocs2;
2381    CCState CCInfo2(CallerCC, false, getTargetMachine(),
2382                    RVLocs2, *DAG.getContext());
2383    CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
2384
2385    if (RVLocs1.size() != RVLocs2.size())
2386      return false;
2387    for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
2388      if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
2389        return false;
2390      if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
2391        return false;
2392      if (RVLocs1[i].isRegLoc()) {
2393        if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
2394          return false;
2395      } else {
2396        if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
2397          return false;
2398      }
2399    }
2400  }
2401
2402  // If the callee takes no arguments then go on to check the results of the
2403  // call.
2404  if (!Outs.empty()) {
2405    // Check if stack adjustment is needed. For now, do not do this if any
2406    // argument is passed on the stack.
2407    SmallVector<CCValAssign, 16> ArgLocs;
2408    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
2409                   ArgLocs, *DAG.getContext());
2410    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
2411    if (CCInfo.getNextStackOffset()) {
2412      MachineFunction &MF = DAG.getMachineFunction();
2413      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2414        return false;
2415      if (Subtarget->isTargetWin64())
2416        // Win64 ABI has additional complications.
2417        return false;
2418
2419      // Check if the arguments are already laid out in the same locations as
2420      // the caller's fixed stack objects.
2421      MachineFrameInfo *MFI = MF.getFrameInfo();
2422      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2423      const X86InstrInfo *TII =
2424        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2425      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2426        CCValAssign &VA = ArgLocs[i];
2427        EVT RegVT = VA.getLocVT();
2428        SDValue Arg = Outs[i].Val;
2429        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2430        if (VA.getLocInfo() == CCValAssign::Indirect)
2431          return false;
2432        if (!VA.isRegLoc()) {
2433          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2434                                   MFI, MRI, TII))
2435            return false;
2436        }
2437      }
2438    }
2439
2440    // If the tailcall address may be in a register, then make sure it's
2441    // possible to register allocate for it. In 32-bit, the call address can
2442    // only target EAX, EDX, or ECX since the tail call must be scheduled after
2443    // callee-saved registers are restored. In 64-bit, it's RAX, RCX, RDX, RSI,
2444    // RDI, R8, R9, R11.
2445    if (!isa<GlobalAddressSDNode>(Callee) &&
2446        !isa<ExternalSymbolSDNode>(Callee)) {
2447      unsigned Limit = Subtarget->is64Bit() ? 8 : 3;
2448      unsigned NumInRegs = 0;
2449      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2450        CCValAssign &VA = ArgLocs[i];
2451        if (VA.isRegLoc()) {
2452          if (++NumInRegs == Limit)
2453            return false;
2454        }
2455      }
2456    }
2457  }
2458
2459  return true;
2460}
2461
2462FastISel *
2463X86TargetLowering::createFastISel(MachineFunction &mf,
2464                            DenseMap<const Value *, unsigned> &vm,
2465                            DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
2466                            DenseMap<const AllocaInst *, int> &am,
2467                            std::vector<std::pair<MachineInstr*, unsigned> > &pn
2468#ifndef NDEBUG
2469                          , SmallSet<const Instruction *, 8> &cil
2470#endif
2471                                  ) const {
2472  return X86::createFastISel(mf, vm, bm, am, pn
2473#ifndef NDEBUG
2474                             , cil
2475#endif
2476                             );
2477}
2478
2479
2480//===----------------------------------------------------------------------===//
2481//                           Other Lowering Hooks
2482//===----------------------------------------------------------------------===//
2483
2484
2485SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2486  MachineFunction &MF = DAG.getMachineFunction();
2487  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2488  int ReturnAddrIndex = FuncInfo->getRAIndex();
2489
2490  if (ReturnAddrIndex == 0) {
2491    // Set up a frame object for the return address.
2492    uint64_t SlotSize = TD->getPointerSize();
2493    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2494                                                           false, false);
2495    FuncInfo->setRAIndex(ReturnAddrIndex);
2496  }
2497
2498  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2499}
2500
2501
2502bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2503                                       bool hasSymbolicDisplacement) {
2504  // Offset should fit into a 32-bit immediate field.
2505  if (!isInt<32>(Offset))
2506    return false;
2507
2508  // If we don't have a symbolic displacement - we don't have any extra
2509  // restrictions.
2510  if (!hasSymbolicDisplacement)
2511    return true;
2512
2513  // FIXME: Some tweaks might be needed for medium code model.
2514  if (M != CodeModel::Small && M != CodeModel::Kernel)
2515    return false;
2516
2517  // For the small code model we assume the last object lies within 16MB of the
2518  // end of the 31-bit address range. We may also accept fairly large negative
2519  // constants, since all objects lie in the positive half of the address space.
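  // For example, with a symbolic displacement, an offset of 1MB (or even -1MB)
  // is accepted under the small model, while an offset of 32MB is rejected.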
2520  if (M == CodeModel::Small && Offset < 16*1024*1024)
2521    return true;
2522
2523  // For the kernel code model we know that all objects reside in the negative
2524  // half of the 32-bit address space. We must not accept negative offsets, since
2525  // they may fall out of range, but we may accept fairly large positive ones.
2526  if (M == CodeModel::Kernel && Offset > 0)
2527    return true;
2528
2529  return false;
2530}
2531
2532/// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the
2533/// X86-specific condition code, returning the condition code and the LHS/RHS
2534/// of the comparison to make.
2535static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2536                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2537  if (!isFP) {
2538    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2539      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2540        // X > -1   -> X == 0, jump !sign.
2541        RHS = DAG.getConstant(0, RHS.getValueType());
2542        return X86::COND_NS;
2543      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2544        // X < 0   -> X == 0, jump on sign.
2545        return X86::COND_S;
2546      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2547        // X < 1   -> X <= 0
2548        RHS = DAG.getConstant(0, RHS.getValueType());
2549        return X86::COND_LE;
2550      }
2551    }
2552
2553    switch (SetCCOpcode) {
2554    default: llvm_unreachable("Invalid integer condition!");
2555    case ISD::SETEQ:  return X86::COND_E;
2556    case ISD::SETGT:  return X86::COND_G;
2557    case ISD::SETGE:  return X86::COND_GE;
2558    case ISD::SETLT:  return X86::COND_L;
2559    case ISD::SETLE:  return X86::COND_LE;
2560    case ISD::SETNE:  return X86::COND_NE;
2561    case ISD::SETULT: return X86::COND_B;
2562    case ISD::SETUGT: return X86::COND_A;
2563    case ISD::SETULE: return X86::COND_BE;
2564    case ISD::SETUGE: return X86::COND_AE;
2565    }
2566  }
2567
2568  // First determine whether it is required or profitable to flip the operands.
2569
2570  // If LHS is a foldable load, but RHS is not, flip the condition.
2571  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2572      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2573    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2574    std::swap(LHS, RHS);
2575  }
2576
2577  switch (SetCCOpcode) {
2578  default: break;
2579  case ISD::SETOLT:
2580  case ISD::SETOLE:
2581  case ISD::SETUGT:
2582  case ISD::SETUGE:
2583    std::swap(LHS, RHS);
2584    break;
2585  }
2586
2587  // On a floating point condition, the flags are set as follows:
2588  // ZF  PF  CF   op
2589  //  0 | 0 | 0 | X > Y
2590  //  0 | 0 | 1 | X < Y
2591  //  1 | 0 | 0 | X == Y
2592  //  1 | 1 | 1 | unordered
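  // For example, SETGT on floating point maps to X86::COND_A below: with the
  // flag assignments above, "X > Y" is exactly the case where both ZF and CF
  // are clear, which is what the unsigned "above" condition tests.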
2593  switch (SetCCOpcode) {
2594  default: llvm_unreachable("Condcode should be pre-legalized away");
2595  case ISD::SETUEQ:
2596  case ISD::SETEQ:   return X86::COND_E;
2597  case ISD::SETOLT:              // flipped
2598  case ISD::SETOGT:
2599  case ISD::SETGT:   return X86::COND_A;
2600  case ISD::SETOLE:              // flipped
2601  case ISD::SETOGE:
2602  case ISD::SETGE:   return X86::COND_AE;
2603  case ISD::SETUGT:              // flipped
2604  case ISD::SETULT:
2605  case ISD::SETLT:   return X86::COND_B;
2606  case ISD::SETUGE:              // flipped
2607  case ISD::SETULE:
2608  case ISD::SETLE:   return X86::COND_BE;
2609  case ISD::SETONE:
2610  case ISD::SETNE:   return X86::COND_NE;
2611  case ISD::SETUO:   return X86::COND_P;
2612  case ISD::SETO:    return X86::COND_NP;
2613  case ISD::SETOEQ:
2614  case ISD::SETUNE:  return X86::COND_INVALID;
2615  }
2616}
2617
2618/// hasFPCMov - is there a floating point cmov for the specific X86 condition
2619/// code. The current x86 ISA includes the following FP cmov instructions:
2620/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2621static bool hasFPCMov(unsigned X86CC) {
2622  switch (X86CC) {
2623  default:
2624    return false;
2625  case X86::COND_B:
2626  case X86::COND_BE:
2627  case X86::COND_E:
2628  case X86::COND_P:
2629  case X86::COND_A:
2630  case X86::COND_AE:
2631  case X86::COND_NE:
2632  case X86::COND_NP:
2633    return true;
2634  }
2635}
2636
2637/// isFPImmLegal - Returns true if the target can instruction select the
2638/// specified FP immediate natively. If false, the legalizer will
2639/// materialize the FP immediate as a load from a constant pool.
2640bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2641  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2642    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2643      return true;
2644  }
2645  return false;
2646}
2647
2648/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2649/// the specified range (L, H].
2650static bool isUndefOrInRange(int Val, int Low, int Hi) {
2651  return (Val < 0) || (Val >= Low && Val < Hi);
2652}
2653
2654/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2655/// specified value.
2656static bool isUndefOrEqual(int Val, int CmpVal) {
2657  if (Val < 0 || Val == CmpVal)
2658    return true;
2659  return false;
2660}
2661
2662/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2663/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
2664/// the second operand.
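/// For example, for v4i32 the mask <2, 1, 0, 3> is accepted (every index
/// references the first operand), while <0, 4, 1, 5> is rejected because
/// indices 4 and 5 reference the second operand.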
2665static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2666  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2667    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2668  if (VT == MVT::v2f64 || VT == MVT::v2i64)
2669    return (Mask[0] < 2 && Mask[1] < 2);
2670  return false;
2671}
2672
2673bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2674  SmallVector<int, 8> M;
2675  N->getMask(M);
2676  return ::isPSHUFDMask(M, N->getValueType(0));
2677}
2678
2679/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2680/// is suitable for input to PSHUFHW.
2681static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2682  if (VT != MVT::v8i16)
2683    return false;
2684
2685  // Lower quadword copied in order or undef.
2686  for (int i = 0; i != 4; ++i)
2687    if (Mask[i] >= 0 && Mask[i] != i)
2688      return false;
2689
2690  // Upper quadword shuffled.
2691  for (int i = 4; i != 8; ++i)
2692    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2693      return false;
2694
2695  return true;
2696}
2697
2698bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2699  SmallVector<int, 8> M;
2700  N->getMask(M);
2701  return ::isPSHUFHWMask(M, N->getValueType(0));
2702}
2703
2704/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2705/// is suitable for input to PSHUFLW.
2706static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2707  if (VT != MVT::v8i16)
2708    return false;
2709
2710  // Upper quadword copied in order.
2711  for (int i = 4; i != 8; ++i)
2712    if (Mask[i] >= 0 && Mask[i] != i)
2713      return false;
2714
2715  // Lower quadword shuffled.
2716  for (int i = 0; i != 4; ++i)
2717    if (Mask[i] >= 4)
2718      return false;
2719
2720  return true;
2721}
2722
2723bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2724  SmallVector<int, 8> M;
2725  N->getMask(M);
2726  return ::isPSHUFLWMask(M, N->getValueType(0));
2727}
2728
2729/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
2730/// is suitable for input to PALIGNR.
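/// For example, for v4i32 (with SSSE3 available) the mask <1, 2, 3, 4> is
/// accepted since the selected elements are consecutive starting at offset 1,
/// while <1, 3, 5, 7> is rejected because the elements are not consecutive.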
2731static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
2732                          bool hasSSSE3) {
2733  int i, e = VT.getVectorNumElements();
2734
2735  // Do not handle v2i64 / v2f64 shuffles with palignr.
2736  if (e < 4 || !hasSSSE3)
2737    return false;
2738
2739  for (i = 0; i != e; ++i)
2740    if (Mask[i] >= 0)
2741      break;
2742
2743  // All undef, not a palignr.
2744  if (i == e)
2745    return false;
2746
2747  // Determine if it's ok to perform a palignr with only the LHS, since we
2748  // don't have access to the actual shuffle elements to see if RHS is undef.
2749  bool Unary = Mask[i] < (int)e;
2750  bool NeedsUnary = false;
2751
2752  int s = Mask[i] - i;
2753
2754  // Check the rest of the elements to see if they are consecutive.
2755  for (++i; i != e; ++i) {
2756    int m = Mask[i];
2757    if (m < 0)
2758      continue;
2759
2760    Unary = Unary && (m < (int)e);
2761    NeedsUnary = NeedsUnary || (m < s);
2762
2763    if (NeedsUnary && !Unary)
2764      return false;
2765    if (Unary && m != ((s+i) & (e-1)))
2766      return false;
2767    if (!Unary && m != (s+i))
2768      return false;
2769  }
2770  return true;
2771}
2772
2773bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
2774  SmallVector<int, 8> M;
2775  N->getMask(M);
2776  return ::isPALIGNRMask(M, N->getValueType(0), true);
2777}
2778
2779/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2780/// specifies a shuffle of elements that is suitable for input to SHUFP*.
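/// For example, for v4f32 the mask <0, 1, 4, 5> is accepted (low half from the
/// first operand, high half from the second), while <4, 5, 0, 1> is rejected
/// here and is instead matched by isCommutedSHUFPMask below.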
2781static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2782  int NumElems = VT.getVectorNumElements();
2783  if (NumElems != 2 && NumElems != 4)
2784    return false;
2785
2786  int Half = NumElems / 2;
2787  for (int i = 0; i < Half; ++i)
2788    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2789      return false;
2790  for (int i = Half; i < NumElems; ++i)
2791    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2792      return false;
2793
2794  return true;
2795}
2796
2797bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
2798  SmallVector<int, 8> M;
2799  N->getMask(M);
2800  return ::isSHUFPMask(M, N->getValueType(0));
2801}
2802
2803/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2804/// the reverse of what x86 shuffles want. x86 shuffles require the lower
2805/// half elements to come from vector 1 (which would equal the dest.) and
2806/// the upper half to come from vector 2.
2807static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2808  int NumElems = VT.getVectorNumElements();
2809
2810  if (NumElems != 2 && NumElems != 4)
2811    return false;
2812
2813  int Half = NumElems / 2;
2814  for (int i = 0; i < Half; ++i)
2815    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2816      return false;
2817  for (int i = Half; i < NumElems; ++i)
2818    if (!isUndefOrInRange(Mask[i], 0, NumElems))
2819      return false;
2820  return true;
2821}
2822
2823static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
2824  SmallVector<int, 8> M;
2825  N->getMask(M);
2826  return isCommutedSHUFPMask(M, N->getValueType(0));
2827}
2828
2829/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2830/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
2831bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
2832  if (N->getValueType(0).getVectorNumElements() != 4)
2833    return false;
2834
2835  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
2836  return isUndefOrEqual(N->getMaskElt(0), 6) &&
2837         isUndefOrEqual(N->getMaskElt(1), 7) &&
2838         isUndefOrEqual(N->getMaskElt(2), 2) &&
2839         isUndefOrEqual(N->getMaskElt(3), 3);
2840}
2841
2842/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
2843/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
2844/// <2, 3, 2, 3>
2845bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
2846  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2847
2848  if (NumElems != 4)
2849    return false;
2850
2851  return isUndefOrEqual(N->getMaskElt(0), 2) &&
2852         isUndefOrEqual(N->getMaskElt(1), 3) &&
2853         isUndefOrEqual(N->getMaskElt(2), 2) &&
2854         isUndefOrEqual(N->getMaskElt(3), 3);
2855}
2856
2857/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2858/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
2859bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
2860  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2861
2862  if (NumElems != 2 && NumElems != 4)
2863    return false;
2864
2865  for (unsigned i = 0; i < NumElems/2; ++i)
2866    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
2867      return false;
2868
2869  for (unsigned i = NumElems/2; i < NumElems; ++i)
2870    if (!isUndefOrEqual(N->getMaskElt(i), i))
2871      return false;
2872
2873  return true;
2874}
2875
2876/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
2877/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
2878bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
2879  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2880
2881  if (NumElems != 2 && NumElems != 4)
2882    return false;
2883
2884  for (unsigned i = 0; i < NumElems/2; ++i)
2885    if (!isUndefOrEqual(N->getMaskElt(i), i))
2886      return false;
2887
2888  for (unsigned i = 0; i < NumElems/2; ++i)
2889    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
2890      return false;
2891
2892  return true;
2893}
2894
2895/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
2896/// specifies a shuffle of elements that is suitable for input to UNPCKL.
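/// For example, for v4i32 the canonical unpcklps interleave mask <0, 4, 1, 5>
/// is accepted: result elements 0 and 2 come from elements 0 and 1 of the
/// first operand, and elements 1 and 3 from the corresponding elements of the
/// second operand.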
2897static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
2898                         bool V2IsSplat = false) {
2899  int NumElts = VT.getVectorNumElements();
2900  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2901    return false;
2902
2903  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2904    int BitI  = Mask[i];
2905    int BitI1 = Mask[i+1];
2906    if (!isUndefOrEqual(BitI, j))
2907      return false;
2908    if (V2IsSplat) {
2909      if (!isUndefOrEqual(BitI1, NumElts))
2910        return false;
2911    } else {
2912      if (!isUndefOrEqual(BitI1, j + NumElts))
2913        return false;
2914    }
2915  }
2916  return true;
2917}
2918
2919bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2920  SmallVector<int, 8> M;
2921  N->getMask(M);
2922  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
2923}
2924
2925/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
2926/// specifies a shuffle of elements that is suitable for input to UNPCKH.
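/// For example, for v4i32 the canonical unpckhps interleave mask <2, 6, 3, 7>
/// is accepted: the high halves of the two operands are interleaved.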
2927static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
2928                         bool V2IsSplat = false) {
2929  int NumElts = VT.getVectorNumElements();
2930  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2931    return false;
2932
2933  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2934    int BitI  = Mask[i];
2935    int BitI1 = Mask[i+1];
2936    if (!isUndefOrEqual(BitI, j + NumElts/2))
2937      return false;
2938    if (V2IsSplat) {
2939      if (isUndefOrEqual(BitI1, NumElts))
2940        return false;
2941    } else {
2942      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2943        return false;
2944    }
2945  }
2946  return true;
2947}
2948
2949bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2950  SmallVector<int, 8> M;
2951  N->getMask(M);
2952  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2953}
2954
2955/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2956/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2957/// <0, 0, 1, 1>
2958static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2959  int NumElems = VT.getVectorNumElements();
2960  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2961    return false;
2962
2963  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2964    int BitI  = Mask[i];
2965    int BitI1 = Mask[i+1];
2966    if (!isUndefOrEqual(BitI, j))
2967      return false;
2968    if (!isUndefOrEqual(BitI1, j))
2969      return false;
2970  }
2971  return true;
2972}
2973
2974bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2975  SmallVector<int, 8> M;
2976  N->getMask(M);
2977  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2978}
2979
2980/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2981/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2982/// <2, 2, 3, 3>
2983static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2984  int NumElems = VT.getVectorNumElements();
2985  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2986    return false;
2987
2988  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2989    int BitI  = Mask[i];
2990    int BitI1 = Mask[i+1];
2991    if (!isUndefOrEqual(BitI, j))
2992      return false;
2993    if (!isUndefOrEqual(BitI1, j))
2994      return false;
2995  }
2996  return true;
2997}
2998
2999bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
3000  SmallVector<int, 8> M;
3001  N->getMask(M);
3002  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
3003}
3004
3005/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3006/// specifies a shuffle of elements that is suitable for input to MOVSS,
3007/// MOVSD, and MOVD, i.e. setting the lowest element.
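/// For example, for v4f32 the mask <4, 1, 2, 3> is accepted: element 0 comes
/// from the second operand and the remaining elements are taken unchanged
/// from the first, matching the register form of movss.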
3008static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3009  if (VT.getVectorElementType().getSizeInBits() < 32)
3010    return false;
3011
3012  int NumElts = VT.getVectorNumElements();
3013
3014  if (!isUndefOrEqual(Mask[0], NumElts))
3015    return false;
3016
3017  for (int i = 1; i < NumElts; ++i)
3018    if (!isUndefOrEqual(Mask[i], i))
3019      return false;
3020
3021  return true;
3022}
3023
3024bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
3025  SmallVector<int, 8> M;
3026  N->getMask(M);
3027  return ::isMOVLMask(M, N->getValueType(0));
3028}
3029
3030/// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
3031/// x86 movss wants. X86 movss requires the lowest element to be the lowest
3032/// element of vector 2 and the other elements to come from vector 1 in order.
3033static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
3034                               bool V2IsSplat = false, bool V2IsUndef = false) {
3035  int NumOps = VT.getVectorNumElements();
3036  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
3037    return false;
3038
3039  if (!isUndefOrEqual(Mask[0], 0))
3040    return false;
3041
3042  for (int i = 1; i < NumOps; ++i)
3043    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
3044          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
3045          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
3046      return false;
3047
3048  return true;
3049}
3050
3051static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
3052                           bool V2IsUndef = false) {
3053  SmallVector<int, 8> M;
3054  N->getMask(M);
3055  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
3056}
3057
3058/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3059/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
3060bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
3061  if (N->getValueType(0).getVectorNumElements() != 4)
3062    return false;
3063
3064  // Expect 1, 1, 3, 3
3065  for (unsigned i = 0; i < 2; ++i) {
3066    int Elt = N->getMaskElt(i);
3067    if (Elt >= 0 && Elt != 1)
3068      return false;
3069  }
3070
3071  bool HasHi = false;
3072  for (unsigned i = 2; i < 4; ++i) {
3073    int Elt = N->getMaskElt(i);
3074    if (Elt >= 0 && Elt != 3)
3075      return false;
3076    if (Elt == 3)
3077      HasHi = true;
3078  }
3079  // Don't use movshdup if it can be done with a shufps.
3080  // FIXME: verify that matching u, u, 3, 3 is what we want.
3081  return HasHi;
3082}
3083
3084/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3085/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
3086bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
3087  if (N->getValueType(0).getVectorNumElements() != 4)
3088    return false;
3089
3090  // Expect 0, 0, 2, 2
3091  for (unsigned i = 0; i < 2; ++i)
3092    if (N->getMaskElt(i) > 0)
3093      return false;
3094
3095  bool HasHi = false;
3096  for (unsigned i = 2; i < 4; ++i) {
3097    int Elt = N->getMaskElt(i);
3098    if (Elt >= 0 && Elt != 2)
3099      return false;
3100    if (Elt == 2)
3101      HasHi = true;
3102  }
3103  // Don't use movsldup if it can be done with a shufps.
3104  return HasHi;
3105}
3106
3107/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
3108/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
3109bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
3110  int e = N->getValueType(0).getVectorNumElements() / 2;
3111
3112  for (int i = 0; i < e; ++i)
3113    if (!isUndefOrEqual(N->getMaskElt(i), i))
3114      return false;
3115  for (int i = 0; i < e; ++i)
3116    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
3117      return false;
3118  return true;
3119}
3120
3121/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
3122/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
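/// For example, the v4f32 mask <3, 1, 2, 0> yields the immediate 0x27
/// (binary 00 10 01 11): two bits per destination element, with element 0
/// encoded in the lowest pair of bits.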
3123unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
3124  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3125  int NumOperands = SVOp->getValueType(0).getVectorNumElements();
3126
3127  unsigned Shift = (NumOperands == 4) ? 2 : 1;
3128  unsigned Mask = 0;
3129  for (int i = 0; i < NumOperands; ++i) {
3130    int Val = SVOp->getMaskElt(NumOperands-i-1);
3131    if (Val < 0) Val = 0;
3132    if (Val >= NumOperands) Val -= NumOperands;
3133    Mask |= Val;
3134    if (i != NumOperands - 1)
3135      Mask <<= Shift;
3136  }
3137  return Mask;
3138}
3139
3140/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
3141/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
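/// For example, the v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> (low quadword
/// unchanged, high quadword reversed) yields the immediate 0x1B.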
3142unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
3143  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3144  unsigned Mask = 0;
3145  // 8 elements, but we only care about the last 4.
3146  for (unsigned i = 7; i >= 4; --i) {
3147    int Val = SVOp->getMaskElt(i);
3148    if (Val >= 0)
3149      Mask |= (Val - 4);
3150    if (i != 4)
3151      Mask <<= 2;
3152  }
3153  return Mask;
3154}
3155
3156/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
3157/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
3158unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
3159  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3160  unsigned Mask = 0;
3161  // 8 elements, but we only care about the first 4.
3162  for (int i = 3; i >= 0; --i) {
3163    int Val = SVOp->getMaskElt(i);
3164    if (Val >= 0)
3165      Mask |= Val;
3166    if (i != 0)
3167      Mask <<= 2;
3168  }
3169  return Mask;
3170}
3171
3172/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
3173/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
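/// For example, for the v4i32 mask <1, 2, 3, 4> the first defined element is
/// 1 at position 0 and the element size is 4 bytes, so the returned byte
/// shift is (1 - 0) * 4 = 4.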
3174unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
3175  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
3176  EVT VVT = N->getValueType(0);
3177  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
3178  int Val = 0;
3179
3180  unsigned i, e;
3181  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
3182    Val = SVOp->getMaskElt(i);
3183    if (Val >= 0)
3184      break;
3185  }
3186  return (Val - i) * EltSize;
3187}
3188
3189/// isZeroNode - Returns true if Elt is a constant zero or a floating point
3190/// constant +0.0.
3191bool X86::isZeroNode(SDValue Elt) {
3192  return ((isa<ConstantSDNode>(Elt) &&
3193           cast<ConstantSDNode>(Elt)->isNullValue()) ||
3194          (isa<ConstantFPSDNode>(Elt) &&
3195           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
3196}
3197
3198/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
3199/// their permute mask.
3200static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
3201                                    SelectionDAG &DAG) {
3202  EVT VT = SVOp->getValueType(0);
3203  unsigned NumElems = VT.getVectorNumElements();
3204  SmallVector<int, 8> MaskVec;
3205
3206  for (unsigned i = 0; i != NumElems; ++i) {
3207    int idx = SVOp->getMaskElt(i);
3208    if (idx < 0)
3209      MaskVec.push_back(idx);
3210    else if (idx < (int)NumElems)
3211      MaskVec.push_back(idx + NumElems);
3212    else
3213      MaskVec.push_back(idx - NumElems);
3214  }
3215  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
3216                              SVOp->getOperand(0), &MaskVec[0]);
3217}
3218
3219/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
3220/// the two vector operands have swapped position.
3221static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3222  unsigned NumElems = VT.getVectorNumElements();
3223  for (unsigned i = 0; i != NumElems; ++i) {
3224    int idx = Mask[i];
3225    if (idx < 0)
3226      continue;
3227    else if (idx < (int)NumElems)
3228      Mask[i] = idx + NumElems;
3229    else
3230      Mask[i] = idx - NumElems;
3231  }
3232}
3233
3234/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3235/// match movhlps. The lower half elements should come from upper half of
3236/// V1 (and in order), and the upper half elements should come from the upper
3237/// half of V2 (and in order).
3238static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3239  if (Op->getValueType(0).getVectorNumElements() != 4)
3240    return false;
3241  for (unsigned i = 0, e = 2; i != e; ++i)
3242    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3243      return false;
3244  for (unsigned i = 2; i != 4; ++i)
3245    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3246      return false;
3247  return true;
3248}
3249
3250/// isScalarLoadToVector - Returns true if the node is a scalar load that
3251/// is promoted to a vector. It also returns the LoadSDNode by reference if
3252/// required.
3253static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3254  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3255    return false;
3256  N = N->getOperand(0).getNode();
3257  if (!ISD::isNON_EXTLoad(N))
3258    return false;
3259  if (LD)
3260    *LD = cast<LoadSDNode>(N);
3261  return true;
3262}
3263
3264/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3265/// match movlp{s|d}. The lower half elements should come from lower half of
3266/// V1 (and in order), and the upper half elements should come from the upper
3267/// half of V2 (and in order). And since V1 will become the source of the
3268/// MOVLP, it must be either a vector load or a scalar load to vector.
3269static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3270                               ShuffleVectorSDNode *Op) {
3271  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3272    return false;
3273  // If V2 is a vector load, don't do this transformation. We will try to use
3274  // a load-folding shufps instead.
3275  if (ISD::isNON_EXTLoad(V2))
3276    return false;
3277
3278  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3279
3280  if (NumElems != 2 && NumElems != 4)
3281    return false;
3282  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3283    if (!isUndefOrEqual(Op->getMaskElt(i), i))
3284      return false;
3285  for (unsigned i = NumElems/2; i != NumElems; ++i)
3286    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3287      return false;
3288  return true;
3289}
3290
3291/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3292/// all the same.
3293static bool isSplatVector(SDNode *N) {
3294  if (N->getOpcode() != ISD::BUILD_VECTOR)
3295    return false;
3296
3297  SDValue SplatValue = N->getOperand(0);
3298  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3299    if (N->getOperand(i) != SplatValue)
3300      return false;
3301  return true;
3302}
3303
3304/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3305/// to a zero vector.
3306/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
3307static bool isZeroShuffle(ShuffleVectorSDNode *N) {
3308  SDValue V1 = N->getOperand(0);
3309  SDValue V2 = N->getOperand(1);
3310  unsigned NumElems = N->getValueType(0).getVectorNumElements();
3311  for (unsigned i = 0; i != NumElems; ++i) {
3312    int Idx = N->getMaskElt(i);
3313    if (Idx >= (int)NumElems) {
3314      unsigned Opc = V2.getOpcode();
3315      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
3316        continue;
3317      if (Opc != ISD::BUILD_VECTOR ||
3318          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
3319        return false;
3320    } else if (Idx >= 0) {
3321      unsigned Opc = V1.getOpcode();
3322      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3323        continue;
3324      if (Opc != ISD::BUILD_VECTOR ||
3325          !X86::isZeroNode(V1.getOperand(Idx)))
3326        return false;
3327    }
3328  }
3329  return true;
3330}
3331
3332/// getZeroVector - Returns a vector of specified type with all zero elements.
3333///
3334static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3335                             DebugLoc dl) {
3336  assert(VT.isVector() && "Expected a vector type");
3337
3338  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3339  // type.  This ensures they get CSE'd.
3340  SDValue Vec;
3341  if (VT.getSizeInBits() == 64) { // MMX
3342    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3343    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3344  } else if (HasSSE2) {  // SSE2
3345    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3346    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3347  } else { // SSE1
3348    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3349    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3350  }
3351  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3352}
3353
3354/// getOnesVector - Returns a vector of specified type with all bits set.
3355///
3356static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3357  assert(VT.isVector() && "Expected a vector type");
3358
3359  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3360  // type.  This ensures they get CSE'd.
3361  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3362  SDValue Vec;
3363  if (VT.getSizeInBits() == 64)  // MMX
3364    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3365  else                                              // SSE
3366    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3367  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3368}
3369
3370
3371/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3372/// that point to V2 point to its first element.
3373static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3374  EVT VT = SVOp->getValueType(0);
3375  unsigned NumElems = VT.getVectorNumElements();
3376
3377  bool Changed = false;
3378  SmallVector<int, 8> MaskVec;
3379  SVOp->getMask(MaskVec);
3380
3381  for (unsigned i = 0; i != NumElems; ++i) {
3382    if (MaskVec[i] > (int)NumElems) {
3383      MaskVec[i] = NumElems;
3384      Changed = true;
3385    }
3386  }
3387  if (Changed)
3388    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3389                                SVOp->getOperand(1), &MaskVec[0]);
3390  return SDValue(SVOp, 0);
3391}
3392
3393/// getMOVL - Returns a vector_shuffle node for a movs{s|d} / movd
3394/// operation of the specified width.
3395static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3396                       SDValue V2) {
3397  unsigned NumElems = VT.getVectorNumElements();
3398  SmallVector<int, 8> Mask;
3399  Mask.push_back(NumElems);
3400  for (unsigned i = 1; i != NumElems; ++i)
3401    Mask.push_back(i);
3402  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3403}
3404
3405/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
3406static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3407                          SDValue V2) {
3408  unsigned NumElems = VT.getVectorNumElements();
3409  SmallVector<int, 8> Mask;
3410  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3411    Mask.push_back(i);
3412    Mask.push_back(i + NumElems);
3413  }
3414  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3415}
3416
3417/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
3418static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3419                          SDValue V2) {
3420  unsigned NumElems = VT.getVectorNumElements();
3421  unsigned Half = NumElems/2;
3422  SmallVector<int, 8> Mask;
3423  for (unsigned i = 0; i != Half; ++i) {
3424    Mask.push_back(i + Half);
3425    Mask.push_back(i + NumElems + Half);
3426  }
3427  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3428}
3429
3430/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
3431static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
3432                            bool HasSSE2) {
3433  if (SV->getValueType(0).getVectorNumElements() <= 4)
3434    return SDValue(SV, 0);
3435
3436  EVT PVT = MVT::v4f32;
3437  EVT VT = SV->getValueType(0);
3438  DebugLoc dl = SV->getDebugLoc();
3439  SDValue V1 = SV->getOperand(0);
3440  int NumElems = VT.getVectorNumElements();
3441  int EltNo = SV->getSplatIndex();
3442
3443  // unpack elements to the correct location
3444  while (NumElems > 4) {
3445    if (EltNo < NumElems/2) {
3446      V1 = getUnpackl(DAG, dl, VT, V1, V1);
3447    } else {
3448      V1 = getUnpackh(DAG, dl, VT, V1, V1);
3449      EltNo -= NumElems/2;
3450    }
3451    NumElems >>= 1;
3452  }
3453
3454  // Perform the splat.
3455  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3456  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3457  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3458  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3459}
3460
3461/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3462/// vector with a zero or undef vector.  This produces a shuffle where the low
3463/// element of V2 is swizzled into the zero/undef vector, landing at element
3464/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
3465static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3466                                             bool isZero, bool HasSSE2,
3467                                             SelectionDAG &DAG) {
3468  EVT VT = V2.getValueType();
3469  SDValue V1 = isZero
3470    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3471  unsigned NumElems = VT.getVectorNumElements();
3472  SmallVector<int, 16> MaskVec;
3473  for (unsigned i = 0; i != NumElems; ++i)
3474    // If this is the insertion idx, put the low elt of V2 here.
3475    MaskVec.push_back(i == Idx ? NumElems : i);
3476  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3477}
3478
3479/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
3480/// shuffle result, counted from either the low or the high end, that are zero.
3481static
3482unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
3483                                  bool Low, SelectionDAG &DAG) {
3484  unsigned NumZeros = 0;
3485  for (int i = 0; i < NumElems; ++i) {
3486    unsigned Index = Low ? i : NumElems-i-1;
3487    int Idx = SVOp->getMaskElt(Index);
3488    if (Idx < 0) {
3489      ++NumZeros;
3490      continue;
3491    }
3492    SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
3493    if (Elt.getNode() && X86::isZeroNode(Elt))
3494      ++NumZeros;
3495    else
3496      break;
3497  }
3498  return NumZeros;
3499}
3500
3501/// isVectorShift - Returns true if the shuffle can be implemented as a
3502/// logical left or right shift of a vector.
3503/// FIXME: split into pslldqi, psrldqi, palignr variants.
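/// For example, shuffling a v4i32 value V1 with an all-zeros build_vector
/// using the mask <4, 0, 1, 2> produces <0, V1[0], V1[1], V1[2]>; this is
/// recognized as a logical left shift of V1 by one element (isLeft = true,
/// ShAmt = 1).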
3504static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3505                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3506  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3507
3508  isLeft = true;
3509  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
3510  if (!NumZeros) {
3511    isLeft = false;
3512    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
3513    if (!NumZeros)
3514      return false;
3515  }
3516  bool SeenV1 = false;
3517  bool SeenV2 = false;
3518  for (unsigned i = NumZeros; i < NumElems; ++i) {
3519    unsigned Val = isLeft ? (i - NumZeros) : i;
3520    int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
3521    if (Idx_ < 0)
3522      continue;
3523    unsigned Idx = (unsigned) Idx_;
3524    if (Idx < NumElems)
3525      SeenV1 = true;
3526    else {
3527      Idx -= NumElems;
3528      SeenV2 = true;
3529    }
3530    if (Idx != Val)
3531      return false;
3532  }
3533  if (SeenV1 && SeenV2)
3534    return false;
3535
3536  ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
3537  ShAmt = NumZeros;
3538  return true;
3539}
3540
3541
3542/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
3543///
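/// Non-zero bytes are combined pairwise: bytes 2k and 2k+1 are zero-extended
/// to i16, merged as (b(2k+1) << 8) | b(2k), inserted as element k of a v8i16
/// vector, and the result is bitcast back to v16i8.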
3544static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3545                                       unsigned NumNonZero, unsigned NumZero,
3546                                       SelectionDAG &DAG,
3547                                       const TargetLowering &TLI) {
3548  if (NumNonZero > 8)
3549    return SDValue();
3550
3551  DebugLoc dl = Op.getDebugLoc();
3552  SDValue V(0, 0);
3553  bool First = true;
3554  for (unsigned i = 0; i < 16; ++i) {
3555    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3556    if (ThisIsNonZero && First) {
3557      if (NumZero)
3558        V = getZeroVector(MVT::v8i16, true, DAG, dl);
3559      else
3560        V = DAG.getUNDEF(MVT::v8i16);
3561      First = false;
3562    }
3563
3564    if ((i & 1) != 0) {
3565      SDValue ThisElt(0, 0), LastElt(0, 0);
3566      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3567      if (LastIsNonZero) {
3568        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3569                              MVT::i16, Op.getOperand(i-1));
3570      }
3571      if (ThisIsNonZero) {
3572        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3573        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3574                              ThisElt, DAG.getConstant(8, MVT::i8));
3575        if (LastIsNonZero)
3576          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3577      } else
3578        ThisElt = LastElt;
3579
3580      if (ThisElt.getNode())
3581        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3582                        DAG.getIntPtrConstant(i/2));
3583    }
3584  }
3585
3586  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
3587}
3588
3589/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3590///
3591static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3592                                     unsigned NumNonZero, unsigned NumZero,
3593                                     SelectionDAG &DAG,
3594                                     const TargetLowering &TLI) {
3595  if (NumNonZero > 4)
3596    return SDValue();
3597
3598  DebugLoc dl = Op.getDebugLoc();
3599  SDValue V(0, 0);
3600  bool First = true;
3601  for (unsigned i = 0; i < 8; ++i) {
3602    bool isNonZero = (NonZeros & (1 << i)) != 0;
3603    if (isNonZero) {
3604      if (First) {
3605        if (NumZero)
3606          V = getZeroVector(MVT::v8i16, true, DAG, dl);
3607        else
3608          V = DAG.getUNDEF(MVT::v8i16);
3609        First = false;
3610      }
3611      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3612                      MVT::v8i16, V, Op.getOperand(i),
3613                      DAG.getIntPtrConstant(i));
3614    }
3615  }
3616
3617  return V;
3618}
3619
3620/// getVShift - Return a vector logical shift node.
3621///
3622static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
3623                         unsigned NumBits, SelectionDAG &DAG,
3624                         const TargetLowering &TLI, DebugLoc dl) {
3625  bool isMMX = VT.getSizeInBits() == 64;
3626  EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3627  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3628  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3629  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3630                     DAG.getNode(Opc, dl, ShVT, SrcOp,
3631                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3632}
3633
3634SDValue
3635X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
3636                                          SelectionDAG &DAG) const {
3637
3638  // Check if the scalar load can be widened into a vector load. And if
3639  // the address is "base + cst" see if the cst can be "absorbed" into
3640  // the shuffle mask.
3641  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
3642    SDValue Ptr = LD->getBasePtr();
3643    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
3644      return SDValue();
3645    EVT PVT = LD->getValueType(0);
3646    if (PVT != MVT::i32 && PVT != MVT::f32)
3647      return SDValue();
3648
3649    int FI = -1;
3650    int64_t Offset = 0;
3651    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
3652      FI = FINode->getIndex();
3653      Offset = 0;
3654    } else if (Ptr.getOpcode() == ISD::ADD &&
3655               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
3656               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
3657      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
3658      Offset = Ptr.getConstantOperandVal(1);
3659      Ptr = Ptr.getOperand(0);
3660    } else {
3661      return SDValue();
3662    }
3663
3664    SDValue Chain = LD->getChain();
3665    // Make sure the stack object alignment is at least 16.
3666    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3667    if (DAG.InferPtrAlignment(Ptr) < 16) {
3668      if (MFI->isFixedObjectIndex(FI)) {
3669        // Can't change the alignment. FIXME: It's possible to compute
3670        // the exact stack offset and reference FI + adjust offset instead.
3671        // If someone *really* cares about this, that's the way to implement it.
3672        return SDValue();
3673      } else {
3674        MFI->setObjectAlignment(FI, 16);
3675      }
3676    }
3677
3678    // (Offset % 16) must be a multiple of 4. The address is then
3679    // Ptr + (Offset & ~15).
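    // For example, an offset of 20 from a 16-byte-aligned base gives
    // StartOffset = 16 and EltNo = 1, so we load a full vector at Ptr + 16 and
    // splat element 1 of it.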
3680    if (Offset < 0)
3681      return SDValue();
3682    if ((Offset % 16) & 3)
3683      return SDValue();
3684    int64_t StartOffset = Offset & ~15;
3685    if (StartOffset)
3686      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
3687                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
3688
3689    int EltNo = (Offset - StartOffset) >> 2;
3690    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
3691    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
3692    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0,
3693                             false, false, 0);
3694    // Canonicalize it to a v4i32 shuffle.
3695    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
3696    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3697                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
3698                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
3699  }
3700
3701  return SDValue();
3702}
3703
3704/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
3705/// vector of type 'VT', see if the elements can be replaced by a single large
3706/// load which has the same value as a build_vector whose operands are 'elts'.
3707///
3708/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
3709///
3710/// FIXME: we'd also like to handle the case where the last elements are zero
3711/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
3712/// There's even a handy isZeroNode for that purpose.
3713static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
3714                                        DebugLoc &dl, SelectionDAG &DAG) {
3715  EVT EltVT = VT.getVectorElementType();
3716  unsigned NumElems = Elts.size();
3717
3718  LoadSDNode *LDBase = NULL;
3719  unsigned LastLoadedElt = -1U;
3720
3721  // For each element in the initializer, see if we've found a load or an undef.
3722  // If we don't find an initial load element, or later load elements are
3723  // non-consecutive, bail out.
3724  for (unsigned i = 0; i < NumElems; ++i) {
3725    SDValue Elt = Elts[i];
3726
3727    if (!Elt.getNode() ||
3728        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
3729      return SDValue();
3730    if (!LDBase) {
3731      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
3732        return SDValue();
3733      LDBase = cast<LoadSDNode>(Elt.getNode());
3734      LastLoadedElt = i;
3735      continue;
3736    }
3737    if (Elt.getOpcode() == ISD::UNDEF)
3738      continue;
3739
3740    LoadSDNode *LD = cast<LoadSDNode>(Elt);
3741    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
3742      return SDValue();
3743    LastLoadedElt = i;
3744  }
3745
3746  // If we have found an entire vector of loads and undefs, then return a large
3747  // load of the entire vector width starting at the base pointer.  If we found
3748  // consecutive loads for the low half, generate a vzext_load node.
3749  if (LastLoadedElt == NumElems - 1) {
3750    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
3751      return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
3752                         LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
3753                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
3754    return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
3755                       LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
3756                       LDBase->isVolatile(), LDBase->isNonTemporal(),
3757                       LDBase->getAlignment());
3758  } else if (NumElems == 4 && LastLoadedElt == 1) {
3759    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
3760    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
3761    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
3762    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
3763  }
3764  return SDValue();
3765}
3766
3767SDValue
3768X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
3769  DebugLoc dl = Op.getDebugLoc();
3770  // All zeros are handled with pxor, all ones are handled with pcmpeqd.
3771  if (ISD::isBuildVectorAllZeros(Op.getNode())
3772      || ISD::isBuildVectorAllOnes(Op.getNode())) {
3773    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3774    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3775    // eliminated on x86-32 hosts.
3776    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3777      return Op;
3778
3779    if (ISD::isBuildVectorAllOnes(Op.getNode()))
3780      return getOnesVector(Op.getValueType(), DAG, dl);
3781    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
3782  }
3783
3784  EVT VT = Op.getValueType();
3785  EVT ExtVT = VT.getVectorElementType();
3786  unsigned EVTBits = ExtVT.getSizeInBits();
3787
3788  unsigned NumElems = Op.getNumOperands();
3789  unsigned NumZero  = 0;
3790  unsigned NumNonZero = 0;
3791  unsigned NonZeros = 0;
3792  bool IsAllConstants = true;
3793  SmallSet<SDValue, 8> Values;
3794  for (unsigned i = 0; i < NumElems; ++i) {
3795    SDValue Elt = Op.getOperand(i);
3796    if (Elt.getOpcode() == ISD::UNDEF)
3797      continue;
3798    Values.insert(Elt);
3799    if (Elt.getOpcode() != ISD::Constant &&
3800        Elt.getOpcode() != ISD::ConstantFP)
3801      IsAllConstants = false;
3802    if (X86::isZeroNode(Elt))
3803      NumZero++;
3804    else {
3805      NonZeros |= (1 << i);
3806      NumNonZero++;
3807    }
3808  }
3809
3810  if (NumNonZero == 0) {
3811    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
3812    return DAG.getUNDEF(VT);
3813  }
3814
3815  // Special case for single non-zero, non-undef, element.
3816  if (NumNonZero == 1) {
3817    unsigned Idx = CountTrailingZeros_32(NonZeros);
3818    SDValue Item = Op.getOperand(Idx);
3819
3820    // If this is an insertion of an i64 value on x86-32, and if the top bits of
3821    // the value are obviously zero, truncate the value to i32 and do the
3822    // insertion that way.  Only do this if the value is non-constant or if the
3823    // value is a constant being inserted into element 0.  It is cheaper to do
3824    // a constant pool load than it is to do a movd + shuffle.
3825    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
3826        (!IsAllConstants || Idx == 0)) {
3827      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3828        // Handle MMX and SSE both.
3829        EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3830        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3831
3832        // Truncate the value (which may itself be a constant) to i32, and
3833        // convert it to a vector with movd (S2V+shuffle to zero extend).
3834        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
3835        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
3836        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3837                                           Subtarget->hasSSE2(), DAG);
3838
3839        // Now we have our 32-bit value zero extended in the low element of
3840        // a vector.  If Idx != 0, swizzle it into place.
3841        if (Idx != 0) {
3842          SmallVector<int, 4> Mask;
3843          Mask.push_back(Idx);
3844          for (unsigned i = 1; i != VecElts; ++i)
3845            Mask.push_back(i);
3846          Item = DAG.getVectorShuffle(VecVT, dl, Item,
3847                                      DAG.getUNDEF(Item.getValueType()),
3848                                      &Mask[0]);
3849        }
3850        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
3851      }
3852    }
3853
3854    // If we have a constant or non-constant insertion into the low element of
3855    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
3856    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
3857    // depending on what the source datatype is.
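    // For example, an f32 scalar with the remaining elements zero becomes a
    // movss against the zero vector, and an i64 scalar into a v2i64 on x86-64
    // becomes a movq.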
3858    if (Idx == 0) {
3859      if (NumZero == 0) {
3860        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3861      } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
3862          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
3863        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3864        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
3865        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
3866                                           DAG);
3867      } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
3868        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
3869        EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
3870        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
3871        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3872                                           Subtarget->hasSSE2(), DAG);
3873        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
3874      }
3875    }
3876
3877    // Is it a vector logical left shift?
3878    if (NumElems == 2 && Idx == 1 &&
3879        X86::isZeroNode(Op.getOperand(0)) &&
3880        !X86::isZeroNode(Op.getOperand(1))) {
3881      unsigned NumBits = VT.getSizeInBits();
3882      return getVShift(true, VT,
3883                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
3884                                   VT, Op.getOperand(1)),
3885                       NumBits/2, DAG, *this, dl);
3886    }
3887
3888    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
3889      return SDValue();
3890
3891    // Otherwise, if this is a vector with i32 or f32 elements, and the element
3892    // is a non-constant being inserted into an element other than the low one,
3893    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
3894    // movd/movss) to move this into the low element, then shuffle it into
3895    // place.
3896    if (EVTBits == 32) {
3897      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
3898
3899      // Turn it into a shuffle of zero and zero-extended scalar to vector.
3900      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
3901                                         Subtarget->hasSSE2(), DAG);
3902      SmallVector<int, 8> MaskVec;
3903      for (unsigned i = 0; i < NumElems; i++)
3904        MaskVec.push_back(i == Idx ? 0 : 1);
3905      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
3906    }
3907  }
3908
3909  // Splat is obviously ok. Let legalizer expand it to a shuffle.
3910  if (Values.size() == 1) {
3911    if (EVTBits == 32) {
3912      // Instead of a shuffle like this:
3913      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
3914      // Check if it's possible to issue this instead.
3915      // shuffle (vload ptr), undef, <1, 1, 1, 1>
3916      unsigned Idx = CountTrailingZeros_32(NonZeros);
3917      SDValue Item = Op.getOperand(Idx);
3918      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
3919        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
3920    }
3921    return SDValue();
3922  }
3923
3924  // A vector full of immediates; various special cases are already
3925  // handled, so this is best done with a single constant-pool load.
3926  if (IsAllConstants)
3927    return SDValue();
3928
3929  // Let legalizer expand 2-wide build_vectors.
3930  if (EVTBits == 64) {
3931    if (NumNonZero == 1) {
3932      // One half is zero or undef.
3933      unsigned Idx = CountTrailingZeros_32(NonZeros);
3934      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
3935                                 Op.getOperand(Idx));
3936      return getShuffleVectorZeroOrUndef(V2, Idx, true,
3937                                         Subtarget->hasSSE2(), DAG);
3938    }
3939    return SDValue();
3940  }
3941
3942  // If element VT is < 32 bits, convert it to inserts into a zero vector.
3943  if (EVTBits == 8 && NumElems == 16) {
3944    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
3945                                        *this);
3946    if (V.getNode()) return V;
3947  }
3948
3949  if (EVTBits == 16 && NumElems == 8) {
3950    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
3951                                        *this);
3952    if (V.getNode()) return V;
3953  }
3954
3955  // If the element VT is 32 bits, turn it into a number of shuffles.
3956  SmallVector<SDValue, 8> V;
3957  V.resize(NumElems);
3958  if (NumElems == 4 && NumZero > 0) {
3959    for (unsigned i = 0; i < 4; ++i) {
3960      bool isZero = !(NonZeros & (1 << i));
3961      if (isZero)
3962        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
3963      else
3964        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3965    }
3966
3967    for (unsigned i = 0; i < 2; ++i) {
3968      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
3969        default: break;
3970        case 0:
3971          V[i] = V[i*2];  // Must be a zero vector.
3972          break;
3973        case 1:
3974          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
3975          break;
3976        case 2:
3977          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
3978          break;
3979        case 3:
3980          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
3981          break;
3982      }
3983    }
3984
3985    SmallVector<int, 8> MaskVec;
3986    bool Reverse = (NonZeros & 0x3) == 2;
3987    for (unsigned i = 0; i < 2; ++i)
3988      MaskVec.push_back(Reverse ? 1-i : i);
3989    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
3990    for (unsigned i = 0; i < 2; ++i)
3991      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
3992    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
3993  }
3994
3995  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
3996    // Check for a build vector of consecutive loads.
3997    for (unsigned i = 0; i < NumElems; ++i)
3998      V[i] = Op.getOperand(i);
3999
4000    // Check for elements which are consecutive loads.
4001    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
4002    if (LD.getNode())
4003      return LD;
4004
4005    // For SSE 4.1, use inserts into undef.
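    // Each INSERT_VECTOR_ELT below is expected to select to a single
    // pinsrb/pinsrw/pinsrd or insertps, depending on the element type.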
4006    if (getSubtarget()->hasSSE41()) {
4007      V[0] = DAG.getUNDEF(VT);
4008      for (unsigned i = 0; i < NumElems; ++i)
4009        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
4010          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
4011                             Op.getOperand(i), DAG.getIntPtrConstant(i));
4012      return V[0];
4013    }
4014
4015    // Otherwise, expand into a number of unpckl*
4016    // e.g. for v4f32
4017    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
4018    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
4019    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
4020    for (unsigned i = 0; i < NumElems; ++i)
4021      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
4022    NumElems >>= 1;
4023    while (NumElems != 0) {
4024      for (unsigned i = 0; i < NumElems; ++i)
4025        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
4026      NumElems >>= 1;
4027    }
4028    return V[0];
4029  }
4030  return SDValue();
4031}
4032
4033SDValue
4034X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
4035  // We support concatenating two MMX registers and placing them in an MMX
4036  // register.  This is better than converting through the stack.
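  // E.g. (v2i64 (concat_vectors (v1i64 A), (v1i64 B))) is lowered below as a
  // movq2dq of A, followed either by inserting B's scalar element or by a
  // second movq2dq plus a <0, 2> shuffle to merge the two low quadwords.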
4037  DebugLoc dl = Op.getDebugLoc();
4038  EVT ResVT = Op.getValueType();
4039  assert(Op.getNumOperands() == 2);
4040  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
4041         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
4042  int Mask[2];
4043  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, Op.getOperand(0));
4044  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4045  InVec = Op.getOperand(1);
4046  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4047    unsigned NumElts = ResVT.getVectorNumElements();
4048    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4049    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
4050                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
4051  } else {
4052    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
4053    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4054    Mask[0] = 0; Mask[1] = 2;
4055    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
4056  }
4057  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4058}
4059
4060// v8i16 shuffles - Prefer shuffles in the following order:
4061// 1. [all]   pshuflw, pshufhw, optional move
4062// 2. [ssse3] 1 x pshufb
4063// 3. [ssse3] 2 x pshufb + 1 x por
4064// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
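// For instance, a mask like <0, 1, 2, 3, 7, 6, 5, 4> draws each half of the
// result from a single quadword of V1, so it should fall out as case 1 (a
// single pshufhw here); masks that mix many quadwords fall through to the
// pshufb or pextrw/pinsrw paths below.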
4065static
4066SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
4067                                 SelectionDAG &DAG,
4068                                 const X86TargetLowering &TLI) {
4069  SDValue V1 = SVOp->getOperand(0);
4070  SDValue V2 = SVOp->getOperand(1);
4071  DebugLoc dl = SVOp->getDebugLoc();
4072  SmallVector<int, 8> MaskVals;
4073
4074  // Determine if more than 1 of the words in each of the low and high quadwords
4075  // of the result come from the same quadword of one of the two inputs.  Undef
4076  // mask values count as coming from any quadword, for better codegen.
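  // For example, if all four low-half result words come from quadword 2
  // (V2's low quadword), LoQuad[2] ends up as 4 and BestLoQuad below becomes 2.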
4077  SmallVector<unsigned, 4> LoQuad(4);
4078  SmallVector<unsigned, 4> HiQuad(4);
4079  BitVector InputQuads(4);
4080  for (unsigned i = 0; i < 8; ++i) {
4081    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
4082    int EltIdx = SVOp->getMaskElt(i);
4083    MaskVals.push_back(EltIdx);
4084    if (EltIdx < 0) {
4085      ++Quad[0];
4086      ++Quad[1];
4087      ++Quad[2];
4088      ++Quad[3];
4089      continue;
4090    }
4091    ++Quad[EltIdx / 4];
4092    InputQuads.set(EltIdx / 4);
4093  }
4094
4095  int BestLoQuad = -1;
4096  unsigned MaxQuad = 1;
4097  for (unsigned i = 0; i < 4; ++i) {
4098    if (LoQuad[i] > MaxQuad) {
4099      BestLoQuad = i;
4100      MaxQuad = LoQuad[i];
4101    }
4102  }
4103
4104  int BestHiQuad = -1;
4105  MaxQuad = 1;
4106  for (unsigned i = 0; i < 4; ++i) {
4107    if (HiQuad[i] > MaxQuad) {
4108      BestHiQuad = i;
4109      MaxQuad = HiQuad[i];
4110    }
4111  }
4112
4113  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
4114  // of the two input vectors, shuffle them into one input vector so only a
4115  // single pshufb instruction is necessary. If there are more than 2 input
4116  // quads, disable the next transformation since it does not help SSSE3.
4117  bool V1Used = InputQuads[0] || InputQuads[1];
4118  bool V2Used = InputQuads[2] || InputQuads[3];
4119  if (TLI.getSubtarget()->hasSSSE3()) {
4120    if (InputQuads.count() == 2 && V1Used && V2Used) {
4121      BestLoQuad = InputQuads.find_first();
4122      BestHiQuad = InputQuads.find_next(BestLoQuad);
4123    }
4124    if (InputQuads.count() > 2) {
4125      BestLoQuad = -1;
4126      BestHiQuad = -1;
4127    }
4128  }
4129
4130  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
4131  // the shuffle mask.  If a quad is scored as -1, that means that it contains
4132  // words from all 4 input quadwords.
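  // E.g. BestLoQuad == 2 and BestHiQuad == 1 give the v2i64 mask <2, 1> below,
  // so V2's low quadword becomes the new low half and V1's high quadword
  // becomes the new high half of NewV.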
4133  SDValue NewV;
4134  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
4135    SmallVector<int, 8> MaskV;
4136    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
4137    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
4138    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
4139                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
4140                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
4141    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
4142
4143    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
4144    // source words for the shuffle, to aid later transformations.
4145    bool AllWordsInNewV = true;
4146    bool InOrder[2] = { true, true };
4147    for (unsigned i = 0; i != 8; ++i) {
4148      int idx = MaskVals[i];
4149      if (idx != (int)i)
4150        InOrder[i/4] = false;
4151      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
4152        continue;
4153      AllWordsInNewV = false;
4154      break;
4155    }
4156
4157    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
4158    if (AllWordsInNewV) {
4159      for (int i = 0; i != 8; ++i) {
4160        int idx = MaskVals[i];
4161        if (idx < 0)
4162          continue;
4163        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
4164        if ((idx != i) && idx < 4)
4165          pshufhw = false;
4166        if ((idx != i) && idx > 3)
4167          pshuflw = false;
4168      }
4169      V1 = NewV;
4170      V2Used = false;
4171      BestLoQuad = 0;
4172      BestHiQuad = 1;
4173    }
4174
4175    // If we've eliminated the use of V2, and the new mask is a pshuflw or
4176    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
4177    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
4178      return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
4179                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
4180    }
4181  }
4182
4183  // If we have SSSE3, and all words of the result are from 1 input vector,
4184  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
4185  // is present, fall back to case 4.
4186  if (TLI.getSubtarget()->hasSSSE3()) {
4187    SmallVector<SDValue,16> pshufbMask;
4188
4189    // If we have elements from both input vectors, set the high bit of the
4190    // shuffle mask element to zero out elements that come from V2 in the V1
4191    // mask, and elements that come from V1 in the V2 mask, so that the two
4192    // results can be OR'd together.
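    // (A pshufb control byte with bit 7 set, i.e. 0x80, writes a zero into the
    // corresponding result byte, which is what makes the OR merge safe.)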
4193    bool TwoInputs = V1Used && V2Used;
4194    for (unsigned i = 0; i != 8; ++i) {
4195      int EltIdx = MaskVals[i] * 2;
4196      if (TwoInputs && (EltIdx >= 16)) {
4197        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4198        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4199        continue;
4200      }
4201      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
4202      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
4203    }
4204    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
4205    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4206                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4207                                 MVT::v16i8, &pshufbMask[0], 16));
4208    if (!TwoInputs)
4209      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4210
4211    // Calculate the shuffle mask for the second input, shuffle it, and
4212    // OR it with the first shuffled input.
4213    pshufbMask.clear();
4214    for (unsigned i = 0; i != 8; ++i) {
4215      int EltIdx = MaskVals[i] * 2;
4216      if (EltIdx < 16) {
4217        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4218        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4219        continue;
4220      }
4221      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4222      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
4223    }
4224    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
4225    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4226                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4227                                 MVT::v16i8, &pshufbMask[0], 16));
4228    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4229    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4230  }
4231
4232  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
4233  // and update MaskVals with new element order.
4234  BitVector InOrder(8);
4235  if (BestLoQuad >= 0) {
4236    SmallVector<int, 8> MaskV;
4237    for (int i = 0; i != 4; ++i) {
4238      int idx = MaskVals[i];
4239      if (idx < 0) {
4240        MaskV.push_back(-1);
4241        InOrder.set(i);
4242      } else if ((idx / 4) == BestLoQuad) {
4243        MaskV.push_back(idx & 3);
4244        InOrder.set(i);
4245      } else {
4246        MaskV.push_back(-1);
4247      }
4248    }
4249    for (unsigned i = 4; i != 8; ++i)
4250      MaskV.push_back(i);
4251    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4252                                &MaskV[0]);
4253  }
4254
4255  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
4256  // and update MaskVals with the new element order.
4257  if (BestHiQuad >= 0) {
4258    SmallVector<int, 8> MaskV;
4259    for (unsigned i = 0; i != 4; ++i)
4260      MaskV.push_back(i);
4261    for (unsigned i = 4; i != 8; ++i) {
4262      int idx = MaskVals[i];
4263      if (idx < 0) {
4264        MaskV.push_back(-1);
4265        InOrder.set(i);
4266      } else if ((idx / 4) == BestHiQuad) {
4267        MaskV.push_back((idx & 3) + 4);
4268        InOrder.set(i);
4269      } else {
4270        MaskV.push_back(-1);
4271      }
4272    }
4273    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
4274                                &MaskV[0]);
4275  }
4276
4277  // In case BestHiQuad and BestLoQuad were both -1, which means each result
4278  // quadword has a word from each of the four input quadwords, calculate the
4279  // InOrder bitvector now before falling through to the insert/extract cleanup.
4280  if (BestLoQuad == -1 && BestHiQuad == -1) {
4281    NewV = V1;
4282    for (int i = 0; i != 8; ++i)
4283      if (MaskVals[i] < 0 || MaskVals[i] == i)
4284        InOrder.set(i);
4285  }
4286
4287  // The other elements are put in the right place using pextrw and pinsrw.
4288  for (unsigned i = 0; i != 8; ++i) {
4289    if (InOrder[i])
4290      continue;
4291    int EltIdx = MaskVals[i];
4292    if (EltIdx < 0)
4293      continue;
4294    SDValue ExtOp = (EltIdx < 8)
4295    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
4296                  DAG.getIntPtrConstant(EltIdx))
4297    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
4298                  DAG.getIntPtrConstant(EltIdx - 8));
4299    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
4300                       DAG.getIntPtrConstant(i));
4301  }
4302  return NewV;
4303}
4304
4305// v16i8 shuffles - Prefer shuffles in the following order:
4306// 1. [ssse3] 1 x pshufb
4307// 2. [ssse3] 2 x pshufb + 1 x por
4308// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
4309static
4310SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
4311                                 SelectionDAG &DAG,
4312                                 const X86TargetLowering &TLI) {
4313  SDValue V1 = SVOp->getOperand(0);
4314  SDValue V2 = SVOp->getOperand(1);
4315  DebugLoc dl = SVOp->getDebugLoc();
4316  SmallVector<int, 16> MaskVals;
4317  SVOp->getMask(MaskVals);
4318
4319  // If we have SSSE3, case 1 is generated when all result bytes come from
4320  // one of the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
4321  // present, fall back to case 3.
4322  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
4323  bool V1Only = true;
4324  bool V2Only = true;
4325  for (unsigned i = 0; i < 16; ++i) {
4326    int EltIdx = MaskVals[i];
4327    if (EltIdx < 0)
4328      continue;
4329    if (EltIdx < 16)
4330      V2Only = false;
4331    else
4332      V1Only = false;
4333  }
4334
4335  // With SSSE3, use one pshufb per input vector that has elements in the result.
4336  if (TLI.getSubtarget()->hasSSSE3()) {
4337    SmallVector<SDValue,16> pshufbMask;
4338
4339    // If all result elements are from one input vector, then only translate
4340    // undef mask values to 0x80 (zero out result) in the pshufb mask.
4341    //
4342    // Otherwise, we have elements from both input vectors, and must zero out
4343    // elements that come from V2 in the first mask, and V1 in the second mask
4344    // so that we can OR them together.
4345    bool TwoInputs = !(V1Only || V2Only);
4346    for (unsigned i = 0; i != 16; ++i) {
4347      int EltIdx = MaskVals[i];
4348      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
4349        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4350        continue;
4351      }
4352      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
4353    }
4354    // If all the elements are from V2, assign it to V1 and return after
4355    // building the first pshufb.
4356    if (V2Only)
4357      V1 = V2;
4358    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4359                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4360                                 MVT::v16i8, &pshufbMask[0], 16));
4361    if (!TwoInputs)
4362      return V1;
4363
4364    // Calculate the shuffle mask for the second input, shuffle it, and
4365    // OR it with the first shuffled input.
4366    pshufbMask.clear();
4367    for (unsigned i = 0; i != 16; ++i) {
4368      int EltIdx = MaskVals[i];
4369      if (EltIdx < 16) {
4370        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4371        continue;
4372      }
4373      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4374    }
4375    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4376                     DAG.getNode(ISD::BUILD_VECTOR, dl,
4377                                 MVT::v16i8, &pshufbMask[0], 16));
4378    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4379  }
4380
4381  // No SSSE3 - Calculate the in-place words and then fix all out-of-place words
4382  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
4383  // the 16 different words that comprise the two doublequadword input vectors.
4384  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4385  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
4386  SDValue NewV = V2Only ? V2 : V1;
4387  for (int i = 0; i != 8; ++i) {
4388    int Elt0 = MaskVals[i*2];
4389    int Elt1 = MaskVals[i*2+1];
4390
4391    // This word of the result is all undef, skip it.
4392    if (Elt0 < 0 && Elt1 < 0)
4393      continue;
4394
4395    // This word of the result is already in the correct place, skip it.
4396    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
4397      continue;
4398    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
4399      continue;
4400
4401    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
4402    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
4403    SDValue InsElt;
4404
4405    // If Elt0 and Elt1 are defined, are consecutive, and can be fetched
4406    // together with a single word extract, extract the word and insert it.
4407    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
4408      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4409                           DAG.getIntPtrConstant(Elt1 / 2));
4410      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4411                        DAG.getIntPtrConstant(i));
4412      continue;
4413    }
4414
4415    // If Elt1 is defined, extract it from the appropriate source.  If the
4416    // source byte is not also odd, shift the extracted word left 8 bits;
4417    // otherwise, clear the bottom 8 bits if we need to do an or.
4418    if (Elt1 >= 0) {
4419      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4420                           DAG.getIntPtrConstant(Elt1 / 2));
4421      if ((Elt1 & 1) == 0)
4422        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
4423                             DAG.getConstant(8, TLI.getShiftAmountTy()));
4424      else if (Elt0 >= 0)
4425        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
4426                             DAG.getConstant(0xFF00, MVT::i16));
4427    }
4428    // If Elt0 is defined, extract it from the appropriate source.  If the
4429    // source byte is not also even, shift the extracted word right 8 bits. If
4430    // Elt1 was also defined, OR the extracted values together before
4431    // inserting them in the result.
4432    if (Elt0 >= 0) {
4433      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
4434                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
4435      if ((Elt0 & 1) != 0)
4436        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
4437                              DAG.getConstant(8, TLI.getShiftAmountTy()));
4438      else if (Elt1 >= 0)
4439        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
4440                             DAG.getConstant(0x00FF, MVT::i16));
4441      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
4442                         : InsElt0;
4443    }
4444    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4445                       DAG.getIntPtrConstant(i));
4446  }
4447  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
4448}
4449
4450/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
4451/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
4452/// done when every pair / quad of shuffle mask elements points to elements in
4453/// the right sequence, e.g.
4454/// vector_shuffle <>, <>, < 2, 3, | 10, 11, | 0, 1, | 14, 15>
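/// Each in-order pair then collapses into one wider element, so the example
/// above becomes the single v4i32 shuffle < 1, 5, 0, 7 >.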
4455static
4456SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
4457                                 SelectionDAG &DAG,
4458                                 const TargetLowering &TLI, DebugLoc dl) {
4459  EVT VT = SVOp->getValueType(0);
4460  SDValue V1 = SVOp->getOperand(0);
4461  SDValue V2 = SVOp->getOperand(1);
4462  unsigned NumElems = VT.getVectorNumElements();
4463  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
4464  EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
4465  EVT MaskEltVT = MaskVT.getVectorElementType();
4466  EVT NewVT = MaskVT;
4467  switch (VT.getSimpleVT().SimpleTy) {
4468  default: assert(false && "Unexpected!");
4469  case MVT::v4f32: NewVT = MVT::v2f64; break;
4470  case MVT::v4i32: NewVT = MVT::v2i64; break;
4471  case MVT::v8i16: NewVT = MVT::v4i32; break;
4472  case MVT::v16i8: NewVT = MVT::v4i32; break;
4473  }
4474
4475  if (NewWidth == 2) {
4476    if (VT.isInteger())
4477      NewVT = MVT::v2i64;
4478    else
4479      NewVT = MVT::v2f64;
4480  }
4481  int Scale = NumElems / NewWidth;
4482  SmallVector<int, 8> MaskVec;
4483  for (unsigned i = 0; i < NumElems; i += Scale) {
4484    int StartIdx = -1;
4485    for (int j = 0; j < Scale; ++j) {
4486      int EltIdx = SVOp->getMaskElt(i+j);
4487      if (EltIdx < 0)
4488        continue;
4489      if (StartIdx == -1)
4490        StartIdx = EltIdx - (EltIdx % Scale);
4491      if (EltIdx != StartIdx + j)
4492        return SDValue();
4493    }
4494    if (StartIdx == -1)
4495      MaskVec.push_back(-1);
4496    else
4497      MaskVec.push_back(StartIdx / Scale);
4498  }
4499
4500  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
4501  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
4502  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
4503}
4504
4505/// getVZextMovL - Return a zero-extending vector move low node.
4506///
4507static SDValue getVZextMovL(EVT VT, EVT OpVT,
4508                            SDValue SrcOp, SelectionDAG &DAG,
4509                            const X86Subtarget *Subtarget, DebugLoc dl) {
4510  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
4511    LoadSDNode *LD = NULL;
4512    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
4513      LD = dyn_cast<LoadSDNode>(SrcOp);
4514    if (!LD) {
4515      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
4516      // instead.
4517      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
4518      if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
4519          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
4520          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
4521          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
4522        // PR2108
4523        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
4524        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4525                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4526                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4527                                                   OpVT,
4528                                                   SrcOp.getOperand(0)
4529                                                          .getOperand(0))));
4530      }
4531    }
4532  }
4533
4534  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4535                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4536                                 DAG.getNode(ISD::BIT_CONVERT, dl,
4537                                             OpVT, SrcOp)));
4538}
4539
4540/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
4541/// shuffles.
4542static SDValue
4543LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4544  SDValue V1 = SVOp->getOperand(0);
4545  SDValue V2 = SVOp->getOperand(1);
4546  DebugLoc dl = SVOp->getDebugLoc();
4547  EVT VT = SVOp->getValueType(0);
4548
4549  SmallVector<std::pair<int, int>, 8> Locs;
4550  Locs.resize(4);
4551  SmallVector<int, 8> Mask1(4U, -1);
4552  SmallVector<int, 8> PermMask;
4553  SVOp->getMask(PermMask);
4554
4555  unsigned NumHi = 0;
4556  unsigned NumLo = 0;
4557  for (unsigned i = 0; i != 4; ++i) {
4558    int Idx = PermMask[i];
4559    if (Idx < 0) {
4560      Locs[i] = std::make_pair(-1, -1);
4561    } else {
4562      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4563      if (Idx < 4) {
4564        Locs[i] = std::make_pair(0, NumLo);
4565        Mask1[NumLo] = Idx;
4566        NumLo++;
4567      } else {
4568        Locs[i] = std::make_pair(1, NumHi);
4569        if (2+NumHi < 4)
4570          Mask1[2+NumHi] = Idx;
4571        NumHi++;
4572      }
4573    }
4574  }
4575
4576  if (NumLo <= 2 && NumHi <= 2) {
4577    // No more than two elements come from either vector. This can be
4578    // implemented with two shuffles. The first shuffle gathers the elements.
4579    // The second shuffle, which takes the first shuffle as both of its
4580    // vector operands, puts the elements into the right order.
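    // For instance (an illustrative mask), <0, 4, 1, 5>: Mask1 = <0, 1, 4, 5>
    // gathers <A0, A1, B0, B1>, and Mask2 = <0, 2, 5, 7> on that intermediate
    // (used as both operands) yields the requested <A0, B0, A1, B1>.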
4581    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4582
4583    SmallVector<int, 8> Mask2(4U, -1);
4584
4585    for (unsigned i = 0; i != 4; ++i) {
4586      if (Locs[i].first == -1)
4587        continue;
4588      else {
4589        unsigned Idx = (i < 2) ? 0 : 4;
4590        Idx += Locs[i].first * 2 + Locs[i].second;
4591        Mask2[i] = Idx;
4592      }
4593    }
4594
4595    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4596  } else if (NumLo == 3 || NumHi == 3) {
4597    // Otherwise, we must have three elements from one vector, call it X, and
4598    // one element from the other, call it Y.  First, use a shufps to build an
4599    // intermediate vector with the one element from Y and the element from X
4600    // that will be in the same half in the final destination (the indexes don't
4601    // matter). Then, use a shufps to build the final vector, taking the half
4602    // containing the element from Y from the intermediate, and the other half
4603    // from X.
4604    if (NumHi == 3) {
4605      // Normalize it so the 3 elements come from V1.
4606      CommuteVectorShuffleMask(PermMask, VT);
4607      std::swap(V1, V2);
4608    }
4609
4610    // Find the element from V2.
4611    unsigned HiIndex;
4612    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4613      int Val = PermMask[HiIndex];
4614      if (Val < 0)
4615        continue;
4616      if (Val >= 4)
4617        break;
4618    }
4619
4620    Mask1[0] = PermMask[HiIndex];
4621    Mask1[1] = -1;
4622    Mask1[2] = PermMask[HiIndex^1];
4623    Mask1[3] = -1;
4624    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4625
4626    if (HiIndex >= 2) {
4627      Mask1[0] = PermMask[0];
4628      Mask1[1] = PermMask[1];
4629      Mask1[2] = HiIndex & 1 ? 6 : 4;
4630      Mask1[3] = HiIndex & 1 ? 4 : 6;
4631      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4632    } else {
4633      Mask1[0] = HiIndex & 1 ? 2 : 0;
4634      Mask1[1] = HiIndex & 1 ? 0 : 2;
4635      Mask1[2] = PermMask[2];
4636      Mask1[3] = PermMask[3];
4637      if (Mask1[2] >= 0)
4638        Mask1[2] += 4;
4639      if (Mask1[3] >= 0)
4640        Mask1[3] += 4;
4641      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4642    }
4643  }
4644
4645  // Break it into (shuffle shuffle_hi, shuffle_lo).
4646  Locs.clear();
4647  SmallVector<int,8> LoMask(4U, -1);
4648  SmallVector<int,8> HiMask(4U, -1);
4649
4650  SmallVector<int,8> *MaskPtr = &LoMask;
4651  unsigned MaskIdx = 0;
4652  unsigned LoIdx = 0;
4653  unsigned HiIdx = 2;
4654  for (unsigned i = 0; i != 4; ++i) {
4655    if (i == 2) {
4656      MaskPtr = &HiMask;
4657      MaskIdx = 1;
4658      LoIdx = 0;
4659      HiIdx = 2;
4660    }
4661    int Idx = PermMask[i];
4662    if (Idx < 0) {
4663      Locs[i] = std::make_pair(-1, -1);
4664    } else if (Idx < 4) {
4665      Locs[i] = std::make_pair(MaskIdx, LoIdx);
4666      (*MaskPtr)[LoIdx] = Idx;
4667      LoIdx++;
4668    } else {
4669      Locs[i] = std::make_pair(MaskIdx, HiIdx);
4670      (*MaskPtr)[HiIdx] = Idx;
4671      HiIdx++;
4672    }
4673  }
4674
4675  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
4676  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
4677  SmallVector<int, 8> MaskOps;
4678  for (unsigned i = 0; i != 4; ++i) {
4679    if (Locs[i].first == -1) {
4680      MaskOps.push_back(-1);
4681    } else {
4682      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
4683      MaskOps.push_back(Idx);
4684    }
4685  }
4686  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
4687}
4688
4689SDValue
4690X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
4691  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4692  SDValue V1 = Op.getOperand(0);
4693  SDValue V2 = Op.getOperand(1);
4694  EVT VT = Op.getValueType();
4695  DebugLoc dl = Op.getDebugLoc();
4696  unsigned NumElems = VT.getVectorNumElements();
4697  bool isMMX = VT.getSizeInBits() == 64;
4698  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
4699  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
4700  bool V1IsSplat = false;
4701  bool V2IsSplat = false;
4702
4703  if (isZeroShuffle(SVOp))
4704    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4705
4706  // Promote splats to v4f32.
4707  if (SVOp->isSplat()) {
4708    if (isMMX || NumElems < 4)
4709      return Op;
4710    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
4711  }
4712
4713  // If the shuffle can be profitably rewritten as a narrower shuffle, then
4714  // do it!
4715  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
4716    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4717    if (NewOp.getNode())
4718      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4719                         LowerVECTOR_SHUFFLE(NewOp, DAG));
4720  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
4721    // FIXME: Figure out a cleaner way to do this.
4722    // Try to make use of movq to zero out the top part.
4723    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
4724      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4725      if (NewOp.getNode()) {
4726        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
4727          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
4728                              DAG, Subtarget, dl);
4729      }
4730    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
4731      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4732      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
4733        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
4734                            DAG, Subtarget, dl);
4735    }
4736  }
4737
4738  if (X86::isPSHUFDMask(SVOp))
4739    return Op;
4740
4741  // Check if this can be converted into a logical shift.
4742  bool isLeft = false;
4743  unsigned ShAmt = 0;
4744  SDValue ShVal;
4745  bool isShift = getSubtarget()->hasSSE2() &&
4746    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
4747  if (isShift && ShVal.hasOneUse()) {
4748    // If the shifted value has multiple uses, it may be cheaper to use
4749    // v_set0 + movlhps or movhlps, etc.
4750    EVT EltVT = VT.getVectorElementType();
4751    ShAmt *= EltVT.getSizeInBits();
4752    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4753  }
4754
4755  if (X86::isMOVLMask(SVOp)) {
4756    if (V1IsUndef)
4757      return V2;
4758    if (ISD::isBuildVectorAllZeros(V1.getNode()))
4759      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
4760    if (!isMMX)
4761      return Op;
4762  }
4763
4764  // FIXME: fold these into legal mask.
4765  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
4766                 X86::isMOVSLDUPMask(SVOp) ||
4767                 X86::isMOVHLPSMask(SVOp) ||
4768                 X86::isMOVLHPSMask(SVOp) ||
4769                 X86::isMOVLPMask(SVOp)))
4770    return Op;
4771
4772  if (ShouldXformToMOVHLPS(SVOp) ||
4773      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
4774    return CommuteVectorShuffle(SVOp, DAG);
4775
4776  if (isShift) {
4777    // No better options. Use a vshl / vsrl.
4778    EVT EltVT = VT.getVectorElementType();
4779    ShAmt *= EltVT.getSizeInBits();
4780    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4781  }
4782
4783  bool Commuted = false;
4784  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
4785  // 1,1,1,1 -> v8i16 though.
4786  V1IsSplat = isSplatVector(V1.getNode());
4787  V2IsSplat = isSplatVector(V2.getNode());
4788
4789  // Canonicalize the splat or undef, if present, to be on the RHS.
4790  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4791    Op = CommuteVectorShuffle(SVOp, DAG);
4792    SVOp = cast<ShuffleVectorSDNode>(Op);
4793    V1 = SVOp->getOperand(0);
4794    V2 = SVOp->getOperand(1);
4795    std::swap(V1IsSplat, V2IsSplat);
4796    std::swap(V1IsUndef, V2IsUndef);
4797    Commuted = true;
4798  }
4799
4800  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4801    // Shuffling low element of v1 into undef, just return v1.
4802    if (V2IsUndef)
4803      return V1;
4804    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4805    // the instruction selector will not match, so get a canonical MOVL with
4806    // swapped operands to undo the commute.
4807    return getMOVL(DAG, dl, VT, V2, V1);
4808  }
4809
4810  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4811      X86::isUNPCKH_v_undef_Mask(SVOp) ||
4812      X86::isUNPCKLMask(SVOp) ||
4813      X86::isUNPCKHMask(SVOp))
4814    return Op;
4815
4816  if (V2IsSplat) {
4817    // Normalize the mask so all entries that point to V2 point to its first
4818    // element, then try to match unpck{h|l} again. If it matches, return a
4819    // new vector_shuffle with the corrected mask.
4820    SDValue NewMask = NormalizeMask(SVOp, DAG);
4821    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4822    if (NSVOp != SVOp) {
4823      if (X86::isUNPCKLMask(NSVOp, true)) {
4824        return NewMask;
4825      } else if (X86::isUNPCKHMask(NSVOp, true)) {
4826        return NewMask;
4827      }
4828    }
4829  }
4830
4831  if (Commuted) {
4832    // Commute it back and try unpck* again.
4833    // FIXME: this seems wrong.
4834    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
4835    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
4836    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
4837        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
4838        X86::isUNPCKLMask(NewSVOp) ||
4839        X86::isUNPCKHMask(NewSVOp))
4840      return NewOp;
4841  }
4842
4843  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4844
4845  // Normalize the node to match x86 shuffle ops if needed
4846  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
4847    return CommuteVectorShuffle(SVOp, DAG);
4848
4849  // If the shuffle mask is legal for this target as-is, just return the op.
4850  SmallVector<int, 16> PermMask;
4851  SVOp->getMask(PermMask);
4852  if (isShuffleMaskLegal(PermMask, VT))
4853    return Op;
4854
4855  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
4856  if (VT == MVT::v8i16) {
4857    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
4858    if (NewOp.getNode())
4859      return NewOp;
4860  }
4861
4862  if (VT == MVT::v16i8) {
4863    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
4864    if (NewOp.getNode())
4865      return NewOp;
4866  }
4867
4868  // Handle all 4 wide cases with a number of shuffles except for MMX.
4869  if (NumElems == 4 && !isMMX)
4870    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
4871
4872  return SDValue();
4873}
4874
4875SDValue
4876X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4877                                                SelectionDAG &DAG) const {
4878  EVT VT = Op.getValueType();
4879  DebugLoc dl = Op.getDebugLoc();
4880  if (VT.getSizeInBits() == 8) {
4881    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
4882                                    Op.getOperand(0), Op.getOperand(1));
4883    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4884                                    DAG.getValueType(VT));
4885    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4886  } else if (VT.getSizeInBits() == 16) {
4887    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4888    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
4889    if (Idx == 0)
4890      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4891                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4892                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4893                                                 MVT::v4i32,
4894                                                 Op.getOperand(0)),
4895                                     Op.getOperand(1)));
4896    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
4897                                    Op.getOperand(0), Op.getOperand(1));
4898    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4899                                    DAG.getValueType(VT));
4900    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4901  } else if (VT == MVT::f32) {
4902    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
4903    // the result back to FR32 register. It's only worth matching if the
4904    // result has a single use which is a store or a bitcast to i32.  And in
4905    // the case of a store, it's not worth it if the index is a constant 0,
4906    // because a MOVSSmr can be used instead, which is smaller and faster.
4907    if (!Op.hasOneUse())
4908      return SDValue();
4909    SDNode *User = *Op.getNode()->use_begin();
4910    if ((User->getOpcode() != ISD::STORE ||
4911         (isa<ConstantSDNode>(Op.getOperand(1)) &&
4912          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
4913        (User->getOpcode() != ISD::BIT_CONVERT ||
4914         User->getValueType(0) != MVT::i32))
4915      return SDValue();
4916    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4917                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
4918                                              Op.getOperand(0)),
4919                                              Op.getOperand(1));
4920    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
4921  } else if (VT == MVT::i32) {
4922    // ExtractPS works with constant index.
4923    if (isa<ConstantSDNode>(Op.getOperand(1)))
4924      return Op;
4925  }
4926  return SDValue();
4927}
4928
4929
4930SDValue
4931X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
4932                                           SelectionDAG &DAG) const {
4933  if (!isa<ConstantSDNode>(Op.getOperand(1)))
4934    return SDValue();
4935
4936  if (Subtarget->hasSSE41()) {
4937    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4938    if (Res.getNode())
4939      return Res;
4940  }
4941
4942  EVT VT = Op.getValueType();
4943  DebugLoc dl = Op.getDebugLoc();
4944  // TODO: handle v16i8.
4945  if (VT.getSizeInBits() == 16) {
4946    SDValue Vec = Op.getOperand(0);
4947    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4948    if (Idx == 0)
4949      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4950                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4951                                     DAG.getNode(ISD::BIT_CONVERT, dl,
4952                                                 MVT::v4i32, Vec),
4953                                     Op.getOperand(1)));
4954    // Transform it so it matches pextrw, which produces a 32-bit result.
4955    EVT EltVT = MVT::i32;
4956    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
4957                                    Op.getOperand(0), Op.getOperand(1));
4958    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
4959                                    DAG.getValueType(VT));
4960    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4961  } else if (VT.getSizeInBits() == 32) {
4962    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4963    if (Idx == 0)
4964      return Op;
4965
4966    // SHUFPS the element to the lowest double word, then movss.
4967    int Mask[4] = { Idx, -1, -1, -1 };
4968    EVT VVT = Op.getOperand(0).getValueType();
4969    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4970                                       DAG.getUNDEF(VVT), Mask);
4971    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4972                       DAG.getIntPtrConstant(0));
4973  } else if (VT.getSizeInBits() == 64) {
4974    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4975    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4976    //        to match extract_elt for f64.
4977    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4978    if (Idx == 0)
4979      return Op;
4980
4981    // UNPCKHPD the element to the lowest double word, then movsd.
4982    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
4983    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
4984    int Mask[2] = { 1, -1 };
4985    EVT VVT = Op.getOperand(0).getValueType();
4986    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4987                                       DAG.getUNDEF(VVT), Mask);
4988    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4989                       DAG.getIntPtrConstant(0));
4990  }
4991
4992  return SDValue();
4993}
4994
4995SDValue
4996X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
4997                                               SelectionDAG &DAG) const {
4998  EVT VT = Op.getValueType();
4999  EVT EltVT = VT.getVectorElementType();
5000  DebugLoc dl = Op.getDebugLoc();
5001
5002  SDValue N0 = Op.getOperand(0);
5003  SDValue N1 = Op.getOperand(1);
5004  SDValue N2 = Op.getOperand(2);
5005
5006  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
5007      isa<ConstantSDNode>(N2)) {
5008    unsigned Opc;
5009    if (VT == MVT::v8i16)
5010      Opc = X86ISD::PINSRW;
5011    else if (VT == MVT::v4i16)
5012      Opc = X86ISD::MMX_PINSRW;
5013    else if (VT == MVT::v16i8)
5014      Opc = X86ISD::PINSRB;
5015    else
5016      Opc = X86ISD::PINSRB;
5017
5018    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
5019    // argument.
5020    if (N1.getValueType() != MVT::i32)
5021      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5022    if (N2.getValueType() != MVT::i32)
5023      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5024    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
5025  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
5026    // Bits [7:6] of the constant are the source select.  This will always be
5027    //  zero here.  The DAG Combiner may combine an extract_elt index into these
5028    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
5029    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
5030    // Bits [5:4] of the constant are the destination select.  This is the
5031    //  value of the incoming immediate.
5032    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
5033    //   combine either bitwise AND or insert of float 0.0 to set these bits.
5034    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
5035    // Create this as a scalar-to-vector.
5036    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
5037    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
5038  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
5039    // PINSR* works with constant index.
5040    return Op;
5041  }
5042  return SDValue();
5043}
5044
5045SDValue
5046X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
5047  EVT VT = Op.getValueType();
5048  EVT EltVT = VT.getVectorElementType();
5049
5050  if (Subtarget->hasSSE41())
5051    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
5052
5053  if (EltVT == MVT::i8)
5054    return SDValue();
5055
5056  DebugLoc dl = Op.getDebugLoc();
5057  SDValue N0 = Op.getOperand(0);
5058  SDValue N1 = Op.getOperand(1);
5059  SDValue N2 = Op.getOperand(2);
5060
5061  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
5062    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
5063    // as its second argument.
5064    if (N1.getValueType() != MVT::i32)
5065      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5066    if (N2.getValueType() != MVT::i32)
5067      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5068    return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
5069                       dl, VT, N0, N1, N2);
5070  }
5071  return SDValue();
5072}
5073
5074SDValue
5075X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5076  DebugLoc dl = Op.getDebugLoc();
5077  if (Op.getValueType() == MVT::v2f32)
5078    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
5079                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
5080                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
5081                                               Op.getOperand(0))));
5082
5083  if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
5084    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
5085
5086  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
5087  EVT VT = MVT::v2i32;
5088  switch (Op.getValueType().getSimpleVT().SimpleTy) {
5089  default: break;
5090  case MVT::v16i8:
5091  case MVT::v8i16:
5092    VT = MVT::v4i32;
5093    break;
5094  }
5095  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
5096                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
5097}
5098
5099// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
5100// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
5101// one of the above-mentioned nodes. It has to be wrapped because otherwise
5102// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
5103// be used to form an addressing mode. These wrapped nodes will be selected
5104// into MOV32ri.
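// For example, in GOT-style PIC a constant-pool reference is lowered below to
// (add GlobalBaseReg, (X86ISD::Wrapper (TargetConstantPool ... @GOTOFF))),
// which the selector can then fold into an addressing mode.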
5105SDValue
5106X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
5107  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
5108
5109  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5110  // global base reg.
5111  unsigned char OpFlag = 0;
5112  unsigned WrapperKind = X86ISD::Wrapper;
5113  CodeModel::Model M = getTargetMachine().getCodeModel();
5114
5115  if (Subtarget->isPICStyleRIPRel() &&
5116      (M == CodeModel::Small || M == CodeModel::Kernel))
5117    WrapperKind = X86ISD::WrapperRIP;
5118  else if (Subtarget->isPICStyleGOT())
5119    OpFlag = X86II::MO_GOTOFF;
5120  else if (Subtarget->isPICStyleStubPIC())
5121    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5122
5123  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
5124                                             CP->getAlignment(),
5125                                             CP->getOffset(), OpFlag);
5126  DebugLoc DL = CP->getDebugLoc();
5127  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5128  // With PIC, the address is actually $g + Offset.
5129  if (OpFlag) {
5130    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5131                         DAG.getNode(X86ISD::GlobalBaseReg,
5132                                     DebugLoc(), getPointerTy()),
5133                         Result);
5134  }
5135
5136  return Result;
5137}
5138
5139SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
5140  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
5141
5142  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5143  // global base reg.
5144  unsigned char OpFlag = 0;
5145  unsigned WrapperKind = X86ISD::Wrapper;
5146  CodeModel::Model M = getTargetMachine().getCodeModel();
5147
5148  if (Subtarget->isPICStyleRIPRel() &&
5149      (M == CodeModel::Small || M == CodeModel::Kernel))
5150    WrapperKind = X86ISD::WrapperRIP;
5151  else if (Subtarget->isPICStyleGOT())
5152    OpFlag = X86II::MO_GOTOFF;
5153  else if (Subtarget->isPICStyleStubPIC())
5154    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5155
5156  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
5157                                          OpFlag);
5158  DebugLoc DL = JT->getDebugLoc();
5159  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5160
5161  // With PIC, the address is actually $g + Offset.
5162  if (OpFlag) {
5163    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5164                         DAG.getNode(X86ISD::GlobalBaseReg,
5165                                     DebugLoc(), getPointerTy()),
5166                         Result);
5167  }
5168
5169  return Result;
5170}
5171
5172SDValue
5173X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
5174  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
5175
5176  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5177  // global base reg.
5178  unsigned char OpFlag = 0;
5179  unsigned WrapperKind = X86ISD::Wrapper;
5180  CodeModel::Model M = getTargetMachine().getCodeModel();
5181
5182  if (Subtarget->isPICStyleRIPRel() &&
5183      (M == CodeModel::Small || M == CodeModel::Kernel))
5184    WrapperKind = X86ISD::WrapperRIP;
5185  else if (Subtarget->isPICStyleGOT())
5186    OpFlag = X86II::MO_GOTOFF;
5187  else if (Subtarget->isPICStyleStubPIC())
5188    OpFlag = X86II::MO_PIC_BASE_OFFSET;
5189
5190  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
5191
5192  DebugLoc DL = Op.getDebugLoc();
5193  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5194
5195
5196  // With PIC, the address is actually $g + Offset.
5197  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
5198      !Subtarget->is64Bit()) {
5199    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5200                         DAG.getNode(X86ISD::GlobalBaseReg,
5201                                     DebugLoc(), getPointerTy()),
5202                         Result);
5203  }
5204
5205  return Result;
5206}
5207
5208SDValue
5209X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
5210  // Create the TargetBlockAddress node.
5211  unsigned char OpFlags =
5212    Subtarget->ClassifyBlockAddressReference();
5213  CodeModel::Model M = getTargetMachine().getCodeModel();
5214  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
5215  DebugLoc dl = Op.getDebugLoc();
5216  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
5217                                       /*isTarget=*/true, OpFlags);
5218
5219  if (Subtarget->isPICStyleRIPRel() &&
5220      (M == CodeModel::Small || M == CodeModel::Kernel))
5221    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5222  else
5223    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5224
5225  // With PIC, the address is actually $g + Offset.
5226  if (isGlobalRelativeToPICBase(OpFlags)) {
5227    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5228                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5229                         Result);
5230  }
5231
5232  return Result;
5233}
5234
5235SDValue
5236X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
5237                                      int64_t Offset,
5238                                      SelectionDAG &DAG) const {
5239  // Create the TargetGlobalAddress node, folding in the constant
5240  // offset if it is legal.
5241  unsigned char OpFlags =
5242    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
5243  CodeModel::Model M = getTargetMachine().getCodeModel();
5244  SDValue Result;
5245  if (OpFlags == X86II::MO_NO_FLAG &&
5246      X86::isOffsetSuitableForCodeModel(Offset, M)) {
5247    // A direct static reference to a global.
5248    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
5249    Offset = 0;
5250  } else {
5251    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
5252  }
5253
5254  if (Subtarget->isPICStyleRIPRel() &&
5255      (M == CodeModel::Small || M == CodeModel::Kernel))
5256    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5257  else
5258    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5259
5260  // With PIC, the address is actually $g + Offset.
5261  if (isGlobalRelativeToPICBase(OpFlags)) {
5262    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5263                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5264                         Result);
5265  }
5266
5267  // For globals that require a load from a stub to get the address, emit the
5268  // load.
5269  if (isGlobalStubReference(OpFlags))
5270    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
5271                         PseudoSourceValue::getGOT(), 0, false, false, 0);
5272
5273  // If there was a non-zero offset that we didn't fold, create an explicit
5274  // addition for it.
5275  if (Offset != 0)
5276    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
5277                         DAG.getConstant(Offset, getPointerTy()));
5278
5279  return Result;
5280}
5281
5282SDValue
5283X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
5284  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5285  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
5286  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
5287}
5288
5289static SDValue
5290GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
5291           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
5292           unsigned char OperandFlags) {
5293  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5294  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5295  DebugLoc dl = GA->getDebugLoc();
5296  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
5297                                           GA->getValueType(0),
5298                                           GA->getOffset(),
5299                                           OperandFlags);
5300  if (InFlag) {
5301    SDValue Ops[] = { Chain,  TGA, *InFlag };
5302    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
5303  } else {
5304    SDValue Ops[] = { Chain, TGA };
5305    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
5306  }
5307
5308  // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
5309  MFI->setAdjustsStack(true);
5310
5311  SDValue Flag = Chain.getValue(1);
5312  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
5313}
5314
5315// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
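// As a rough illustration (not necessarily the exact code emitted), the
// 32-bit ELF general-dynamic access sequence looks something like:
//   leal  x@tlsgd(,%ebx,1), %eax
//   call  ___tls_get_addr@PLT
// with the GOT base in %ebx and the variable's address returned in %eax,
// which matches the EBX/EAX register choices below.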
5316static SDValue
5317LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5318                                const EVT PtrVT) {
5319  SDValue InFlag;
5320  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
5321  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
5322                                     DAG.getNode(X86ISD::GlobalBaseReg,
5323                                                 DebugLoc(), PtrVT), InFlag);
5324  InFlag = Chain.getValue(1);
5325
5326  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
5327}
5328
5329// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
5330static SDValue
5331LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5332                                const EVT PtrVT) {
5333  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
5334                    X86::RAX, X86II::MO_TLSGD);
5335}
5336
5337// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
5338// "local exec" model.
5339static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5340                                   const EVT PtrVT, TLSModel::Model model,
5341                                   bool is64Bit) {
5342  DebugLoc dl = GA->getDebugLoc();
5343  // Get the Thread Pointer
5344  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
5345                             DebugLoc(), PtrVT,
5346                             DAG.getRegister(is64Bit? X86::FS : X86::GS,
5347                                             MVT::i32));
5348
5349  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
5350                                      NULL, 0, false, false, 0);
5351
5352  unsigned char OperandFlags = 0;
5353  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
5354  // initial exec.
5355  unsigned WrapperKind = X86ISD::Wrapper;
5356  if (model == TLSModel::LocalExec) {
5357    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
5358  } else if (is64Bit) {
5359    assert(model == TLSModel::InitialExec);
5360    OperandFlags = X86II::MO_GOTTPOFF;
5361    WrapperKind = X86ISD::WrapperRIP;
5362  } else {
5363    assert(model == TLSModel::InitialExec);
5364    OperandFlags = X86II::MO_INDNTPOFF;
5365  }
5366
5367  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
5368  // exec)
5369  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
5370                                           GA->getOffset(), OperandFlags);
5371  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
5372
5373  if (model == TLSModel::InitialExec)
5374    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
5375                         PseudoSourceValue::getGOT(), 0, false, false, 0);
5376
5377  // The address of the thread local variable is the add of the thread
5378  // pointer with the offset of the variable.
5379  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
5380}
5381
5382SDValue
5383X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
5384
5385  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5386  const GlobalValue *GV = GA->getGlobal();
5387
5388  if (Subtarget->isTargetELF()) {
5389    // TODO: implement the "local dynamic" model
5390    // TODO: implement the "initial exec"model for pic executables
5391
5392    // If GV is an alias then use the aliasee for determining
5393    // thread-localness.
5394    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
5395      GV = GA->resolveAliasedGlobal(false);
5396
5397    TLSModel::Model model
5398      = getTLSModel(GV, getTargetMachine().getRelocationModel());
5399
5400    switch (model) {
5401      case TLSModel::GeneralDynamic:
5402      case TLSModel::LocalDynamic: // not implemented
5403        if (Subtarget->is64Bit())
5404          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
5405        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
5406
5407      case TLSModel::InitialExec:
5408      case TLSModel::LocalExec:
5409        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
5410                                   Subtarget->is64Bit());
5411    }
5412  } else if (Subtarget->isTargetDarwin()) {
5413    // Darwin only has one model of TLS.  Lower to that.
5414    unsigned char OpFlag = 0;
5415    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
5416                           X86ISD::WrapperRIP : X86ISD::Wrapper;
5417
5418    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5419    // global base reg.
5420    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
5421                  !Subtarget->is64Bit();
5422    if (PIC32)
5423      OpFlag = X86II::MO_TLVP_PIC_BASE;
5424    else
5425      OpFlag = X86II::MO_TLVP;
5426
5427    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(),
5428                                                getPointerTy(),
5429                                                GA->getOffset(), OpFlag);
5430
5431    DebugLoc DL = Op.getDebugLoc();
5432    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5433
5434    // With PIC32, the address is actually $g + Offset.
5435    if (PIC32)
5436      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5437                           DAG.getNode(X86ISD::GlobalBaseReg,
5438                                       DebugLoc(), getPointerTy()),
5439                           Offset);
5440
5441    // Lowering the X86ISD::TLSCALL machine node will make sure everything ends
5442    // up in the right location.
5443    SDValue Args[] = { Offset };
5444    SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1);
5445
5446    // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
5447    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5448    MFI->setAdjustsStack(true);
5449
5450    // And our return value (tls address) is in the standard call return value
5451    // location.
5452    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
5453    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
5454  }
5455
5456  assert(false &&
5457         "TLS not implemented for this target.");
5458
5459  llvm_unreachable("Unreachable");
5460  return SDValue();
5461}
5462
5463
5464/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
5465/// take a 2 x i32 value to shift plus a shift amount.
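/// Roughly, for an i64 split as (Lo, Hi) with the shift amount already reduced
/// modulo 64, the SHL_PARTS case handled below is (a hedged sketch):
///   if (Amt & 32) { ResHi = Lo << (Amt & 31);  ResLo = 0;         }
///   else          { ResHi = SHLD(Hi, Lo, Amt); ResLo = Lo << Amt; }
/// The SRL/SRA cases are symmetric, with zero (or the sign) filling the high
/// half when the shift amount is 32 or more.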
5466SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
5467  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
5468  EVT VT = Op.getValueType();
5469  unsigned VTBits = VT.getSizeInBits();
5470  DebugLoc dl = Op.getDebugLoc();
5471  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
5472  SDValue ShOpLo = Op.getOperand(0);
5473  SDValue ShOpHi = Op.getOperand(1);
5474  SDValue ShAmt  = Op.getOperand(2);
5475  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
5476                                     DAG.getConstant(VTBits - 1, MVT::i8))
5477                       : DAG.getConstant(0, VT);
5478
5479  SDValue Tmp2, Tmp3;
5480  if (Op.getOpcode() == ISD::SHL_PARTS) {
5481    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
5482    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
5483  } else {
5484    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
5485    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
5486  }
5487
5488  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
5489                                DAG.getConstant(VTBits, MVT::i8));
5490  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
5491                             AndNode, DAG.getConstant(0, MVT::i8));
5492
5493  SDValue Hi, Lo;
5494  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5495  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
5496  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
5497
5498  if (Op.getOpcode() == ISD::SHL_PARTS) {
5499    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5500    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5501  } else {
5502    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
5503    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
5504  }
5505
5506  SDValue Ops[2] = { Lo, Hi };
5507  return DAG.getMergeValues(Ops, 2, dl);
5508}
5509
5510SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
5511                                           SelectionDAG &DAG) const {
5512  EVT SrcVT = Op.getOperand(0).getValueType();
5513
5514  if (SrcVT.isVector()) {
5515    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
5516      return Op;
5517    }
5518    return SDValue();
5519  }
5520
5521  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
5522         "Unknown SINT_TO_FP to lower!");
5523
5524  // These are really Legal; return the operand so the caller accepts it as
5525  // Legal.
5526  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
5527    return Op;
5528  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
5529      Subtarget->is64Bit()) {
5530    return Op;
5531  }
5532
5533  DebugLoc dl = Op.getDebugLoc();
5534  unsigned Size = SrcVT.getSizeInBits()/8;
5535  MachineFunction &MF = DAG.getMachineFunction();
5536  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
5537  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5538  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5539                               StackSlot,
5540                               PseudoSourceValue::getFixedStack(SSFI), 0,
5541                               false, false, 0);
5542  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
5543}
5544
5545SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
5546                                     SDValue StackSlot,
5547                                     SelectionDAG &DAG) const {
5548  // Build the FILD
5549  DebugLoc dl = Op.getDebugLoc();
5550  SDVTList Tys;
5551  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
5552  if (useSSE)
5553    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
5554  else
5555    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
5556  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
5557  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
5558                               Tys, Ops, array_lengthof(Ops));
5559
5560  if (useSSE) {
5561    Chain = Result.getValue(1);
5562    SDValue InFlag = Result.getValue(2);
5563
5564    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
5565    // shouldn't be necessary except that RFP cannot be live across
5566    // multiple blocks. When stackifier is fixed, they can be uncoupled.
5567    MachineFunction &MF = DAG.getMachineFunction();
5568    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
5569    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5570    Tys = DAG.getVTList(MVT::Other);
5571    SDValue Ops[] = {
5572      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
5573    };
5574    Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
5575    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
5576                         PseudoSourceValue::getFixedStack(SSFI), 0,
5577                         false, false, 0);
5578  }
5579
5580  return Result;
5581}
5582
5583// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
5584SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
5585                                               SelectionDAG &DAG) const {
5586  // This algorithm is not obvious. Here it is in C code, more or less:
5587  /*
5588    double uint64_to_double( uint32_t hi, uint32_t lo ) {
5589      static const __m128i exp = { 0x4330000045300000ULL, 0 };
5590      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
5591
5592      // Copy ints to xmm registers.
5593      __m128i xh = _mm_cvtsi32_si128( hi );
5594      __m128i xl = _mm_cvtsi32_si128( lo );
5595
5596      // Combine into low half of a single xmm register.
5597      __m128i x = _mm_unpacklo_epi32( xh, xl );
5598      __m128d d;
5599      double sd;
5600
5601      // Merge in appropriate exponents to give the integer bits the right
5602      // magnitude.
5603      x = _mm_unpacklo_epi32( x, exp );
5604
5605      // Subtract away the biases to deal with the IEEE-754 double precision
5606      // implicit 1.
5607      d = _mm_sub_pd( (__m128d) x, bias );
5608
5609      // All conversions up to here are exact. The correctly rounded result is
5610      // calculated using the current rounding mode using the following
5611      // horizontal add.
5612      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
5613      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
5614                                // store doesn't really need to be here (except
5615                                // maybe to zero the other double)
5616      return sd;
5617    }
5618  */
5619
5620  DebugLoc dl = Op.getDebugLoc();
5621  LLVMContext *Context = DAG.getContext();
5622
5623  // Build some magic constants.
5624  std::vector<Constant*> CV0;
5625  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
5626  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
5627  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5628  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
5629  Constant *C0 = ConstantVector::get(CV0);
5630  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
5631
5632  std::vector<Constant*> CV1;
5633  CV1.push_back(
5634    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
5635  CV1.push_back(
5636    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
5637  Constant *C1 = ConstantVector::get(CV1);
5638  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
5639
5640  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5641                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5642                                        Op.getOperand(0),
5643                                        DAG.getIntPtrConstant(1)));
5644  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5645                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5646                                        Op.getOperand(0),
5647                                        DAG.getIntPtrConstant(0)));
5648  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
5649  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
5650                              PseudoSourceValue::getConstantPool(), 0,
5651                              false, false, 16);
5652  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
5653  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
5654  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
5655                              PseudoSourceValue::getConstantPool(), 0,
5656                              false, false, 16);
5657  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
5658
5659  // Add the halves; easiest way is to swap them into another reg first.
5660  int ShufMask[2] = { 1, -1 };
5661  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
5662                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
5663  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
5664  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
5665                     DAG.getIntPtrConstant(0));
5666}
5667
5668// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
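// The idea, as a hedged C sketch of the trick rather than the exact DAG built
// below:
//   double uint32_to_double(uint32_t u) {
//     uint64_t bits = 0x4330000000000000ULL | u; // 2^52 with u in the mantissa
//     double d;
//     memcpy(&d, &bits, sizeof d);               // reinterpret as double
//     return d - 0x1.0p52;                       // subtract the 2^52 bias
//   }
// Every step is exact, so the only rounding is the final FP_ROUND when the
// destination type is narrower than f64.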
5669SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
5670                                               SelectionDAG &DAG) const {
5671  DebugLoc dl = Op.getDebugLoc();
5672  // FP constant to bias correct the final result.
5673  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
5674                                   MVT::f64);
5675
5676  // Load the 32-bit value into an XMM register.
5677  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
5678                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
5679                                         Op.getOperand(0),
5680                                         DAG.getIntPtrConstant(0)));
5681
5682  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5683                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
5684                     DAG.getIntPtrConstant(0));
5685
5686  // Or the load with the bias.
5687  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
5688                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5689                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5690                                                   MVT::v2f64, Load)),
5691                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5692                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5693                                                   MVT::v2f64, Bias)));
5694  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5695                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
5696                   DAG.getIntPtrConstant(0));
5697
5698  // Subtract the bias.
5699  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
5700
5701  // Handle final rounding.
5702  EVT DestVT = Op.getValueType();
5703
5704  if (DestVT.bitsLT(MVT::f64)) {
5705    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5706                       DAG.getIntPtrConstant(0));
5707  } else if (DestVT.bitsGT(MVT::f64)) {
5708    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5709  }
5710
5711  // Otherwise the result is already f64; no final rounding is needed.
5712  return Sub;
5713}
5714
5715SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
5716                                           SelectionDAG &DAG) const {
5717  SDValue N0 = Op.getOperand(0);
5718  DebugLoc dl = Op.getDebugLoc();
5719
5720  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
5721  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
5722  // the optimization here.
5723  if (DAG.SignBitIsZero(N0))
5724    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5725
5726  EVT SrcVT = N0.getValueType();
5727  EVT DstVT = Op.getValueType();
5728  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
5729    return LowerUINT_TO_FP_i64(Op, DAG);
5730  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
5731    return LowerUINT_TO_FP_i32(Op, DAG);
5732
5733  // Make a 64-bit buffer, and use it to build an FILD.
5734  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5735  if (SrcVT == MVT::i32) {
5736    SDValue WordOff = DAG.getConstant(4, getPointerTy());
5737    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5738                                     getPointerTy(), StackSlot, WordOff);
5739    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5740                                  StackSlot, NULL, 0, false, false, 0);
5741    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5742                                  OffsetSlot, NULL, 0, false, false, 0);
5743    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5744    return Fild;
5745  }
5746
5747  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
5748  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5749                                StackSlot, NULL, 0, false, false, 0);
5750  // For i64 source, we need to add the appropriate power of 2 if the input
5751  // was negative.  This is the same as the optimization in
5752  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
5753  // we must be careful to do the computation in x87 extended precision, not
5754  // in SSE. (The generic code can't know it's OK to do this, or how to.)
5755  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
5756  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
5757  SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3);
5758
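  // FILD interprets the stored i64 as signed, so a value with the sign bit set
  // comes out as (value - 2^64); adding 2^64 back corrects it.  0x5F800000 is
  // the IEEE-754 single-precision encoding of 2^64.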
5759  APInt FF(32, 0x5F800000ULL);
5760
5761  // Check whether the sign bit is set.
5762  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
5763                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
5764                                 ISD::SETLT);
5765
5766  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
5767  SDValue FudgePtr = DAG.getConstantPool(
5768                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
5769                                         getPointerTy());
5770
5771  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
5772  SDValue Zero = DAG.getIntPtrConstant(0);
5773  SDValue Four = DAG.getIntPtrConstant(4);
5774  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
5775                               Zero, Four);
5776  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
5777
5778  // Load the value out, extending it from f32 to f80.
5779  // FIXME: Avoid the extend by constructing the right constant pool?
5780  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
5781                                 FudgePtr, PseudoSourceValue::getConstantPool(),
5782                                 0, MVT::f32, false, false, 4);
5783  // Extend everything to 80 bits to force it to be done on x87.
5784  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
5785  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
5786}
5787
5788std::pair<SDValue,SDValue> X86TargetLowering::
5789FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
5790  DebugLoc dl = Op.getDebugLoc();
5791
5792  EVT DstTy = Op.getValueType();
5793
5794  if (!IsSigned) {
5795    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5796    DstTy = MVT::i64;
5797  }
5798
5799  assert(DstTy.getSimpleVT() <= MVT::i64 &&
5800         DstTy.getSimpleVT() >= MVT::i16 &&
5801         "Unknown FP_TO_SINT to lower!");
5802
5803  // These are really Legal.
5804  if (DstTy == MVT::i32 &&
5805      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5806    return std::make_pair(SDValue(), SDValue());
5807  if (Subtarget->is64Bit() &&
5808      DstTy == MVT::i64 &&
5809      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5810    return std::make_pair(SDValue(), SDValue());
5811
5812  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
5813  // stack slot.
5814  MachineFunction &MF = DAG.getMachineFunction();
5815  unsigned MemSize = DstTy.getSizeInBits()/8;
5816  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5817  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5818
5819  unsigned Opc;
5820  switch (DstTy.getSimpleVT().SimpleTy) {
5821  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
5822  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
5823  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
5824  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
5825  }
5826
5827  SDValue Chain = DAG.getEntryNode();
5828  SDValue Value = Op.getOperand(0);
5829  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
5830    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
5831    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
5832                         PseudoSourceValue::getFixedStack(SSFI), 0,
5833                         false, false, 0);
5834    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
5835    SDValue Ops[] = {
5836      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
5837    };
5838    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
5839    Chain = Value.getValue(1);
5840    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
5841    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
5842  }
5843
5844  // Build the FP_TO_INT*_IN_MEM
5845  SDValue Ops[] = { Chain, Value, StackSlot };
5846  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
5847
5848  return std::make_pair(FIST, StackSlot);
5849}
5850
5851SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
5852                                           SelectionDAG &DAG) const {
5853  if (Op.getValueType().isVector()) {
5854    if (Op.getValueType() == MVT::v2i32 &&
5855        Op.getOperand(0).getValueType() == MVT::v2f64) {
5856      return Op;
5857    }
5858    return SDValue();
5859  }
5860
5861  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
5862  SDValue FIST = Vals.first, StackSlot = Vals.second;
5863  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
5864  if (FIST.getNode() == 0) return Op;
5865
5866  // Load the result.
5867  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5868                     FIST, StackSlot, NULL, 0, false, false, 0);
5869}
5870
5871SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
5872                                           SelectionDAG &DAG) const {
5873  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
5874  SDValue FIST = Vals.first, StackSlot = Vals.second;
5875  assert(FIST.getNode() && "Unexpected failure");
5876
5877  // Load the result.
5878  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
5879                     FIST, StackSlot, NULL, 0, false, false, 0);
5880}
5881
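// LowerFABS - fabs is lowered to an FAND with a constant-pool mask that clears
// only the sign bit, i.e. roughly: result = x & ~(1 << (bits - 1)), applied to
// the f32/f64 bit pattern (or to each element of a vector).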
5882SDValue X86TargetLowering::LowerFABS(SDValue Op,
5883                                     SelectionDAG &DAG) const {
5884  LLVMContext *Context = DAG.getContext();
5885  DebugLoc dl = Op.getDebugLoc();
5886  EVT VT = Op.getValueType();
5887  EVT EltVT = VT;
5888  if (VT.isVector())
5889    EltVT = VT.getVectorElementType();
5890  std::vector<Constant*> CV;
5891  if (EltVT == MVT::f64) {
5892    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
5893    CV.push_back(C);
5894    CV.push_back(C);
5895  } else {
5896    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
5897    CV.push_back(C);
5898    CV.push_back(C);
5899    CV.push_back(C);
5900    CV.push_back(C);
5901  }
5902  Constant *C = ConstantVector::get(CV);
5903  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5904  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5905                             PseudoSourceValue::getConstantPool(), 0,
5906                             false, false, 16);
5907  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
5908}
5909
5910SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
5911  LLVMContext *Context = DAG.getContext();
5912  DebugLoc dl = Op.getDebugLoc();
5913  EVT VT = Op.getValueType();
5914  EVT EltVT = VT;
5915  if (VT.isVector())
5916    EltVT = VT.getVectorElementType();
5917  std::vector<Constant*> CV;
5918  if (EltVT == MVT::f64) {
5919    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
5920    CV.push_back(C);
5921    CV.push_back(C);
5922  } else {
5923    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
5924    CV.push_back(C);
5925    CV.push_back(C);
5926    CV.push_back(C);
5927    CV.push_back(C);
5928  }
5929  Constant *C = ConstantVector::get(CV);
5930  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5931  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
5932                             PseudoSourceValue::getConstantPool(), 0,
5933                             false, false, 16);
5934  if (VT.isVector()) {
5935    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5936                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
5937                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5938                                Op.getOperand(0)),
5939                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask)));
5940  } else {
5941    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
5942  }
5943}
5944
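// LowerFCOPYSIGN - copysign(x, y) is lowered to bitwise ops on constant-pool
// masks; in outline (a sketch, not the exact DAG):
//   SignBit = y & SIGN_MASK;              // sign bit of the second operand
//   result  = (x & ~SIGN_MASK) | SignBit;
// with SIGN_MASK covering only the sign bit of the operand's type, plus a
// shift of the sign bit when the two operands have different widths.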
5945SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5946  LLVMContext *Context = DAG.getContext();
5947  SDValue Op0 = Op.getOperand(0);
5948  SDValue Op1 = Op.getOperand(1);
5949  DebugLoc dl = Op.getDebugLoc();
5950  EVT VT = Op.getValueType();
5951  EVT SrcVT = Op1.getValueType();
5952
5953  // If the second operand is smaller, extend it first.
5954  if (SrcVT.bitsLT(VT)) {
5955    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
5956    SrcVT = VT;
5957  }
5958  // And if it is bigger, shrink it first.
5959  if (SrcVT.bitsGT(VT)) {
5960    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
5961    SrcVT = VT;
5962  }
5963
5964  // At this point the operands and the result should have the same
5965  // type, and that won't be f80 since that is not custom lowered.
5966
5967  // First get the sign bit of the second operand.
5968  std::vector<Constant*> CV;
5969  if (SrcVT == MVT::f64) {
5970    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
5971    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
5972  } else {
5973    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
5974    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5975    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5976    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
5977  }
5978  Constant *C = ConstantVector::get(CV);
5979  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
5980  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
5981                              PseudoSourceValue::getConstantPool(), 0,
5982                              false, false, 16);
5983  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
5984
5985  // Shift sign bit right or left if the two operands have different types.
5986  if (SrcVT.bitsGT(VT)) {
5987    // Op0 is MVT::f32, Op1 is MVT::f64.
5988    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
5989    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
5990                          DAG.getConstant(32, MVT::i32));
5991    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
5992    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
5993                          DAG.getIntPtrConstant(0));
5994  }
5995
5996  // Clear first operand sign bit.
5997  CV.clear();
5998  if (VT == MVT::f64) {
5999    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
6000    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
6001  } else {
6002    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
6003    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
6004    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
6005    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
6006  }
6007  C = ConstantVector::get(CV);
6008  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
6009  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
6010                              PseudoSourceValue::getConstantPool(), 0,
6011                              false, false, 16);
6012  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
6013
6014  // Or the value with the sign bit.
6015  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
6016}
6017
6018/// Emit nodes that will be selected as "test Op0,Op0", or something
6019/// equivalent.
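/// For example, for
///   %sum = add i32 %x, 1
///   %cmp = icmp ne i32 %sum, 0
/// the add can be selected as INC and its EFLAGS result reused for the
/// compare, so no separate TEST is emitted (illustrative IR, not taken from
/// any particular input).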
6020SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
6021                                    SelectionDAG &DAG) const {
6022  DebugLoc dl = Op.getDebugLoc();
6023
6024  // CF and OF aren't always set the way we want. Determine which
6025  // of these we need.
6026  bool NeedCF = false;
6027  bool NeedOF = false;
6028  switch (X86CC) {
6029  default: break;
6030  case X86::COND_A: case X86::COND_AE:
6031  case X86::COND_B: case X86::COND_BE:
6032    NeedCF = true;
6033    break;
6034  case X86::COND_G: case X86::COND_GE:
6035  case X86::COND_L: case X86::COND_LE:
6036  case X86::COND_O: case X86::COND_NO:
6037    NeedOF = true;
6038    break;
6039  }
6040
6041  // See if we can use the EFLAGS value from the operand instead of
6042  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
6043  // we prove that the arithmetic won't overflow, we can't use OF or CF.
6044  if (Op.getResNo() != 0 || NeedOF || NeedCF)
6045    // Emit a CMP with 0, which is the TEST pattern.
6046    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6047                       DAG.getConstant(0, Op.getValueType()));
6048
6049  unsigned Opcode = 0;
6050  unsigned NumOperands = 0;
6051  switch (Op.getNode()->getOpcode()) {
6052  case ISD::ADD:
6053    // Due to an isel shortcoming, be conservative if this add is likely to be
6054    // selected as part of a load-modify-store instruction. When the root node
6055    // in a match is a store, isel doesn't know how to remap non-chain non-flag
6056    // uses of other nodes in the match, such as the ADD in this case. This
6057    // leads to the ADD being left around and reselected, with the result being
6058    // two adds in the output.  Alas, even if none of our users are stores, that
6059    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
6060    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
6061    // climbing the DAG back to the root, and it doesn't seem to be worth the
6062    // effort.
6063    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6064           UE = Op.getNode()->use_end(); UI != UE; ++UI)
6065      if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6066        goto default_case;
6067
6068    if (ConstantSDNode *C =
6069        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6070      // An add of one will be selected as an INC.
6071      if (C->getAPIntValue() == 1) {
6072        Opcode = X86ISD::INC;
6073        NumOperands = 1;
6074        break;
6075      }
6076
6077      // An add of negative one (subtract of one) will be selected as a DEC.
6078      if (C->getAPIntValue().isAllOnesValue()) {
6079        Opcode = X86ISD::DEC;
6080        NumOperands = 1;
6081        break;
6082      }
6083    }
6084
6085    // Otherwise use a regular EFLAGS-setting add.
6086    Opcode = X86ISD::ADD;
6087    NumOperands = 2;
6088    break;
6089  case ISD::AND: {
6090    // If the primary result of the AND isn't used, don't bother using
6091    // X86ISD::AND, because a TEST instruction will be better.
6092    bool NonFlagUse = false;
6093    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6094           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6095      SDNode *User = *UI;
6096      unsigned UOpNo = UI.getOperandNo();
6097      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6098        // Look past the truncate.
6099        UOpNo = User->use_begin().getOperandNo();
6100        User = *User->use_begin();
6101      }
6102
6103      if (User->getOpcode() != ISD::BRCOND &&
6104          User->getOpcode() != ISD::SETCC &&
6105          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6106        NonFlagUse = true;
6107        break;
6108      }
6109    }
6110
6111    if (!NonFlagUse)
6112      break;
6113  }
6114    // FALL THROUGH
6115  case ISD::SUB:
6116  case ISD::OR:
6117  case ISD::XOR:
6118    // Due to the ISEL shortcoming noted above, be conservative if this op is
6119    // likely to be selected as part of a load-modify-store instruction.
6120    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6121           UE = Op.getNode()->use_end(); UI != UE; ++UI)
6122      if (UI->getOpcode() == ISD::STORE)
6123        goto default_case;
6124
6125    // Otherwise use a regular EFLAGS-setting instruction.
6126    switch (Op.getNode()->getOpcode()) {
6127    default: llvm_unreachable("unexpected operator!");
6128    case ISD::SUB: Opcode = X86ISD::SUB; break;
6129    case ISD::OR:  Opcode = X86ISD::OR;  break;
6130    case ISD::XOR: Opcode = X86ISD::XOR; break;
6131    case ISD::AND: Opcode = X86ISD::AND; break;
6132    }
6133
6134    NumOperands = 2;
6135    break;
6136  case X86ISD::ADD:
6137  case X86ISD::SUB:
6138  case X86ISD::INC:
6139  case X86ISD::DEC:
6140  case X86ISD::OR:
6141  case X86ISD::XOR:
6142  case X86ISD::AND:
6143    return SDValue(Op.getNode(), 1);
6144  default:
6145  default_case:
6146    break;
6147  }
6148
6149  if (Opcode == 0)
6150    // Emit a CMP with 0, which is the TEST pattern.
6151    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6152                       DAG.getConstant(0, Op.getValueType()));
6153
6154  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
6155  SmallVector<SDValue, 4> Ops;
6156  for (unsigned i = 0; i != NumOperands; ++i)
6157    Ops.push_back(Op.getOperand(i));
6158
6159  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
6160  DAG.ReplaceAllUsesWith(Op, New);
6161  return SDValue(New.getNode(), 1);
6162}
6163
6164/// Emit nodes that will be selected as "cmp Op0,Op1", or something
6165/// equivalent.
6166SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
6167                                   SelectionDAG &DAG) const {
6168  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
6169    if (C->getAPIntValue() == 0)
6170      return EmitTest(Op0, X86CC, DAG);
6171
6172  DebugLoc dl = Op0.getDebugLoc();
6173  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
6174}
6175
6176/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
6177/// if it's possible.
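/// For example, both (and %x, (shl 1, %n)) == 0 and (and (srl %x, %n), 1) != 0
/// become BT %x, %n followed by a SETCC on the carry flag (AE for the == 0
/// form, B for the != 0 form).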
6178SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
6179                                     DebugLoc dl, SelectionDAG &DAG) const {
6180  SDValue Op0 = And.getOperand(0);
6181  SDValue Op1 = And.getOperand(1);
6182  if (Op0.getOpcode() == ISD::TRUNCATE)
6183    Op0 = Op0.getOperand(0);
6184  if (Op1.getOpcode() == ISD::TRUNCATE)
6185    Op1 = Op1.getOperand(0);
6186
6187  SDValue LHS, RHS;
6188  if (Op1.getOpcode() == ISD::SHL)
6189    std::swap(Op0, Op1);
6190  if (Op0.getOpcode() == ISD::SHL) {
6191    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
6192      if (And00C->getZExtValue() == 1) {
6193        // If we looked past a truncate, check that it's only truncating away
6194        // known zeros.
6195        unsigned BitWidth = Op0.getValueSizeInBits();
6196        unsigned AndBitWidth = And.getValueSizeInBits();
6197        if (BitWidth > AndBitWidth) {
6198          APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
6199          DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
6200          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
6201            return SDValue();
6202        }
6203        LHS = Op1;
6204        RHS = Op0.getOperand(1);
6205      }
6206  } else if (Op1.getOpcode() == ISD::Constant) {
6207    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
6208    SDValue AndLHS = Op0;
6209    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
6210      LHS = AndLHS.getOperand(0);
6211      RHS = AndLHS.getOperand(1);
6212    }
6213  }
6214
6215  if (LHS.getNode()) {
6216    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
6217    // instruction.  Since the shift amount is in-range-or-undefined, we know
6218    // that doing a bittest on the i32 value is ok.  We extend to i32 because
6219    // the encoding for the i16 version is larger than the i32 version.
6220    // Also promote i16 to i32 for performance / code size reasons.
6221    if (LHS.getValueType() == MVT::i8 ||
6222        LHS.getValueType() == MVT::i16)
6223      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
6224
6225    // If the operand types disagree, extend the shift amount to match.  Since
6226    // BT ignores high bits (like shifts) we can use anyextend.
6227    if (LHS.getValueType() != RHS.getValueType())
6228      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
6229
6230    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
6231    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
6232    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6233                       DAG.getConstant(Cond, MVT::i8), BT);
6234  }
6235
6236  return SDValue();
6237}
6238
6239SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
6240  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
6241  SDValue Op0 = Op.getOperand(0);
6242  SDValue Op1 = Op.getOperand(1);
6243  DebugLoc dl = Op.getDebugLoc();
6244  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6245
6246  // Optimize to BT if possible.
6247  // Lower (X & (1 << N)) == 0 to BT(X, N).
6248  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
6249  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
6250  if (Op0.getOpcode() == ISD::AND &&
6251      Op0.hasOneUse() &&
6252      Op1.getOpcode() == ISD::Constant &&
6253      cast<ConstantSDNode>(Op1)->isNullValue() &&
6254      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6255    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
6256    if (NewSetCC.getNode())
6257      return NewSetCC;
6258  }
6259
6260  // Look for "(setcc) == / != 1" (or 0) to avoid an unnecessary setcc.
6261  if (Op0.getOpcode() == X86ISD::SETCC &&
6262      Op1.getOpcode() == ISD::Constant &&
6263      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
6264       cast<ConstantSDNode>(Op1)->isNullValue()) &&
6265      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6266    X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
6267    bool Invert = (CC == ISD::SETNE) ^
6268      cast<ConstantSDNode>(Op1)->isNullValue();
6269    if (Invert)
6270      CCode = X86::GetOppositeBranchCondition(CCode);
6271    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6272                       DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
6273  }
6274
6275  bool isFP = Op1.getValueType().isFloatingPoint();
6276  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
6277  if (X86CC == X86::COND_INVALID)
6278    return SDValue();
6279
6280  SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
6281
6282  // Use sbb x, x to materialize carry bit into a GPR.
6283  if (X86CC == X86::COND_B)
6284    return DAG.getNode(ISD::AND, dl, MVT::i8,
6285                       DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
6286                                   DAG.getConstant(X86CC, MVT::i8), Cond),
6287                       DAG.getConstant(1, MVT::i8));
6288
6289  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6290                     DAG.getConstant(X86CC, MVT::i8), Cond);
6291}
6292
6293SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
6294  SDValue Cond;
6295  SDValue Op0 = Op.getOperand(0);
6296  SDValue Op1 = Op.getOperand(1);
6297  SDValue CC = Op.getOperand(2);
6298  EVT VT = Op.getValueType();
6299  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6300  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6301  DebugLoc dl = Op.getDebugLoc();
6302
6303  if (isFP) {
6304    unsigned SSECC = 8;
6305    EVT VT0 = Op0.getValueType();
6306    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
6307    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
6308    bool Swap = false;
6309
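    // SSECC below is the CMPPS/CMPPD immediate: 0=EQ, 1=LT, 2=LE, 3=UNORD,
    // 4=NEQ, 5=NLT, 6=NLE, 7=ORD; 8 is a sentinel meaning "no single
    // predicate", handled further down with two compares.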
6310    switch (SetCCOpcode) {
6311    default: break;
6312    case ISD::SETOEQ:
6313    case ISD::SETEQ:  SSECC = 0; break;
6314    case ISD::SETOGT:
6315    case ISD::SETGT: Swap = true; // Fallthrough
6316    case ISD::SETLT:
6317    case ISD::SETOLT: SSECC = 1; break;
6318    case ISD::SETOGE:
6319    case ISD::SETGE: Swap = true; // Fallthrough
6320    case ISD::SETLE:
6321    case ISD::SETOLE: SSECC = 2; break;
6322    case ISD::SETUO:  SSECC = 3; break;
6323    case ISD::SETUNE:
6324    case ISD::SETNE:  SSECC = 4; break;
6325    case ISD::SETULE: Swap = true;
6326    case ISD::SETUGE: SSECC = 5; break;
6327    case ISD::SETULT: Swap = true;
6328    case ISD::SETUGT: SSECC = 6; break;
6329    case ISD::SETO:   SSECC = 7; break;
6330    }
6331    if (Swap)
6332      std::swap(Op0, Op1);
6333
6334    // In the two special cases we can't handle, emit two comparisons.
6335    if (SSECC == 8) {
6336      if (SetCCOpcode == ISD::SETUEQ) {
6337        SDValue UNORD, EQ;
6338        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
6339        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
6340        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
6341      }
6342      else if (SetCCOpcode == ISD::SETONE) {
6343        SDValue ORD, NEQ;
6344        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
6345        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
6346        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
6347      }
6348      llvm_unreachable("Illegal FP comparison");
6349    }
6350    // Handle all other FP comparisons here.
6351    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
6352  }
6353
6354  // We are handling one of the integer comparisons here.  Since SSE only has
6355  // GT and EQ comparisons for integers, swapping operands and multiple
6356  // operations may be required for some comparisons.
6357  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
6358  bool Swap = false, Invert = false, FlipSigns = false;
6359
6360  switch (VT.getSimpleVT().SimpleTy) {
6361  default: break;
6362  case MVT::v8i8:
6363  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
6364  case MVT::v4i16:
6365  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
6366  case MVT::v2i32:
6367  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
6368  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
6369  }
6370
6371  switch (SetCCOpcode) {
6372  default: break;
6373  case ISD::SETNE:  Invert = true;
6374  case ISD::SETEQ:  Opc = EQOpc; break;
6375  case ISD::SETLT:  Swap = true;
6376  case ISD::SETGT:  Opc = GTOpc; break;
6377  case ISD::SETGE:  Swap = true;
6378  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
6379  case ISD::SETULT: Swap = true;
6380  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
6381  case ISD::SETUGE: Swap = true;
6382  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
6383  }
6384  if (Swap)
6385    std::swap(Op0, Op1);
6386
6387  // Since SSE has no unsigned integer comparisons, we need to flip the sign
6388  // bits of the inputs before performing those operations.
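  // (Flipping the sign bit maps the unsigned order onto the signed one:
  //  a <u b  <=>  (a ^ SignBit) <s (b ^ SignBit).)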
6389  if (FlipSigns) {
6390    EVT EltVT = VT.getVectorElementType();
6391    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
6392                                      EltVT);
6393    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
6394    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
6395                                    SignBits.size());
6396    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
6397    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
6398  }
6399
6400  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
6401
6402  // If the logical-not of the result is required, perform that now.
6403  if (Invert)
6404    Result = DAG.getNOT(dl, Result, VT);
6405
6406  return Result;
6407}
6408
6409// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
6410static bool isX86LogicalCmp(SDValue Op) {
6411  unsigned Opc = Op.getNode()->getOpcode();
6412  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
6413    return true;
6414  if (Op.getResNo() == 1 &&
6415      (Opc == X86ISD::ADD ||
6416       Opc == X86ISD::SUB ||
6417       Opc == X86ISD::SMUL ||
6418       Opc == X86ISD::UMUL ||
6419       Opc == X86ISD::INC ||
6420       Opc == X86ISD::DEC ||
6421       Opc == X86ISD::OR ||
6422       Opc == X86ISD::XOR ||
6423       Opc == X86ISD::AND))
6424    return true;
6425
6426  return false;
6427}
6428
6429SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6430  bool addTest = true;
6431  SDValue Cond  = Op.getOperand(0);
6432  DebugLoc dl = Op.getDebugLoc();
6433  SDValue CC;
6434
6435  if (Cond.getOpcode() == ISD::SETCC) {
6436    SDValue NewCond = LowerSETCC(Cond, DAG);
6437    if (NewCond.getNode())
6438      Cond = NewCond;
6439  }
6440
6441  // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
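  // (CMP x, 1 sets the carry flag exactly when x == 0; SETCC_CARRY is then
  // selected as "sbb reg, reg", which broadcasts that carry into 0 or -1.)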
6442  SDValue Op1 = Op.getOperand(1);
6443  SDValue Op2 = Op.getOperand(2);
6444  if (Cond.getOpcode() == X86ISD::SETCC &&
6445      cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
6446    SDValue Cmp = Cond.getOperand(1);
6447    if (Cmp.getOpcode() == X86ISD::CMP) {
6448      ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
6449      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
6450      ConstantSDNode *RHSC =
6451        dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
6452      if (N1C && N1C->isAllOnesValue() &&
6453          N2C && N2C->isNullValue() &&
6454          RHSC && RHSC->isNullValue()) {
6455        SDValue CmpOp0 = Cmp.getOperand(0);
6456        Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
6457                          CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
6458        return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
6459                           DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
6460      }
6461    }
6462  }
6463
6464  // Look past (and (setcc_carry (cmp ...)), 1).
6465  if (Cond.getOpcode() == ISD::AND &&
6466      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6467    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6468    if (C && C->getAPIntValue() == 1)
6469      Cond = Cond.getOperand(0);
6470  }
6471
6472  // If condition flag is set by a X86ISD::CMP, then use it as the condition
6473  // setting operand in place of the X86ISD::SETCC.
6474  if (Cond.getOpcode() == X86ISD::SETCC ||
6475      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6476    CC = Cond.getOperand(0);
6477
6478    SDValue Cmp = Cond.getOperand(1);
6479    unsigned Opc = Cmp.getOpcode();
6480    EVT VT = Op.getValueType();
6481
6482    bool IllegalFPCMov = false;
6483    if (VT.isFloatingPoint() && !VT.isVector() &&
6484        !isScalarFPTypeInSSEReg(VT))  // FPStack?
6485      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
6486
6487    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
6488        Opc == X86ISD::BT) { // FIXME
6489      Cond = Cmp;
6490      addTest = false;
6491    }
6492  }
6493
6494  if (addTest) {
6495    // Look past the truncate.
6496    if (Cond.getOpcode() == ISD::TRUNCATE)
6497      Cond = Cond.getOperand(0);
6498
6499    // We know the result of AND is compared against zero. Try to match
6500    // it to BT.
6501    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6502      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6503      if (NewSetCC.getNode()) {
6504        CC = NewSetCC.getOperand(0);
6505        Cond = NewSetCC.getOperand(1);
6506        addTest = false;
6507      }
6508    }
6509  }
6510
6511  if (addTest) {
6512    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6513    Cond = EmitTest(Cond, X86::COND_NE, DAG);
6514  }
6515
6516  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
6517  // condition is true.
6518  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
6519  SDValue Ops[] = { Op2, Op1, CC, Cond };
6520  return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
6521}
6522
6523// isAndOrOfSetCCs - Return true if node is an ISD::AND or
6524// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
6525// from the AND / OR.
6526static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
6527  Opc = Op.getOpcode();
6528  if (Opc != ISD::OR && Opc != ISD::AND)
6529    return false;
6530  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6531          Op.getOperand(0).hasOneUse() &&
6532          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
6533          Op.getOperand(1).hasOneUse());
6534}
6535
6536// isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
6537// and the constant 1, where the SETCC node has a single use.
6538static bool isXor1OfSetCC(SDValue Op) {
6539  if (Op.getOpcode() != ISD::XOR)
6540    return false;
6541  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6542  if (N1C && N1C->getAPIntValue() == 1) {
6543    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6544      Op.getOperand(0).hasOneUse();
6545  }
6546  return false;
6547}
6548
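// LowerBRCOND - Lower an ISD::BRCOND by turning the condition into an X86
// condition code plus an EFLAGS-producing node (CMP, BT, or an arithmetic op
// with overflow), emitting one or two X86ISD::BRCOND nodes as needed.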
6549SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
6550  bool addTest = true;
6551  SDValue Chain = Op.getOperand(0);
6552  SDValue Cond  = Op.getOperand(1);
6553  SDValue Dest  = Op.getOperand(2);
6554  DebugLoc dl = Op.getDebugLoc();
6555  SDValue CC;
6556
6557  if (Cond.getOpcode() == ISD::SETCC) {
6558    SDValue NewCond = LowerSETCC(Cond, DAG);
6559    if (NewCond.getNode())
6560      Cond = NewCond;
6561  }
6562#if 0
6563  // FIXME: LowerXALUO doesn't handle these!!
6564  else if (Cond.getOpcode() == X86ISD::ADD  ||
6565           Cond.getOpcode() == X86ISD::SUB  ||
6566           Cond.getOpcode() == X86ISD::SMUL ||
6567           Cond.getOpcode() == X86ISD::UMUL)
6568    Cond = LowerXALUO(Cond, DAG);
6569#endif
6570
6571  // Look past (and (setcc_carry (cmp ...)), 1).
6572  if (Cond.getOpcode() == ISD::AND &&
6573      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6574    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6575    if (C && C->getAPIntValue() == 1)
6576      Cond = Cond.getOperand(0);
6577  }
6578
6579  // If condition flag is set by a X86ISD::CMP, then use it as the condition
6580  // setting operand in place of the X86ISD::SETCC.
6581  if (Cond.getOpcode() == X86ISD::SETCC ||
6582      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6583    CC = Cond.getOperand(0);
6584
6585    SDValue Cmp = Cond.getOperand(1);
6586    unsigned Opc = Cmp.getOpcode();
6587    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6588    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6589      Cond = Cmp;
6590      addTest = false;
6591    } else {
6592      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6593      default: break;
6594      case X86::COND_O:
6595      case X86::COND_B:
6596        // These can only come from an arithmetic instruction with overflow,
6597        // e.g. SADDO, UADDO.
6598        Cond = Cond.getNode()->getOperand(1);
6599        addTest = false;
6600        break;
6601      }
6602    }
6603  } else {
6604    unsigned CondOpc;
6605    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6606      SDValue Cmp = Cond.getOperand(0).getOperand(1);
6607      if (CondOpc == ISD::OR) {
6608        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6609        // two branches instead of an explicit OR instruction with a
6610        // separate test.
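          // (An FCMP_UNE is lowered to an OR of two setcc's, NE and P, of the
          // same compare; here each setcc becomes its own conditional branch
          // on the shared EFLAGS value instead.)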
6611        if (Cmp == Cond.getOperand(1).getOperand(1) &&
6612            isX86LogicalCmp(Cmp)) {
6613          CC = Cond.getOperand(0).getOperand(0);
6614          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6615                              Chain, Dest, CC, Cmp);
6616          CC = Cond.getOperand(1).getOperand(0);
6617          Cond = Cmp;
6618          addTest = false;
6619        }
6620      } else { // ISD::AND
6621        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6622        // two branches instead of an explicit AND instruction with a
6623        // separate test. However, we only do this if this block doesn't
6624        // have a fall-through edge, because this requires an explicit
6625        // jmp when the condition is false.
6626        if (Cmp == Cond.getOperand(1).getOperand(1) &&
6627            isX86LogicalCmp(Cmp) &&
6628            Op.getNode()->hasOneUse()) {
6629          X86::CondCode CCode =
6630            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6631          CCode = X86::GetOppositeBranchCondition(CCode);
6632          CC = DAG.getConstant(CCode, MVT::i8);
6633          SDNode *User = *Op.getNode()->use_begin();
6634          // Look for an unconditional branch following this conditional branch;
6635          // we need one because the successors must be reversed in order
6636          // to implement FCMP_OEQ.
6637          if (User->getOpcode() == ISD::BR) {
6638            SDValue FalseBB = User->getOperand(1);
6639            SDNode *NewBR =
6640              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
6641            assert(NewBR == User);
6642            (void)NewBR;
6643            Dest = FalseBB;
6644
6645            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6646                                Chain, Dest, CC, Cmp);
6647            X86::CondCode CCode =
6648              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6649            CCode = X86::GetOppositeBranchCondition(CCode);
6650            CC = DAG.getConstant(CCode, MVT::i8);
6651            Cond = Cmp;
6652            addTest = false;
6653          }
6654        }
6655      }
6656    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6657      // Recognize the (xor (setcc), 1) pattern; the xor inverts the condition.
6658      // It should be transformed by the DAG combiner except when the condition
6659      // is set by an arithmetic-with-overflow node.
6660      X86::CondCode CCode =
6661        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6662      CCode = X86::GetOppositeBranchCondition(CCode);
6663      CC = DAG.getConstant(CCode, MVT::i8);
6664      Cond = Cond.getOperand(0).getOperand(1);
6665      addTest = false;
6666    }
6667  }
6668
6669  if (addTest) {
6670    // Look past the truncate.
6671    if (Cond.getOpcode() == ISD::TRUNCATE)
6672      Cond = Cond.getOperand(0);
6673
6674    // We know the result of AND is compared against zero. Try to match
6675    // it to BT.
6676    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6677      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6678      if (NewSetCC.getNode()) {
6679        CC = NewSetCC.getOperand(0);
6680        Cond = NewSetCC.getOperand(1);
6681        addTest = false;
6682      }
6683    }
6684  }
6685
6686  if (addTest) {
6687    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6688    Cond = EmitTest(Cond, X86::COND_NE, DAG);
6689  }
6690  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6691                     Chain, Dest, CC, Cond);
6692}
6693
6694
6695// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
6696// Calls to _alloca are needed to probe the stack when allocating more than 4K
6697// bytes in one go. Touching the stack at 4K increments is necessary to ensure
6698// that the guard pages used by the OS virtual memory manager are allocated in
6699// the correct sequence.
6700SDValue
6701X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6702                                           SelectionDAG &DAG) const {
6703  assert(Subtarget->isTargetCygMing() &&
6704         "This should be used only on Cygwin/Mingw targets");
6705  DebugLoc dl = Op.getDebugLoc();
6706
6707  // Get the inputs.
6708  SDValue Chain = Op.getOperand(0);
6709  SDValue Size  = Op.getOperand(1);
6710  // FIXME: Ensure alignment here
6711
6712  SDValue Flag;
6713
6714  EVT IntPtr = getPointerTy();
6715  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6716
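  // The allocation size is passed to the probe routine in EAX; the
  // MINGW_ALLOCA node is matched to a call to _alloca (see the comment above),
  // which adjusts the stack pointer itself.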
6717  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6718  Flag = Chain.getValue(1);
6719
6720  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6721
6722  Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
6723  Flag = Chain.getValue(1);
6724
6725  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6726
6727  SDValue Ops1[2] = { Chain.getValue(0), Chain };
6728  return DAG.getMergeValues(Ops1, 2, dl);
6729}
6730
6731SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
6732  MachineFunction &MF = DAG.getMachineFunction();
6733  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
6734
6735  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6736  DebugLoc dl = Op.getDebugLoc();
6737
6738  if (!Subtarget->is64Bit()) {
6739    // vastart just stores the address of the VarArgsFrameIndex slot into the
6740    // memory location argument.
6741    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
6742                                   getPointerTy());
6743    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
6744                        false, false, 0);
6745  }
6746
6747  // The x86-64 __va_list_tag layout is:
6748  //   i32 gp_offset         at offset  0  (value in 0 .. 6 * 8)
6749  //   i32 fp_offset         at offset  4  (value in 48 .. 48 + 8 * 16)
6750  //   i8* overflow_arg_area at offset  8  (points to parameters passed in memory)
6751  //   i8* reg_save_area     at offset 16
6752  SmallVector<SDValue, 8> MemOps;
6753  SDValue FIN = Op.getOperand(1);
6754  // Store gp_offset
6755  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
6756                               DAG.getConstant(FuncInfo->getVarArgsGPOffset(),
6757                                               MVT::i32),
6758                               FIN, SV, 0, false, false, 0);
6759  MemOps.push_back(Store);
6760
6761  // Store fp_offset
6762  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6763                    FIN, DAG.getIntPtrConstant(4));
6764  Store = DAG.getStore(Op.getOperand(0), dl,
6765                       DAG.getConstant(FuncInfo->getVarArgsFPOffset(),
6766                                       MVT::i32),
6767                       FIN, SV, 0, false, false, 0);
6768  MemOps.push_back(Store);
6769
6770  // Store ptr to overflow_arg_area
6771  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6772                    FIN, DAG.getIntPtrConstant(4));
6773  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
6774                                    getPointerTy());
6775  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0,
6776                       false, false, 0);
6777  MemOps.push_back(Store);
6778
6779  // Store ptr to reg_save_area.
6780  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
6781                    FIN, DAG.getIntPtrConstant(8));
6782  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
6783                                    getPointerTy());
6784  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0,
6785                       false, false, 0);
6786  MemOps.push_back(Store);
6787  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
6788                     &MemOps[0], MemOps.size());
6789}
6790
6791SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
6792  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6793  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
6794  SDValue Chain = Op.getOperand(0);
6795  SDValue SrcPtr = Op.getOperand(1);
6796  SDValue SrcSV = Op.getOperand(2);
6797
6798  report_fatal_error("VAArgInst is not yet implemented for x86-64!");
6799  return SDValue();
6800}
6801
6802SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
6803  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
6804  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
6805  SDValue Chain = Op.getOperand(0);
6806  SDValue DstPtr = Op.getOperand(1);
6807  SDValue SrcPtr = Op.getOperand(2);
6808  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
6809  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
6810  DebugLoc dl = Op.getDebugLoc();
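  // The x86-64 va_list is 24 bytes ({ i32, i32, i8*, i8* }), so va_copy is
  // just a 24-byte, 8-byte-aligned memcpy.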
6811
6812  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
6813                       DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
6814                       false, DstSV, 0, SrcSV, 0);
6815}
6816
6817SDValue
6818X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
6819  DebugLoc dl = Op.getDebugLoc();
6820  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6821  switch (IntNo) {
6822  default: return SDValue();    // Don't custom lower most intrinsics.
6823  // Comparison intrinsics.
6824  case Intrinsic::x86_sse_comieq_ss:
6825  case Intrinsic::x86_sse_comilt_ss:
6826  case Intrinsic::x86_sse_comile_ss:
6827  case Intrinsic::x86_sse_comigt_ss:
6828  case Intrinsic::x86_sse_comige_ss:
6829  case Intrinsic::x86_sse_comineq_ss:
6830  case Intrinsic::x86_sse_ucomieq_ss:
6831  case Intrinsic::x86_sse_ucomilt_ss:
6832  case Intrinsic::x86_sse_ucomile_ss:
6833  case Intrinsic::x86_sse_ucomigt_ss:
6834  case Intrinsic::x86_sse_ucomige_ss:
6835  case Intrinsic::x86_sse_ucomineq_ss:
6836  case Intrinsic::x86_sse2_comieq_sd:
6837  case Intrinsic::x86_sse2_comilt_sd:
6838  case Intrinsic::x86_sse2_comile_sd:
6839  case Intrinsic::x86_sse2_comigt_sd:
6840  case Intrinsic::x86_sse2_comige_sd:
6841  case Intrinsic::x86_sse2_comineq_sd:
6842  case Intrinsic::x86_sse2_ucomieq_sd:
6843  case Intrinsic::x86_sse2_ucomilt_sd:
6844  case Intrinsic::x86_sse2_ucomile_sd:
6845  case Intrinsic::x86_sse2_ucomigt_sd:
6846  case Intrinsic::x86_sse2_ucomige_sd:
6847  case Intrinsic::x86_sse2_ucomineq_sd: {
6848    unsigned Opc = 0;
6849    ISD::CondCode CC = ISD::SETCC_INVALID;
6850    switch (IntNo) {
6851    default: break;
6852    case Intrinsic::x86_sse_comieq_ss:
6853    case Intrinsic::x86_sse2_comieq_sd:
6854      Opc = X86ISD::COMI;
6855      CC = ISD::SETEQ;
6856      break;
6857    case Intrinsic::x86_sse_comilt_ss:
6858    case Intrinsic::x86_sse2_comilt_sd:
6859      Opc = X86ISD::COMI;
6860      CC = ISD::SETLT;
6861      break;
6862    case Intrinsic::x86_sse_comile_ss:
6863    case Intrinsic::x86_sse2_comile_sd:
6864      Opc = X86ISD::COMI;
6865      CC = ISD::SETLE;
6866      break;
6867    case Intrinsic::x86_sse_comigt_ss:
6868    case Intrinsic::x86_sse2_comigt_sd:
6869      Opc = X86ISD::COMI;
6870      CC = ISD::SETGT;
6871      break;
6872    case Intrinsic::x86_sse_comige_ss:
6873    case Intrinsic::x86_sse2_comige_sd:
6874      Opc = X86ISD::COMI;
6875      CC = ISD::SETGE;
6876      break;
6877    case Intrinsic::x86_sse_comineq_ss:
6878    case Intrinsic::x86_sse2_comineq_sd:
6879      Opc = X86ISD::COMI;
6880      CC = ISD::SETNE;
6881      break;
6882    case Intrinsic::x86_sse_ucomieq_ss:
6883    case Intrinsic::x86_sse2_ucomieq_sd:
6884      Opc = X86ISD::UCOMI;
6885      CC = ISD::SETEQ;
6886      break;
6887    case Intrinsic::x86_sse_ucomilt_ss:
6888    case Intrinsic::x86_sse2_ucomilt_sd:
6889      Opc = X86ISD::UCOMI;
6890      CC = ISD::SETLT;
6891      break;
6892    case Intrinsic::x86_sse_ucomile_ss:
6893    case Intrinsic::x86_sse2_ucomile_sd:
6894      Opc = X86ISD::UCOMI;
6895      CC = ISD::SETLE;
6896      break;
6897    case Intrinsic::x86_sse_ucomigt_ss:
6898    case Intrinsic::x86_sse2_ucomigt_sd:
6899      Opc = X86ISD::UCOMI;
6900      CC = ISD::SETGT;
6901      break;
6902    case Intrinsic::x86_sse_ucomige_ss:
6903    case Intrinsic::x86_sse2_ucomige_sd:
6904      Opc = X86ISD::UCOMI;
6905      CC = ISD::SETGE;
6906      break;
6907    case Intrinsic::x86_sse_ucomineq_ss:
6908    case Intrinsic::x86_sse2_ucomineq_sd:
6909      Opc = X86ISD::UCOMI;
6910      CC = ISD::SETNE;
6911      break;
6912    }
6913
6914    SDValue LHS = Op.getOperand(1);
6915    SDValue RHS = Op.getOperand(2);
6916    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6917    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6918    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6919    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6920                                DAG.getConstant(X86CC, MVT::i8), Cond);
6921    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6922  }
6923  // ptest intrinsics. The intrinsics these come from are designed to return
6924  // an integer value rather than just set flags, so lower them to the ptest
6925  // pattern plus a setcc on the result.
6926  case Intrinsic::x86_sse41_ptestz:
6927  case Intrinsic::x86_sse41_ptestc:
6928  case Intrinsic::x86_sse41_ptestnzc:{
6929    unsigned X86CC = 0;
6930    switch (IntNo) {
6931    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
6932    case Intrinsic::x86_sse41_ptestz:
6933      // ZF = 1
6934      X86CC = X86::COND_E;
6935      break;
6936    case Intrinsic::x86_sse41_ptestc:
6937      // CF = 1
6938      X86CC = X86::COND_B;
6939      break;
6940    case Intrinsic::x86_sse41_ptestnzc:
6941      // ZF and CF = 0
6942      X86CC = X86::COND_A;
6943      break;
6944    }
6945
6946    SDValue LHS = Op.getOperand(1);
6947    SDValue RHS = Op.getOperand(2);
6948    SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
6949    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
6950    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
6951    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6952  }
6953
6954  // Fix vector shift instructions where the last operand is a non-immediate
6955  // i32 value.
6956  case Intrinsic::x86_sse2_pslli_w:
6957  case Intrinsic::x86_sse2_pslli_d:
6958  case Intrinsic::x86_sse2_pslli_q:
6959  case Intrinsic::x86_sse2_psrli_w:
6960  case Intrinsic::x86_sse2_psrli_d:
6961  case Intrinsic::x86_sse2_psrli_q:
6962  case Intrinsic::x86_sse2_psrai_w:
6963  case Intrinsic::x86_sse2_psrai_d:
6964  case Intrinsic::x86_mmx_pslli_w:
6965  case Intrinsic::x86_mmx_pslli_d:
6966  case Intrinsic::x86_mmx_pslli_q:
6967  case Intrinsic::x86_mmx_psrli_w:
6968  case Intrinsic::x86_mmx_psrli_d:
6969  case Intrinsic::x86_mmx_psrli_q:
6970  case Intrinsic::x86_mmx_psrai_w:
6971  case Intrinsic::x86_mmx_psrai_d: {
6972    SDValue ShAmt = Op.getOperand(2);
6973    if (isa<ConstantSDNode>(ShAmt))
6974      return SDValue();
6975
6976    unsigned NewIntNo = 0;
6977    EVT ShAmtVT = MVT::v4i32;
6978    switch (IntNo) {
6979    case Intrinsic::x86_sse2_pslli_w:
6980      NewIntNo = Intrinsic::x86_sse2_psll_w;
6981      break;
6982    case Intrinsic::x86_sse2_pslli_d:
6983      NewIntNo = Intrinsic::x86_sse2_psll_d;
6984      break;
6985    case Intrinsic::x86_sse2_pslli_q:
6986      NewIntNo = Intrinsic::x86_sse2_psll_q;
6987      break;
6988    case Intrinsic::x86_sse2_psrli_w:
6989      NewIntNo = Intrinsic::x86_sse2_psrl_w;
6990      break;
6991    case Intrinsic::x86_sse2_psrli_d:
6992      NewIntNo = Intrinsic::x86_sse2_psrl_d;
6993      break;
6994    case Intrinsic::x86_sse2_psrli_q:
6995      NewIntNo = Intrinsic::x86_sse2_psrl_q;
6996      break;
6997    case Intrinsic::x86_sse2_psrai_w:
6998      NewIntNo = Intrinsic::x86_sse2_psra_w;
6999      break;
7000    case Intrinsic::x86_sse2_psrai_d:
7001      NewIntNo = Intrinsic::x86_sse2_psra_d;
7002      break;
7003    default: {
7004      ShAmtVT = MVT::v2i32;
7005      switch (IntNo) {
7006      case Intrinsic::x86_mmx_pslli_w:
7007        NewIntNo = Intrinsic::x86_mmx_psll_w;
7008        break;
7009      case Intrinsic::x86_mmx_pslli_d:
7010        NewIntNo = Intrinsic::x86_mmx_psll_d;
7011        break;
7012      case Intrinsic::x86_mmx_pslli_q:
7013        NewIntNo = Intrinsic::x86_mmx_psll_q;
7014        break;
7015      case Intrinsic::x86_mmx_psrli_w:
7016        NewIntNo = Intrinsic::x86_mmx_psrl_w;
7017        break;
7018      case Intrinsic::x86_mmx_psrli_d:
7019        NewIntNo = Intrinsic::x86_mmx_psrl_d;
7020        break;
7021      case Intrinsic::x86_mmx_psrli_q:
7022        NewIntNo = Intrinsic::x86_mmx_psrl_q;
7023        break;
7024      case Intrinsic::x86_mmx_psrai_w:
7025        NewIntNo = Intrinsic::x86_mmx_psra_w;
7026        break;
7027      case Intrinsic::x86_mmx_psrai_d:
7028        NewIntNo = Intrinsic::x86_mmx_psra_d;
7029        break;
7030      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
7031      }
7032      break;
7033    }
7034    }
7035
7036    // The vector shift intrinsics with a scalar shift amount use 32-bit values,
7037    // but the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
7038    // to zero.
7039    SDValue ShOps[4];
7040    ShOps[0] = ShAmt;
7041    ShOps[1] = DAG.getConstant(0, MVT::i32);
7042    if (ShAmtVT == MVT::v4i32) {
7043      ShOps[2] = DAG.getUNDEF(MVT::i32);
7044      ShOps[3] = DAG.getUNDEF(MVT::i32);
7045      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
7046    } else {
7047      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
7048    }
7049
7050    EVT VT = Op.getValueType();
7051    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
7052    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7053                       DAG.getConstant(NewIntNo, MVT::i32),
7054                       Op.getOperand(1), ShAmt);
7055  }
7056  }
7057}
7058
7059SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
7060                                           SelectionDAG &DAG) const {
7061  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7062  MFI->setReturnAddressIsTaken(true);
7063
7064  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7065  DebugLoc dl = Op.getDebugLoc();
7066
7067  if (Depth > 0) {
7068    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7069    SDValue Offset =
7070      DAG.getConstant(TD->getPointerSize(),
7071                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
7072    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7073                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
7074                                   FrameAddr, Offset),
7075                       NULL, 0, false, false, 0);
7076  }
7077
7078  // Just load the return address.
7079  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
7080  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7081                     RetAddrFI, NULL, 0, false, false, 0);
7082}
7083
7084SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
7085  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7086  MFI->setFrameAddressIsTaken(true);
7087
7088  EVT VT = Op.getValueType();
7089  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
7090  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7091  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
7092  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
7093  while (Depth--)
7094    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
7095                            false, false, 0);
7096  return FrameAddr;
7097}
7098
7099SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
7100                                                     SelectionDAG &DAG) const {
7101  return DAG.getIntPtrConstant(2*TD->getPointerSize());
7102}
7103
7104SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
7105  MachineFunction &MF = DAG.getMachineFunction();
7106  SDValue Chain     = Op.getOperand(0);
7107  SDValue Offset    = Op.getOperand(1);
7108  SDValue Handler   = Op.getOperand(2);
7109  DebugLoc dl       = Op.getDebugLoc();
7110
7111  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
7112                                  getPointerTy());
7113  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
7114
7115  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
7116                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
7117  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
7118  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
7119  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
7120  MF.getRegInfo().addLiveOut(StoreAddrReg);
7121
7122  return DAG.getNode(X86ISD::EH_RETURN, dl,
7123                     MVT::Other,
7124                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
7125}
7126
7127SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
7128                                             SelectionDAG &DAG) const {
7129  SDValue Root = Op.getOperand(0);
7130  SDValue Trmp = Op.getOperand(1); // trampoline
7131  SDValue FPtr = Op.getOperand(2); // nested function
7132  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7133  DebugLoc dl  = Op.getDebugLoc();
7134
7135  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7136
7137  if (Subtarget->is64Bit()) {
7138    SDValue OutChains[6];
7139
7140    // Large code-model.
7141    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
7142    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
7143
7144    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
7145    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
7146
7147    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
7148
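    // The resulting trampoline (large code model) is laid out as:
    //   bytes  0..1:  REX prefix, 0xB8|r11   movabsq $fptr, %r11
    //   bytes  2..9:  the nested function pointer
    //   bytes 10..11: REX prefix, 0xB8|r10   movabsq $nest, %r10
    //   bytes 12..19: the 'nest' parameter value
    //   bytes 20..21: REX prefix, 0xFF       jmpq *%r11
    //   byte  22:     ModRM byte selecting %r11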
7149    // Load the pointer to the nested function into R11.
7150    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
7151    SDValue Addr = Trmp;
7152    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7153                                Addr, TrmpAddr, 0, false, false, 0);
7154
7155    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7156                       DAG.getConstant(2, MVT::i64));
7157    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
7158                                false, false, 2);
7159
7160    // Load the 'nest' parameter value into R10.
7161    // R10 is specified in X86CallingConv.td
7162    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
7163    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7164                       DAG.getConstant(10, MVT::i64));
7165    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7166                                Addr, TrmpAddr, 10, false, false, 0);
7167
7168    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7169                       DAG.getConstant(12, MVT::i64));
7170    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
7171                                false, false, 2);
7172
7173    // Jump to the nested function.
7174    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
7175    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7176                       DAG.getConstant(20, MVT::i64));
7177    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
7178                                Addr, TrmpAddr, 20, false, false, 0);
7179
7180    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
7181    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
7182                       DAG.getConstant(22, MVT::i64));
7183    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
7184                                TrmpAddr, 22, false, false, 0);
7185
7186    SDValue Ops[] =
7187      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
7188    return DAG.getMergeValues(Ops, 2, dl);
7189  } else {
7190    const Function *Func =
7191      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7192    CallingConv::ID CC = Func->getCallingConv();
7193    unsigned NestReg;
7194
7195    switch (CC) {
7196    default:
7197      llvm_unreachable("Unsupported calling convention");
7198    case CallingConv::C:
7199    case CallingConv::X86_StdCall: {
7200      // Pass 'nest' parameter in ECX.
7201      // Must be kept in sync with X86CallingConv.td
7202      NestReg = X86::ECX;
7203
7204      // Check that ECX wasn't needed by an 'inreg' parameter.
7205      const FunctionType *FTy = Func->getFunctionType();
7206      const AttrListPtr &Attrs = Func->getAttributes();
7207
7208      if (!Attrs.isEmpty() && !Func->isVarArg()) {
7209        unsigned InRegCount = 0;
7210        unsigned Idx = 1;
7211
7212        for (FunctionType::param_iterator I = FTy->param_begin(),
7213             E = FTy->param_end(); I != E; ++I, ++Idx)
7214          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
7215            // FIXME: should only count parameters that are lowered to integers.
7216            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
7217
7218        if (InRegCount > 2) {
7219          report_fatal_error("Nest register in use - reduce number of inreg parameters!");
7220        }
7221      }
7222      break;
7223    }
7224    case CallingConv::X86_FastCall:
7225    case CallingConv::X86_ThisCall:
7226    case CallingConv::Fast:
7227      // Pass 'nest' parameter in EAX.
7228      // Must be kept in sync with X86CallingConv.td
7229      NestReg = X86::EAX;
7230      break;
7231    }
7232
7233    SDValue OutChains[4];
7234    SDValue Addr, Disp;
7235
7236    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7237                       DAG.getConstant(10, MVT::i32));
7238    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
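    // The 32-bit trampoline is 10 bytes:
    //   byte  0:     0xB8|reg   movl $nest, %ecx (or %eax)
    //   bytes 1..4:  the 'nest' parameter value
    //   byte  5:     0xE9       jmp rel32
    //   bytes 6..9:  FPtr - (Trmp + 10), the relative jump displacement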
7239
7240    // This is storing the opcode for MOV32ri.
7241    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
7242    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
7243    OutChains[0] = DAG.getStore(Root, dl,
7244                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
7245                                Trmp, TrmpAddr, 0, false, false, 0);
7246
7247    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7248                       DAG.getConstant(1, MVT::i32));
7249    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
7250                                false, false, 1);
7251
7252    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
7253    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7254                       DAG.getConstant(5, MVT::i32));
7255    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
7256                                TrmpAddr, 5, false, false, 1);
7257
7258    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7259                       DAG.getConstant(6, MVT::i32));
7260    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
7261                                false, false, 1);
7262
7263    SDValue Ops[] =
7264      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
7265    return DAG.getMergeValues(Ops, 2, dl);
7266  }
7267}
7268
7269SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
7270                                            SelectionDAG &DAG) const {
7271  /*
7272   The rounding mode is in bits 11:10 of the x87 FP control word (FPCW),
7273   and has the following settings:
7274     00 Round to nearest
7275     01 Round to -inf
7276     10 Round to +inf
7277     11 Round to 0
7278
7279  FLT_ROUNDS, on the other hand, expects the following:
7280    -1 Undefined
7281     0 Round to 0
7282     1 Round to nearest
7283     2 Round to +inf
7284     3 Round to -inf
7285
7286  To perform the conversion, we do:
7287    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
7288  */
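  // Worked example: with the control word's RC field set to 10b (round toward
  // +inf), (FPCW & 0x800) >> 11 == 1 and (FPCW & 0x400) >> 9 == 0, so
  // ((1 | 0) + 1) & 3 == 2, which is FLT_ROUNDS' encoding for round to +inf.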
7289
7290  MachineFunction &MF = DAG.getMachineFunction();
7291  const TargetMachine &TM = MF.getTarget();
7292  const TargetFrameInfo &TFI = *TM.getFrameInfo();
7293  unsigned StackAlignment = TFI.getStackAlignment();
7294  EVT VT = Op.getValueType();
7295  DebugLoc dl = Op.getDebugLoc();
7296
7297  // Save FP Control Word to stack slot
7298  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
7299  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7300
7301  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
7302                              DAG.getEntryNode(), StackSlot);
7303
7304  // Load FP Control Word from stack slot
7305  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
7306                            false, false, 0);
7307
7308  // Transform as necessary
7309  SDValue CWD1 =
7310    DAG.getNode(ISD::SRL, dl, MVT::i16,
7311                DAG.getNode(ISD::AND, dl, MVT::i16,
7312                            CWD, DAG.getConstant(0x800, MVT::i16)),
7313                DAG.getConstant(11, MVT::i8));
7314  SDValue CWD2 =
7315    DAG.getNode(ISD::SRL, dl, MVT::i16,
7316                DAG.getNode(ISD::AND, dl, MVT::i16,
7317                            CWD, DAG.getConstant(0x400, MVT::i16)),
7318                DAG.getConstant(9, MVT::i8));
7319
7320  SDValue RetVal =
7321    DAG.getNode(ISD::AND, dl, MVT::i16,
7322                DAG.getNode(ISD::ADD, dl, MVT::i16,
7323                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
7324                            DAG.getConstant(1, MVT::i16)),
7325                DAG.getConstant(3, MVT::i16));
7326
7327
7328  return DAG.getNode((VT.getSizeInBits() < 16 ?
7329                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
7330}
7331
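// LowerCTLZ - Lower CTLZ using BSR: BSR yields the index of the highest set
// bit, so for a nonzero x, ctlz(x) == (NumBits - 1) ^ bsr(x).  For example,
// bsr(0x00008000) on i32 is 15 and 31 ^ 15 == 16 leading zeros.  The CMOV
// below substitutes a constant for the zero-input case so the result is
// NumBits.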
7332SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
7333  EVT VT = Op.getValueType();
7334  EVT OpVT = VT;
7335  unsigned NumBits = VT.getSizeInBits();
7336  DebugLoc dl = Op.getDebugLoc();
7337
7338  Op = Op.getOperand(0);
7339  if (VT == MVT::i8) {
7340    // Zero extend to i32 since there is not an i8 bsr.
7341    OpVT = MVT::i32;
7342    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7343  }
7344
7345  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
7346  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7347  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
7348
7349  // If src is zero (i.e. bsr sets ZF), returns NumBits.
7350  SDValue Ops[] = {
7351    Op,
7352    DAG.getConstant(NumBits+NumBits-1, OpVT),
7353    DAG.getConstant(X86::COND_E, MVT::i8),
7354    Op.getValue(1)
7355  };
7356  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7357
7358  // Finally xor with NumBits-1.
7359  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
7360
7361  if (VT == MVT::i8)
7362    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7363  return Op;
7364}
7365
7366SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
7367  EVT VT = Op.getValueType();
7368  EVT OpVT = VT;
7369  unsigned NumBits = VT.getSizeInBits();
7370  DebugLoc dl = Op.getDebugLoc();
7371
7372  Op = Op.getOperand(0);
7373  if (VT == MVT::i8) {
7374    OpVT = MVT::i32;
7375    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7376  }
7377
7378  // Issue a bsf (scan bits forward) which also sets EFLAGS.
7379  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7380  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
7381
7382  // If src is zero (i.e. bsf sets ZF), returns NumBits.
7383  SDValue Ops[] = {
7384    Op,
7385    DAG.getConstant(NumBits, OpVT),
7386    DAG.getConstant(X86::COND_E, MVT::i8),
7387    Op.getValue(1)
7388  };
7389  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7390
7391  if (VT == MVT::i8)
7392    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7393  return Op;
7394}
7395
7396SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
7397  EVT VT = Op.getValueType();
7398  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
7399  DebugLoc dl = Op.getDebugLoc();
7400
7401  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
7402  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
7403  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
7404  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
7405  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
7406  //
7407  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
7408  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
7409  //  return AloBlo + AloBhi + AhiBlo;
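  //
  //  That is, writing each 64-bit lane as A = Alo + Ahi*2^32 and
  //  B = Blo + Bhi*2^32, then modulo 2^64:
  //    A*B = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)
  //  since the Ahi*Bhi term is shifted out entirely.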
7410
7411  SDValue A = Op.getOperand(0);
7412  SDValue B = Op.getOperand(1);
7413
7414  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7415                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7416                       A, DAG.getConstant(32, MVT::i32));
7417  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7418                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7419                       B, DAG.getConstant(32, MVT::i32));
7420  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7421                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7422                       A, B);
7423  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7424                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7425                       A, Bhi);
7426  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7427                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7428                       Ahi, B);
7429  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7430                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7431                       AloBhi, DAG.getConstant(32, MVT::i32));
7432  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7433                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7434                       AhiBlo, DAG.getConstant(32, MVT::i32));
7435  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
7436  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
7437  return Res;
7438}
7439
7440
7441SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
7442  // Lower the "add/sub/mul with overflow" instruction into a regular instruction
7443  // plus a "setcc" instruction that checks the overflow flag. The "brcond"
7444  // lowering looks for this combo and may remove the "setcc" instruction if the
7445  // "setcc" has only one use.
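  // For example, (uadd.with.overflow x, y) becomes an X86ISD::ADD producing
  // EFLAGS, plus an X86ISD::SETCC of X86::COND_B on those flags for the
  // overflow result.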
7446  SDNode *N = Op.getNode();
7447  SDValue LHS = N->getOperand(0);
7448  SDValue RHS = N->getOperand(1);
7449  unsigned BaseOp = 0;
7450  unsigned Cond = 0;
7451  DebugLoc dl = Op.getDebugLoc();
7452
7453  switch (Op.getOpcode()) {
7454  default: llvm_unreachable("Unknown ovf instruction!");
7455  case ISD::SADDO:
7456    // An add of one will be selected as an INC. Note that INC doesn't
7457    // set CF, so we can't do this for UADDO.
7458    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7459      if (C->getAPIntValue() == 1) {
7460        BaseOp = X86ISD::INC;
7461        Cond = X86::COND_O;
7462        break;
7463      }
7464    BaseOp = X86ISD::ADD;
7465    Cond = X86::COND_O;
7466    break;
7467  case ISD::UADDO:
7468    BaseOp = X86ISD::ADD;
7469    Cond = X86::COND_B;
7470    break;
7471  case ISD::SSUBO:
7472    // A subtract of one will be selected as a DEC. Note that DEC doesn't
7473    // set CF, so we can't do this for USUBO.
7474    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7475      if (C->getAPIntValue() == 1) {
7476        BaseOp = X86ISD::DEC;
7477        Cond = X86::COND_O;
7478        break;
7479      }
7480    BaseOp = X86ISD::SUB;
7481    Cond = X86::COND_O;
7482    break;
7483  case ISD::USUBO:
7484    BaseOp = X86ISD::SUB;
7485    Cond = X86::COND_B;
7486    break;
7487  case ISD::SMULO:
7488    BaseOp = X86ISD::SMUL;
7489    Cond = X86::COND_O;
7490    break;
7491  case ISD::UMULO:
7492    BaseOp = X86ISD::UMUL;
7493    Cond = X86::COND_B;
7494    break;
7495  }
7496
7497  // Also sets EFLAGS.
7498  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
7499  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
7500
7501  SDValue SetCC =
7502    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
7503                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
7504
7505  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
7506  return Sum;
7507}
7508
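// LowerCMP_SWAP - Lower an atomic compare-and-swap: CMPXCHG requires the
// expected value in the A register (AL/AX/EAX/RAX by width), so copy it there,
// emit the LCMPXCHG pseudo, and read the previous memory value back from the
// same register.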
7509SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7510  EVT T = Op.getValueType();
7511  DebugLoc dl = Op.getDebugLoc();
7512  unsigned Reg = 0;
7513  unsigned size = 0;
7514  switch(T.getSimpleVT().SimpleTy) {
7515  default:
7516    assert(false && "Invalid value type!");
7517  case MVT::i8:  Reg = X86::AL;  size = 1; break;
7518  case MVT::i16: Reg = X86::AX;  size = 2; break;
7519  case MVT::i32: Reg = X86::EAX; size = 4; break;
7520  case MVT::i64:
7521    assert(Subtarget->is64Bit() && "Node not type legal!");
7522    Reg = X86::RAX; size = 8;
7523    break;
7524  }
7525  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
7526                                    Op.getOperand(2), SDValue());
7527  SDValue Ops[] = { cpIn.getValue(0),
7528                    Op.getOperand(1),
7529                    Op.getOperand(3),
7530                    DAG.getTargetConstant(size, MVT::i8),
7531                    cpIn.getValue(1) };
7532  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7533  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
7534  SDValue cpOut =
7535    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
7536  return cpOut;
7537}
7538
7539SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
7540                                                 SelectionDAG &DAG) const {
7541  assert(Subtarget->is64Bit() && "Result not type legalized?");
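  // RDTSC returns the counter in EDX:EAX (implicitly zero-extended to RDX/RAX
  // in 64-bit mode), so reassemble the i64 result as (RDX << 32) | RAX.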
7542  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7543  SDValue TheChain = Op.getOperand(0);
7544  DebugLoc dl = Op.getDebugLoc();
7545  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7546  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
7547  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
7548                                   rax.getValue(2));
7549  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
7550                            DAG.getConstant(32, MVT::i8));
7551  SDValue Ops[] = {
7552    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
7553    rdx.getValue(1)
7554  };
7555  return DAG.getMergeValues(Ops, 2, dl);
7556}
7557
7558SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op,
7559                                            SelectionDAG &DAG) const {
7560  EVT SrcVT = Op.getOperand(0).getValueType();
7561  EVT DstVT = Op.getValueType();
7562  assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
7563          Subtarget->hasMMX() && !DisableMMX) &&
7564         "Unexpected custom BIT_CONVERT");
7565  assert((DstVT == MVT::i64 ||
7566          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
7567         "Unexpected custom BIT_CONVERT");
7568  // i64 <=> MMX conversions are Legal.
7569  if (SrcVT==MVT::i64 && DstVT.isVector())
7570    return Op;
7571  if (DstVT==MVT::i64 && SrcVT.isVector())
7572    return Op;
7573  // MMX <=> MMX conversions are Legal.
7574  if (SrcVT.isVector() && DstVT.isVector())
7575    return Op;
7576  // All other conversions need to be expanded.
7577  return SDValue();
7578}
7579SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
7580  SDNode *Node = Op.getNode();
7581  DebugLoc dl = Node->getDebugLoc();
7582  EVT T = Node->getValueType(0);
7583  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
7584                              DAG.getConstant(0, T), Node->getOperand(2));
7585  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
7586                       cast<AtomicSDNode>(Node)->getMemoryVT(),
7587                       Node->getOperand(0),
7588                       Node->getOperand(1), negOp,
7589                       cast<AtomicSDNode>(Node)->getSrcValue(),
7590                       cast<AtomicSDNode>(Node)->getAlignment());
7591}
7592
7593/// LowerOperation - Provide custom lowering hooks for some operations.
7594///
7595SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7596  switch (Op.getOpcode()) {
7597  default: llvm_unreachable("Should not custom lower this!");
7598  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
7599  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
7600  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
7601  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
7602  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
7603  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7604  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
7605  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
7606  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
7607  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
7608  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
7609  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
7610  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
7611  case ISD::SHL_PARTS:
7612  case ISD::SRA_PARTS:
7613  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
7614  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
7615  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
7616  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
7617  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
7618  case ISD::FABS:               return LowerFABS(Op, DAG);
7619  case ISD::FNEG:               return LowerFNEG(Op, DAG);
7620  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
7621  case ISD::SETCC:              return LowerSETCC(Op, DAG);
7622  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
7623  case ISD::SELECT:             return LowerSELECT(Op, DAG);
7624  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
7625  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
7626  case ISD::VASTART:            return LowerVASTART(Op, DAG);
7627  case ISD::VAARG:              return LowerVAARG(Op, DAG);
7628  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
7629  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7630  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
7631  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
7632  case ISD::FRAME_TO_ARGS_OFFSET:
7633                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
7634  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
7635  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
7636  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
7637  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
7638  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
7639  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
7640  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
7641  case ISD::SADDO:
7642  case ISD::UADDO:
7643  case ISD::SSUBO:
7644  case ISD::USUBO:
7645  case ISD::SMULO:
7646  case ISD::UMULO:              return LowerXALUO(Op, DAG);
7647  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
7648  case ISD::BIT_CONVERT:        return LowerBIT_CONVERT(Op, DAG);
7649  }
7650}
7651
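// ReplaceATOMIC_BINARY_64 - Expand a 64-bit atomic read-modify-write on a
// 32-bit target: split the i64 operand into lo/hi halves, emit the
// corresponding cmpxchg8b-based ATOM*64_DAG pseudo, and reassemble the i64
// result with a BUILD_PAIR.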
7652void X86TargetLowering::
7653ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
7654                        SelectionDAG &DAG, unsigned NewOp) const {
7655  EVT T = Node->getValueType(0);
7656  DebugLoc dl = Node->getDebugLoc();
7657  assert (T == MVT::i64 && "Only know how to expand i64 atomics");
7658
7659  SDValue Chain = Node->getOperand(0);
7660  SDValue In1 = Node->getOperand(1);
7661  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7662                             Node->getOperand(2), DAG.getIntPtrConstant(0));
7663  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
7664                             Node->getOperand(2), DAG.getIntPtrConstant(1));
7665  SDValue Ops[] = { Chain, In1, In2L, In2H };
7666  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
7667  SDValue Result =
7668    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
7669                            cast<MemSDNode>(Node)->getMemOperand());
7670  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
7671  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7672  Results.push_back(Result.getValue(2));
7673}
7674
7675/// ReplaceNodeResults - Replace a node with an illegal result type
7676/// with a new node built out of custom code.
7677void X86TargetLowering::ReplaceNodeResults(SDNode *N,
7678                                           SmallVectorImpl<SDValue>&Results,
7679                                           SelectionDAG &DAG) const {
7680  DebugLoc dl = N->getDebugLoc();
7681  switch (N->getOpcode()) {
7682  default:
7683    assert(false && "Do not know how to custom type legalize this operation!");
7684    return;
7685  case ISD::FP_TO_SINT: {
7686    std::pair<SDValue,SDValue> Vals =
7687        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
7688    SDValue FIST = Vals.first, StackSlot = Vals.second;
7689    if (FIST.getNode() != 0) {
7690      EVT VT = N->getValueType(0);
7691      // Return a load from the stack slot.
7692      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
7693                                    false, false, 0));
7694    }
7695    return;
7696  }
7697  case ISD::READCYCLECOUNTER: {
7698    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7699    SDValue TheChain = N->getOperand(0);
7700    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7701    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
7702                                     rd.getValue(1));
7703    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
7704                                     eax.getValue(2));
7705    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
7706    SDValue Ops[] = { eax, edx };
7707    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
7708    Results.push_back(edx.getValue(1));
7709    return;
7710  }
7711  case ISD::ATOMIC_CMP_SWAP: {
7712    EVT T = N->getValueType(0);
7713    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
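    // CMPXCHG8B takes the expected value in EDX:EAX and the new value in
    // ECX:EBX, so split both i64 operands into halves and pin them to those
    // registers before emitting the pseudo.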
7714    SDValue cpInL, cpInH;
7715    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7716                        DAG.getConstant(0, MVT::i32));
7717    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
7718                        DAG.getConstant(1, MVT::i32));
7719    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
7720    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
7721                             cpInL.getValue(1));
7722    SDValue swapInL, swapInH;
7723    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7724                          DAG.getConstant(0, MVT::i32));
7725    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
7726                          DAG.getConstant(1, MVT::i32));
7727    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
7728                               cpInH.getValue(1));
7729    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
7730                               swapInL.getValue(1));
7731    SDValue Ops[] = { swapInH.getValue(0),
7732                      N->getOperand(1),
7733                      swapInH.getValue(1) };
7734    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7735    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
7736    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
7737                                        MVT::i32, Result.getValue(1));
7738    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
7739                                        MVT::i32, cpOutL.getValue(2));
7740    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
7741    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
7742    Results.push_back(cpOutH.getValue(1));
7743    return;
7744  }
7745  case ISD::ATOMIC_LOAD_ADD:
7746    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
7747    return;
7748  case ISD::ATOMIC_LOAD_AND:
7749    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
7750    return;
7751  case ISD::ATOMIC_LOAD_NAND:
7752    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
7753    return;
7754  case ISD::ATOMIC_LOAD_OR:
7755    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
7756    return;
7757  case ISD::ATOMIC_LOAD_SUB:
7758    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
7759    return;
7760  case ISD::ATOMIC_LOAD_XOR:
7761    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
7762    return;
7763  case ISD::ATOMIC_SWAP:
7764    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
7765    return;
7766  }
7767}
7768
7769const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
7770  switch (Opcode) {
7771  default: return NULL;
7772  case X86ISD::BSF:                return "X86ISD::BSF";
7773  case X86ISD::BSR:                return "X86ISD::BSR";
7774  case X86ISD::SHLD:               return "X86ISD::SHLD";
7775  case X86ISD::SHRD:               return "X86ISD::SHRD";
7776  case X86ISD::FAND:               return "X86ISD::FAND";
7777  case X86ISD::FOR:                return "X86ISD::FOR";
7778  case X86ISD::FXOR:               return "X86ISD::FXOR";
7779  case X86ISD::FSRL:               return "X86ISD::FSRL";
7780  case X86ISD::FILD:               return "X86ISD::FILD";
7781  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
7782  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
7783  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
7784  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
7785  case X86ISD::FLD:                return "X86ISD::FLD";
7786  case X86ISD::FST:                return "X86ISD::FST";
7787  case X86ISD::CALL:               return "X86ISD::CALL";
7788  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
7789  case X86ISD::BT:                 return "X86ISD::BT";
7790  case X86ISD::CMP:                return "X86ISD::CMP";
7791  case X86ISD::COMI:               return "X86ISD::COMI";
7792  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
7793  case X86ISD::SETCC:              return "X86ISD::SETCC";
7794  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
7795  case X86ISD::CMOV:               return "X86ISD::CMOV";
7796  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
7797  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
7798  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
7799  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
7800  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
7801  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
7802  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
7803  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
7804  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
7805  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
7806  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
7807  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
7808  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
7809  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
7810  case X86ISD::FMAX:               return "X86ISD::FMAX";
7811  case X86ISD::FMIN:               return "X86ISD::FMIN";
7812  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
7813  case X86ISD::FRCP:               return "X86ISD::FRCP";
7814  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
7815  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
7816  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
7817  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
7818  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
7819  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
7820  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
7821  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
7822  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
7823  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
7824  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
7825  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
7826  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
7827  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
7828  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
7829  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
7830  case X86ISD::VSHL:               return "X86ISD::VSHL";
7831  case X86ISD::VSRL:               return "X86ISD::VSRL";
7832  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
7833  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
7834  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
7835  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
7836  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
7837  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
7838  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
7839  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
7840  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
7841  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
7842  case X86ISD::ADD:                return "X86ISD::ADD";
7843  case X86ISD::SUB:                return "X86ISD::SUB";
7844  case X86ISD::SMUL:               return "X86ISD::SMUL";
7845  case X86ISD::UMUL:               return "X86ISD::UMUL";
7846  case X86ISD::INC:                return "X86ISD::INC";
7847  case X86ISD::DEC:                return "X86ISD::DEC";
7848  case X86ISD::OR:                 return "X86ISD::OR";
7849  case X86ISD::XOR:                return "X86ISD::XOR";
7850  case X86ISD::AND:                return "X86ISD::AND";
7851  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
7852  case X86ISD::PTEST:              return "X86ISD::PTEST";
7853  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
7854  case X86ISD::MINGW_ALLOCA:       return "X86ISD::MINGW_ALLOCA";
7855  }
7856}
7857
7858// isLegalAddressingMode - Return true if the addressing mode represented
7859// by AM is legal for this target, for a load/store of the specified type.
7860bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
7861                                              const Type *Ty) const {
7862  // X86 supports extremely general addressing modes.
7863  CodeModel::Model M = getTargetMachine().getCodeModel();
7864
7865  // X86 allows a sign-extended 32-bit immediate field as a displacement.
7866  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
7867    return false;
7868
7869  if (AM.BaseGV) {
7870    unsigned GVFlags =
7871      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
7872
7873    // If a reference to this global requires an extra load, we can't fold it.
7874    if (isGlobalStubReference(GVFlags))
7875      return false;
7876
7877    // If BaseGV requires a register for the PIC base, we cannot also have a
7878    // BaseReg specified.
7879    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
7880      return false;
7881
7882    // If lower 4G is not available, then we must use rip-relative addressing.
7883    if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
7884      return false;
7885  }
7886
7887  switch (AM.Scale) {
7888  case 0:
7889  case 1:
7890  case 2:
7891  case 4:
7892  case 8:
7893    // These scales always work.
7894    break;
7895  case 3:
7896  case 5:
7897  case 9:
7898    // These scales are formed with basereg+scalereg.  Only accept if there is
7899    // no basereg yet.
7900    if (AM.HasBaseReg)
7901      return false;
7902    break;
7903  default:  // Other stuff never works.
7904    return false;
7905  }
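  // Putting the checks above together, the accepted modes are roughly of the
  // form base + scale*index + disp32 (+ global), with scale in {1,2,4,8},
  // plus the scale-{3,5,9} forms that fold into base+scale*index when no
  // other base register is present.  For example, [%rbx + 4*%rcx + 8] is
  // accepted, while a scale of 16 is rejected.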
7906
7907  return true;
7908}
7909
7910
7911bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
7912  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
7913    return false;
7914  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
7915  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
7916  if (NumBits1 <= NumBits2)
7917    return false;
7918  return true;
7919}
7920
7921bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
7922  if (!VT1.isInteger() || !VT2.isInteger())
7923    return false;
7924  unsigned NumBits1 = VT1.getSizeInBits();
7925  unsigned NumBits2 = VT2.getSizeInBits();
7926  if (NumBits1 <= NumBits2)
7927    return false;
7928  return true;
7929}
7930
7931bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
7932  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7933  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
7934}
7935
7936bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
7937  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
7938  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
7939}
7940
7941bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
7942  // i16 instructions are longer (0x66 prefix) and potentially slower.
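  // For example, "addw $1, %ax" needs a 0x66 operand-size prefix that the
  // 32-bit form "addl $1, %eax" does not, and writing a 16-bit register can
  // cause partial-register stalls on some microarchitectures.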
7943  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
7944}
7945
7946/// isShuffleMaskLegal - Targets can use this to indicate that they only
7947/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
7948/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
7949/// are assumed to be legal.
7950bool
7951X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
7952                                      EVT VT) const {
7953  // Very little shuffling can be done for 64-bit vectors right now.
7954  if (VT.getSizeInBits() == 64)
7955    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
7956
7957  // FIXME: pshufb, blends, shifts.
7958  return (VT.getVectorNumElements() == 2 ||
7959          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7960          isMOVLMask(M, VT) ||
7961          isSHUFPMask(M, VT) ||
7962          isPSHUFDMask(M, VT) ||
7963          isPSHUFHWMask(M, VT) ||
7964          isPSHUFLWMask(M, VT) ||
7965          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
7966          isUNPCKLMask(M, VT) ||
7967          isUNPCKHMask(M, VT) ||
7968          isUNPCKL_v_undef_Mask(M, VT) ||
7969          isUNPCKH_v_undef_Mask(M, VT));
7970}
7971
7972bool
7973X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7974                                          EVT VT) const {
7975  unsigned NumElts = VT.getVectorNumElements();
7976  // FIXME: This collection of masks seems suspect.
7977  if (NumElts == 2)
7978    return true;
7979  if (NumElts == 4 && VT.getSizeInBits() == 128) {
7980    return (isMOVLMask(Mask, VT)  ||
7981            isCommutedMOVLMask(Mask, VT, true) ||
7982            isSHUFPMask(Mask, VT) ||
7983            isCommutedSHUFPMask(Mask, VT));
7984  }
7985  return false;
7986}
7987
7988//===----------------------------------------------------------------------===//
7989//                           X86 Scheduler Hooks
7990//===----------------------------------------------------------------------===//
7991
7992// private utility function
7993MachineBasicBlock *
7994X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7995                                                       MachineBasicBlock *MBB,
7996                                                       unsigned regOpc,
7997                                                       unsigned immOpc,
7998                                                       unsigned LoadOpc,
7999                                                       unsigned CXchgOpc,
8000                                                       unsigned copyOpc,
8001                                                       unsigned notOpc,
8002                                                       unsigned EAXreg,
8003                                                       TargetRegisterClass *RC,
8004                                                       bool invSrc) const {
8005  // For the atomic bitwise operator, we generate
8006  //   thisMBB:
8007  //   newMBB:
8008  //     ld  t1 = [bitinstr.addr]
8009  //     op  t2 = t1, [bitinstr.val]
8010  //     mov EAX = t1
8011  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8012  //     bz  newMBB
8013  //     fallthrough -->nextMBB
8014  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8015  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8016  MachineFunction::iterator MBBIter = MBB;
8017  ++MBBIter;
8018
8019  // First, build the CFG.
8020  MachineFunction *F = MBB->getParent();
8021  MachineBasicBlock *thisMBB = MBB;
8022  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8023  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8024  F->insert(MBBIter, newMBB);
8025  F->insert(MBBIter, nextMBB);
8026
8027  // Move all successors of thisMBB to nextMBB
8028  nextMBB->transferSuccessors(thisMBB);
8029
8030  // Update thisMBB to fall through to newMBB
8031  thisMBB->addSuccessor(newMBB);
8032
8033  // newMBB jumps to itself and falls through to nextMBB
8034  newMBB->addSuccessor(nextMBB);
8035  newMBB->addSuccessor(newMBB);
8036
8037  // Insert instructions into newMBB based on incoming instruction
8038  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8039         "unexpected number of operands");
8040  DebugLoc dl = bInstr->getDebugLoc();
8041  MachineOperand& destOper = bInstr->getOperand(0);
8042  MachineOperand* argOpers[2 + X86AddrNumOperands];
8043  int numArgs = bInstr->getNumOperands() - 1;
8044  for (int i=0; i < numArgs; ++i)
8045    argOpers[i] = &bInstr->getOperand(i+1);
8046
8047  // x86 address has 5 operands: base, index, scale, displacement, and segment.
8048  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8049  int valArgIndx = lastAddrIndx + 1;
8050
8051  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8052  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
8053  for (int i=0; i <= lastAddrIndx; ++i)
8054    (*MIB).addOperand(*argOpers[i]);
8055
8056  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
8057  if (invSrc) {
8058    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
8059  }
8060  else
8061    tt = t1;
8062
8063  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8064  assert((argOpers[valArgIndx]->isReg() ||
8065          argOpers[valArgIndx]->isImm()) &&
8066         "invalid operand");
8067  if (argOpers[valArgIndx]->isReg())
8068    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
8069  else
8070    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
8071  MIB.addReg(tt);
8072  (*MIB).addOperand(*argOpers[valArgIndx]);
8073
8074  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
8075  MIB.addReg(t1);
8076
8077  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
8078  for (int i=0; i <= lastAddrIndx; ++i)
8079    (*MIB).addOperand(*argOpers[i]);
8080  MIB.addReg(t2);
8081  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperands");
8082  (*MIB).setMemRefs(bInstr->memoperands_begin(),
8083                    bInstr->memoperands_end());
8084
8085  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
8086  MIB.addReg(EAXreg);
8087
8088  // insert branch
8089  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8090
8091  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
8092  return nextMBB;
8093}
8094
8095  // Private utility function: 64-bit atomics on a 32-bit host.
8096MachineBasicBlock *
8097X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
8098                                                       MachineBasicBlock *MBB,
8099                                                       unsigned regOpcL,
8100                                                       unsigned regOpcH,
8101                                                       unsigned immOpcL,
8102                                                       unsigned immOpcH,
8103                                                       bool invSrc) const {
8104  // For the atomic bitwise operator, we generate
8105  //   thisMBB (instructions are in pairs, except cmpxchg8b)
8106  //     ld t1,t2 = [bitinstr.addr]
8107  //   newMBB:
8108  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
8109  //     op  t5, t6 <- out1, out2, [bitinstr.val]
8110  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
8111  //     mov ECX, EBX <- t5, t6
8112  //     mov EAX, EDX <- t1, t2
8113  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
8114  //     mov t3, t4 <- EAX, EDX
8115  //     bz  newMBB
8116  //     result in out1, out2
8117  //     fallthrough -->nextMBB
8118
8119  const TargetRegisterClass *RC = X86::GR32RegisterClass;
8120  const unsigned LoadOpc = X86::MOV32rm;
8121  const unsigned copyOpc = X86::MOV32rr;
8122  const unsigned NotOpc = X86::NOT32r;
8123  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8124  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8125  MachineFunction::iterator MBBIter = MBB;
8126  ++MBBIter;
8127
8128  // First, build the CFG.
8129  MachineFunction *F = MBB->getParent();
8130  MachineBasicBlock *thisMBB = MBB;
8131  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8132  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8133  F->insert(MBBIter, newMBB);
8134  F->insert(MBBIter, nextMBB);
8135
8136  // Move all successors of thisMBB to nextMBB
8137  nextMBB->transferSuccessors(thisMBB);
8138
8139  // Update thisMBB to fall through to newMBB
8140  thisMBB->addSuccessor(newMBB);
8141
8142  // newMBB jumps to itself and falls through to nextMBB
8143  newMBB->addSuccessor(nextMBB);
8144  newMBB->addSuccessor(newMBB);
8145
8146  DebugLoc dl = bInstr->getDebugLoc();
8147  // Insert instructions into newMBB based on incoming instruction
8148  // There are 9 "real" operands plus 9 implicit def/uses, ignored here.
8149  assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
8150         "unexpected number of operands");
8151  MachineOperand& dest1Oper = bInstr->getOperand(0);
8152  MachineOperand& dest2Oper = bInstr->getOperand(1);
8153  MachineOperand* argOpers[2 + X86AddrNumOperands];
8154  for (int i=0; i < 2 + X86AddrNumOperands; ++i) {
8155    argOpers[i] = &bInstr->getOperand(i+2);
8156
8157    // We use some of the operands multiple times, so conservatively just
8158    // clear any kill flags that might be present.
8159    if (argOpers[i]->isReg() && argOpers[i]->isUse())
8160      argOpers[i]->setIsKill(false);
8161  }
8162
8163  // x86 address has 5 operands: base, index, scale, displacement, and segment.
8164  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8165
8166  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8167  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
8168  for (int i=0; i <= lastAddrIndx; ++i)
8169    (*MIB).addOperand(*argOpers[i]);
8170  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8171  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
8172  // add 4 to displacement.
8173  for (int i=0; i <= lastAddrIndx-2; ++i)
8174    (*MIB).addOperand(*argOpers[i]);
8175  MachineOperand newOp3 = *(argOpers[3]);
8176  if (newOp3.isImm())
8177    newOp3.setImm(newOp3.getImm()+4);
8178  else
8179    newOp3.setOffset(newOp3.getOffset()+4);
8180  (*MIB).addOperand(newOp3);
8181  (*MIB).addOperand(*argOpers[lastAddrIndx]);
8182
8183  // t3/4 are defined later, at the bottom of the loop
8184  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
8185  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
8186  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
8187    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
8188  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
8189    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
8190
8191  // The subsequent operations should use the destination registers of
8192  // the PHI instructions.
8193  if (invSrc) {
8194    t1 = F->getRegInfo().createVirtualRegister(RC);
8195    t2 = F->getRegInfo().createVirtualRegister(RC);
8196    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
8197    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
8198  } else {
8199    t1 = dest1Oper.getReg();
8200    t2 = dest2Oper.getReg();
8201  }
8202
8203  int valArgIndx = lastAddrIndx + 1;
8204  assert((argOpers[valArgIndx]->isReg() ||
8205          argOpers[valArgIndx]->isImm()) &&
8206         "invalid operand");
8207  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
8208  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
8209  if (argOpers[valArgIndx]->isReg())
8210    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
8211  else
8212    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
8213  if (regOpcL != X86::MOV32rr)
8214    MIB.addReg(t1);
8215  (*MIB).addOperand(*argOpers[valArgIndx]);
8216  assert(argOpers[valArgIndx + 1]->isReg() ==
8217         argOpers[valArgIndx]->isReg());
8218  assert(argOpers[valArgIndx + 1]->isImm() ==
8219         argOpers[valArgIndx]->isImm());
8220  if (argOpers[valArgIndx + 1]->isReg())
8221    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
8222  else
8223    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
8224  if (regOpcH != X86::MOV32rr)
8225    MIB.addReg(t2);
8226  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
8227
8228  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
8229  MIB.addReg(t1);
8230  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
8231  MIB.addReg(t2);
8232
8233  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
8234  MIB.addReg(t5);
8235  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
8236  MIB.addReg(t6);
8237
8238  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
8239  for (int i=0; i <= lastAddrIndx; ++i)
8240    (*MIB).addOperand(*argOpers[i]);
8241
8242  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperands");
8243  (*MIB).setMemRefs(bInstr->memoperands_begin(),
8244                    bInstr->memoperands_end());
8245
8246  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
8247  MIB.addReg(X86::EAX);
8248  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
8249  MIB.addReg(X86::EDX);
8250
8251  // insert branch
8252  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8253
8254  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
8255  return nextMBB;
8256}
8257
8258// private utility function
8259MachineBasicBlock *
8260X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
8261                                                      MachineBasicBlock *MBB,
8262                                                      unsigned cmovOpc) const {
8263  // For the atomic min/max operator, we generate
8264  //   thisMBB:
8265  //   newMBB:
8266  //     ld t1 = [min/max.addr]
8267  //     mov t2 = [min/max.val]
8268  //     cmp  t1, t2
8269  //     cmov[cond] t2 = t1
8270  //     mov EAX = t1
8271  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8272  //     bz   newMBB
8273  //     fallthrough -->nextMBB
8274  //
8275  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8276  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8277  MachineFunction::iterator MBBIter = MBB;
8278  ++MBBIter;
8279
8280  // First, build the CFG.
8281  MachineFunction *F = MBB->getParent();
8282  MachineBasicBlock *thisMBB = MBB;
8283  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8284  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8285  F->insert(MBBIter, newMBB);
8286  F->insert(MBBIter, nextMBB);
8287
8288  // Move all successors of thisMBB to nextMBB
8289  nextMBB->transferSuccessors(thisMBB);
8290
8291  // Update thisMBB to fall through to newMBB
8292  thisMBB->addSuccessor(newMBB);
8293
8294  // newMBB jumps to itself and falls through to nextMBB
8295  newMBB->addSuccessor(nextMBB);
8296  newMBB->addSuccessor(newMBB);
8297
8298  DebugLoc dl = mInstr->getDebugLoc();
8299  // Insert instructions into newMBB based on incoming instruction
8300  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8301         "unexpected number of operands");
8302  MachineOperand& destOper = mInstr->getOperand(0);
8303  MachineOperand* argOpers[2 + X86AddrNumOperands];
8304  int numArgs = mInstr->getNumOperands() - 1;
8305  for (int i=0; i < numArgs; ++i)
8306    argOpers[i] = &mInstr->getOperand(i+1);
8307
8308  // x86 address has 5 operands: base, index, scale, displacement, and segment.
8309  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8310  int valArgIndx = lastAddrIndx + 1;
8311
8312  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8313  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
8314  for (int i=0; i <= lastAddrIndx; ++i)
8315    (*MIB).addOperand(*argOpers[i]);
8316
8317  // We only support register and immediate values
8318  assert((argOpers[valArgIndx]->isReg() ||
8319          argOpers[valArgIndx]->isImm()) &&
8320         "invalid operand");
8321
8322  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8323  if (argOpers[valArgIndx]->isReg())
8324    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8325  else
8326    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
8327  (*MIB).addOperand(*argOpers[valArgIndx]);
8328
8329  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
8330  MIB.addReg(t1);
8331
8332  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
8333  MIB.addReg(t1);
8334  MIB.addReg(t2);
8335
8336  // Generate movc
8337  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8338  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
8339  MIB.addReg(t2);
8340  MIB.addReg(t1);
8341
8342  // Cmp and exchange if none has modified the memory location
8343  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
8344  for (int i=0; i <= lastAddrIndx; ++i)
8345    (*MIB).addOperand(*argOpers[i]);
8346  MIB.addReg(t3);
8347  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperands");
8348  (*MIB).setMemRefs(mInstr->memoperands_begin(),
8349                    mInstr->memoperands_end());
8350
8351  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
8352  MIB.addReg(X86::EAX);
8353
8354  // insert branch
8355  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8356
8357  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
8358  return nextMBB;
8359}
8360
8361// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
8362// all of this code can be replaced with that in the .td file.
8363MachineBasicBlock *
8364X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
8365                            unsigned numArgs, bool memArg) const {
8366
8367  MachineFunction *F = BB->getParent();
8368  DebugLoc dl = MI->getDebugLoc();
8369  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8370
8371  unsigned Opc;
8372  if (memArg)
8373    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
8374  else
8375    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
8376
8377  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
8378
8379  for (unsigned i = 0; i < numArgs; ++i) {
8380    MachineOperand &Op = MI->getOperand(i+1);
8381
8382    if (!(Op.isReg() && Op.isImplicit()))
8383      MIB.addOperand(Op);
8384  }
8385
8386  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
8387    .addReg(X86::XMM0);
8388
8389  F->DeleteMachineInstr(MI);
8390
8391  return BB;
8392}
8393
8394MachineBasicBlock *
8395X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
8396                                                 MachineInstr *MI,
8397                                                 MachineBasicBlock *MBB) const {
8398  // Emit code to save XMM registers to the stack. The ABI says that the
8399  // number of registers to save is given in %al, so it's theoretically
8400  // possible to do an indirect jump trick to avoid saving all of them,
8401  // however this code takes a simpler approach and just executes all
8402  // of the stores if %al is non-zero. It's less code, and it's probably
8403  // easier on the hardware branch predictor, and stores aren't all that
8404  // expensive anyway.
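  // The generated code is roughly (register names illustrative):
  //
  //     testb %al, %al              ; test and branch omitted on Win64
  //     je    endMBB
  //   xmmSaveMBB:
  //     movaps %xmm0, <frame slot + 0>
  //     movaps %xmm1, <frame slot + 16>
  //     ...                         ; one store per vararg XMM argument register
  //   endMBB: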
8405
8406  // Create the new basic blocks. One block contains all the XMM stores,
8407  // and one block is the final destination regardless of whether any
8408  // stores were performed.
8409  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8410  MachineFunction *F = MBB->getParent();
8411  MachineFunction::iterator MBBIter = MBB;
8412  ++MBBIter;
8413  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
8414  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
8415  F->insert(MBBIter, XMMSaveMBB);
8416  F->insert(MBBIter, EndMBB);
8417
8418  // Set up the CFG.
8419  // Move any original successors of MBB to the end block.
8420  EndMBB->transferSuccessors(MBB);
8421  // The original block will now fall through to the XMM save block.
8422  MBB->addSuccessor(XMMSaveMBB);
8423  // The XMMSaveMBB will fall through to the end block.
8424  XMMSaveMBB->addSuccessor(EndMBB);
8425
8426  // Now add the instructions.
8427  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8428  DebugLoc DL = MI->getDebugLoc();
8429
8430  unsigned CountReg = MI->getOperand(0).getReg();
8431  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
8432  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
8433
8434  if (!Subtarget->isTargetWin64()) {
8435    // If %al is 0, branch around the XMM save block.
8436    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
8437    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
8438    MBB->addSuccessor(EndMBB);
8439  }
8440
8441  // In the XMM save block, save all the XMM argument registers.
8442  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
8443    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
8444    MachineMemOperand *MMO =
8445      F->getMachineMemOperand(
8446        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
8447        MachineMemOperand::MOStore, Offset,
8448        /*Size=*/16, /*Align=*/16);
8449    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
8450      .addFrameIndex(RegSaveFrameIndex)
8451      .addImm(/*Scale=*/1)
8452      .addReg(/*IndexReg=*/0)
8453      .addImm(/*Disp=*/Offset)
8454      .addReg(/*Segment=*/0)
8455      .addReg(MI->getOperand(i).getReg())
8456      .addMemOperand(MMO);
8457  }
8458
8459  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8460
8461  return EndMBB;
8462}
8463
8464MachineBasicBlock *
8465X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
8466                                     MachineBasicBlock *BB) const {
8467  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8468  DebugLoc DL = MI->getDebugLoc();
8469
8470  // To "insert" a SELECT_CC instruction, we actually have to insert the
8471  // diamond control-flow pattern.  The incoming instruction knows the
8472  // destination vreg to set, the condition code register to branch on, the
8473  // true/false values to select between, and a branch opcode to use.
8474  const BasicBlock *LLVM_BB = BB->getBasicBlock();
8475  MachineFunction::iterator It = BB;
8476  ++It;
8477
8478  //  thisMBB:
8479  //  ...
8480  //   TrueVal = ...
8481  //   cmpTY ccX, r1, r2
8482  //   bCC copy1MBB
8483  //   fallthrough --> copy0MBB
8484  MachineBasicBlock *thisMBB = BB;
8485  MachineFunction *F = BB->getParent();
8486  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
8487  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
8488  unsigned Opc =
8489    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
8490
8491  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
8492  F->insert(It, copy0MBB);
8493  F->insert(It, sinkMBB);
8494
8495  // Update machine-CFG edges by first adding all successors of the current
8496  // block to the new block which will contain the Phi node for the select.
8497  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
8498         E = BB->succ_end(); I != E; ++I)
8499    sinkMBB->addSuccessor(*I);
8500
8501  // Next, remove all successors of the current block, and add the true
8502  // and fallthrough blocks as its successors.
8503  while (!BB->succ_empty())
8504    BB->removeSuccessor(BB->succ_begin());
8505
8506  // Add the true and fallthrough blocks as its successors.
8507  BB->addSuccessor(copy0MBB);
8508  BB->addSuccessor(sinkMBB);
8509
8510  // If the EFLAGS register isn't dead in the terminator, then claim that it's
8511  // live into the sink and copy blocks.
8512  const MachineFunction *MF = BB->getParent();
8513  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
8514  BitVector ReservedRegs = TRI->getReservedRegs(*MF);
8515  const MachineInstr *Term = BB->getFirstTerminator();
8516
8517  for (unsigned I = 0, E = Term->getNumOperands(); I != E; ++I) {
8518    const MachineOperand &MO = Term->getOperand(I);
8519    if (!MO.isReg() || MO.isKill() || MO.isDead()) continue;
8520    unsigned Reg = MO.getReg();
8521    if (Reg != X86::EFLAGS) continue;
8522    copy0MBB->addLiveIn(Reg);
8523    sinkMBB->addLiveIn(Reg);
8524  }
8525
8526  //  copy0MBB:
8527  //   %FalseValue = ...
8528  //   # fallthrough to sinkMBB
8529  copy0MBB->addSuccessor(sinkMBB);
8530
8531  //  sinkMBB:
8532  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
8533  //  ...
8534  BuildMI(sinkMBB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
8535    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
8536    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
8537
8538  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8539  return sinkMBB;
8540}
8541
8542MachineBasicBlock *
8543X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
8544                                          MachineBasicBlock *BB) const {
8545  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8546  DebugLoc DL = MI->getDebugLoc();
8547  MachineFunction *F = BB->getParent();
8548
8549  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
8550  // non-trivial part is impdef of ESP.
8551  // FIXME: The code should be tweaked as soon as we'll try to do codegen for
8552  // mingw-w64.
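  // By MinGW convention the requested size is passed in EAX and _alloca
  // updates ESP itself, which is why both registers appear as implicit
  // uses/defs on the call below.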
8553
8554  BuildMI(BB, DL, TII->get(X86::CALLpcrel32))
8555    .addExternalSymbol("_alloca")
8556    .addReg(X86::EAX, RegState::Implicit)
8557    .addReg(X86::ESP, RegState::Implicit)
8558    .addReg(X86::EAX, RegState::Define | RegState::Implicit)
8559    .addReg(X86::ESP, RegState::Define | RegState::Implicit);
8560
8561  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8562  return BB;
8563}
8564
8565MachineBasicBlock *
8566X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
8567                                      MachineBasicBlock *BB) const {
8568  // This is pretty easy.  We're taking the value that we received from
8569  // our load from the relocation, sticking it in either RDI (x86-64)
8570  // or EAX and doing an indirect call.  The return value will then
8571  // be in the normal return register.
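  // For the 64-bit case this comes out as something like
  //   movq  _var@TLVP(%rip), %rdi
  //   callq *(%rdi)
  // with the analogous EAX-based sequence (absolute or PIC-base relative)
  // used on 32-bit targets.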
8572  const X86InstrInfo *TII
8573    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
8574  DebugLoc DL = MI->getDebugLoc();
8575  MachineFunction *F = BB->getParent();
8576
8577  assert(MI->getOperand(3).isGlobal() && "This should be a global");
8578
8579  if (Subtarget->is64Bit()) {
8580    MachineInstrBuilder MIB = BuildMI(BB, DL, TII->get(X86::MOV64rm), X86::RDI)
8581    .addReg(X86::RIP)
8582    .addImm(0).addReg(0)
8583    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
8584                      MI->getOperand(3).getTargetFlags())
8585    .addReg(0);
8586    MIB = BuildMI(BB, DL, TII->get(X86::CALL64m));
8587    addDirectMem(MIB, X86::RDI).addReg(0);
8588  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
8589    MachineInstrBuilder MIB = BuildMI(BB, DL, TII->get(X86::MOV32rm), X86::EAX)
8590    .addReg(0)
8591    .addImm(0).addReg(0)
8592    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
8593                      MI->getOperand(3).getTargetFlags())
8594    .addReg(0);
8595    MIB = BuildMI(BB, DL, TII->get(X86::CALL32m));
8596    addDirectMem(MIB, X86::EAX).addReg(0);
8597  } else {
8598    MachineInstrBuilder MIB = BuildMI(BB, DL, TII->get(X86::MOV32rm), X86::EAX)
8599    .addReg(TII->getGlobalBaseReg(F))
8600    .addImm(0).addReg(0)
8601    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
8602                      MI->getOperand(3).getTargetFlags())
8603    .addReg(0);
8604    MIB = BuildMI(BB, DL, TII->get(X86::CALL32m));
8605    addDirectMem(MIB, X86::EAX).addReg(0);
8606  }
8607
8608  F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.
8609  return BB;
8610}
8611
8612MachineBasicBlock *
8613X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
8614                                               MachineBasicBlock *BB) const {
8615  switch (MI->getOpcode()) {
8616  default: assert(false && "Unexpected instr type to insert");
8617  case X86::MINGW_ALLOCA:
8618    return EmitLoweredMingwAlloca(MI, BB);
8619  case X86::TLSCall_32:
8620  case X86::TLSCall_64:
8621    return EmitLoweredTLSCall(MI, BB);
8622  case X86::CMOV_GR8:
8623  case X86::CMOV_V1I64:
8624  case X86::CMOV_FR32:
8625  case X86::CMOV_FR64:
8626  case X86::CMOV_V4F32:
8627  case X86::CMOV_V2F64:
8628  case X86::CMOV_V2I64:
8629  case X86::CMOV_GR16:
8630  case X86::CMOV_GR32:
8631  case X86::CMOV_RFP32:
8632  case X86::CMOV_RFP64:
8633  case X86::CMOV_RFP80:
8634    return EmitLoweredSelect(MI, BB);
8635
8636  case X86::FP32_TO_INT16_IN_MEM:
8637  case X86::FP32_TO_INT32_IN_MEM:
8638  case X86::FP32_TO_INT64_IN_MEM:
8639  case X86::FP64_TO_INT16_IN_MEM:
8640  case X86::FP64_TO_INT32_IN_MEM:
8641  case X86::FP64_TO_INT64_IN_MEM:
8642  case X86::FP80_TO_INT16_IN_MEM:
8643  case X86::FP80_TO_INT32_IN_MEM:
8644  case X86::FP80_TO_INT64_IN_MEM: {
8645    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8646    DebugLoc DL = MI->getDebugLoc();
8647
8648    // Change the floating point control register to use "round towards zero"
8649    // mode when truncating to an integer value.
8650    MachineFunction *F = BB->getParent();
8651    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
8652    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
8653
8654    // Load the old value of the high byte of the control word...
8655    unsigned OldCW =
8656      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
8657    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
8658                      CWFrameIdx);
8659
8660    // Set the high part to be round to zero...
8661    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
8662      .addImm(0xC7F);
8663
8664    // Reload the modified control word now...
8665    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8666
8667    // Restore the memory image of control word to original value
8668    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
8669      .addReg(OldCW);
8670
8671    // Get the X86 opcode to use.
8672    unsigned Opc;
8673    switch (MI->getOpcode()) {
8674    default: llvm_unreachable("illegal opcode!");
8675    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
8676    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
8677    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
8678    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
8679    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
8680    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
8681    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
8682    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
8683    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
8684    }
8685
8686    X86AddressMode AM;
8687    MachineOperand &Op = MI->getOperand(0);
8688    if (Op.isReg()) {
8689      AM.BaseType = X86AddressMode::RegBase;
8690      AM.Base.Reg = Op.getReg();
8691    } else {
8692      AM.BaseType = X86AddressMode::FrameIndexBase;
8693      AM.Base.FrameIndex = Op.getIndex();
8694    }
8695    Op = MI->getOperand(1);
8696    if (Op.isImm())
8697      AM.Scale = Op.getImm();
8698    Op = MI->getOperand(2);
8699    if (Op.isImm())
8700      AM.IndexReg = Op.getImm();
8701    Op = MI->getOperand(3);
8702    if (Op.isGlobal()) {
8703      AM.GV = Op.getGlobal();
8704    } else {
8705      AM.Disp = Op.getImm();
8706    }
8707    addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
8708                      .addReg(MI->getOperand(X86AddrNumOperands).getReg());
8709
8710    // Reload the original control word now.
8711    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8712
8713    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8714    return BB;
8715  }
8716    // String/text processing lowering.
8717  case X86::PCMPISTRM128REG:
8718    return EmitPCMP(MI, BB, 3, /* memArg */ false);
8719  case X86::PCMPISTRM128MEM:
8720    return EmitPCMP(MI, BB, 3, /* memArg */ true);
8721  case X86::PCMPESTRM128REG:
8722    return EmitPCMP(MI, BB, 5, /* memArg */ false);
8723  case X86::PCMPESTRM128MEM:
8724    return EmitPCMP(MI, BB, 5, /* memArg */ true);
8725
8726    // Atomic Lowering.
8727  case X86::ATOMAND32:
8728    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8729                                               X86::AND32ri, X86::MOV32rm,
8730                                               X86::LCMPXCHG32, X86::MOV32rr,
8731                                               X86::NOT32r, X86::EAX,
8732                                               X86::GR32RegisterClass);
8733  case X86::ATOMOR32:
8734    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
8735                                               X86::OR32ri, X86::MOV32rm,
8736                                               X86::LCMPXCHG32, X86::MOV32rr,
8737                                               X86::NOT32r, X86::EAX,
8738                                               X86::GR32RegisterClass);
8739  case X86::ATOMXOR32:
8740    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
8741                                               X86::XOR32ri, X86::MOV32rm,
8742                                               X86::LCMPXCHG32, X86::MOV32rr,
8743                                               X86::NOT32r, X86::EAX,
8744                                               X86::GR32RegisterClass);
8745  case X86::ATOMNAND32:
8746    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
8747                                               X86::AND32ri, X86::MOV32rm,
8748                                               X86::LCMPXCHG32, X86::MOV32rr,
8749                                               X86::NOT32r, X86::EAX,
8750                                               X86::GR32RegisterClass, true);
8751  case X86::ATOMMIN32:
8752    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
8753  case X86::ATOMMAX32:
8754    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
8755  case X86::ATOMUMIN32:
8756    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
8757  case X86::ATOMUMAX32:
8758    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
8759
8760  case X86::ATOMAND16:
8761    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8762                                               X86::AND16ri, X86::MOV16rm,
8763                                               X86::LCMPXCHG16, X86::MOV16rr,
8764                                               X86::NOT16r, X86::AX,
8765                                               X86::GR16RegisterClass);
8766  case X86::ATOMOR16:
8767    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
8768                                               X86::OR16ri, X86::MOV16rm,
8769                                               X86::LCMPXCHG16, X86::MOV16rr,
8770                                               X86::NOT16r, X86::AX,
8771                                               X86::GR16RegisterClass);
8772  case X86::ATOMXOR16:
8773    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
8774                                               X86::XOR16ri, X86::MOV16rm,
8775                                               X86::LCMPXCHG16, X86::MOV16rr,
8776                                               X86::NOT16r, X86::AX,
8777                                               X86::GR16RegisterClass);
8778  case X86::ATOMNAND16:
8779    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
8780                                               X86::AND16ri, X86::MOV16rm,
8781                                               X86::LCMPXCHG16, X86::MOV16rr,
8782                                               X86::NOT16r, X86::AX,
8783                                               X86::GR16RegisterClass, true);
8784  case X86::ATOMMIN16:
8785    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
8786  case X86::ATOMMAX16:
8787    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
8788  case X86::ATOMUMIN16:
8789    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
8790  case X86::ATOMUMAX16:
8791    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
8792
8793  case X86::ATOMAND8:
8794    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8795                                               X86::AND8ri, X86::MOV8rm,
8796                                               X86::LCMPXCHG8, X86::MOV8rr,
8797                                               X86::NOT8r, X86::AL,
8798                                               X86::GR8RegisterClass);
8799  case X86::ATOMOR8:
8800    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
8801                                               X86::OR8ri, X86::MOV8rm,
8802                                               X86::LCMPXCHG8, X86::MOV8rr,
8803                                               X86::NOT8r, X86::AL,
8804                                               X86::GR8RegisterClass);
8805  case X86::ATOMXOR8:
8806    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
8807                                               X86::XOR8ri, X86::MOV8rm,
8808                                               X86::LCMPXCHG8, X86::MOV8rr,
8809                                               X86::NOT8r, X86::AL,
8810                                               X86::GR8RegisterClass);
8811  case X86::ATOMNAND8:
8812    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
8813                                               X86::AND8ri, X86::MOV8rm,
8814                                               X86::LCMPXCHG8, X86::MOV8rr,
8815                                               X86::NOT8r, X86::AL,
8816                                               X86::GR8RegisterClass, true);
8817  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
8818  // This group is for 64-bit host.
8819  case X86::ATOMAND64:
8820    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8821                                               X86::AND64ri32, X86::MOV64rm,
8822                                               X86::LCMPXCHG64, X86::MOV64rr,
8823                                               X86::NOT64r, X86::RAX,
8824                                               X86::GR64RegisterClass);
8825  case X86::ATOMOR64:
8826    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
8827                                               X86::OR64ri32, X86::MOV64rm,
8828                                               X86::LCMPXCHG64, X86::MOV64rr,
8829                                               X86::NOT64r, X86::RAX,
8830                                               X86::GR64RegisterClass);
8831  case X86::ATOMXOR64:
8832    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
8833                                               X86::XOR64ri32, X86::MOV64rm,
8834                                               X86::LCMPXCHG64, X86::MOV64rr,
8835                                               X86::NOT64r, X86::RAX,
8836                                               X86::GR64RegisterClass);
8837  case X86::ATOMNAND64:
8838    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
8839                                               X86::AND64ri32, X86::MOV64rm,
8840                                               X86::LCMPXCHG64, X86::MOV64rr,
8841                                               X86::NOT64r, X86::RAX,
8842                                               X86::GR64RegisterClass, true);
8843  case X86::ATOMMIN64:
8844    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
8845  case X86::ATOMMAX64:
8846    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
8847  case X86::ATOMUMIN64:
8848    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
8849  case X86::ATOMUMAX64:
8850    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
8851
8852  // This group does 64-bit operations on a 32-bit host.
8853  case X86::ATOMAND6432:
8854    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8855                                               X86::AND32rr, X86::AND32rr,
8856                                               X86::AND32ri, X86::AND32ri,
8857                                               false);
8858  case X86::ATOMOR6432:
8859    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8860                                               X86::OR32rr, X86::OR32rr,
8861                                               X86::OR32ri, X86::OR32ri,
8862                                               false);
8863  case X86::ATOMXOR6432:
8864    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8865                                               X86::XOR32rr, X86::XOR32rr,
8866                                               X86::XOR32ri, X86::XOR32ri,
8867                                               false);
8868  case X86::ATOMNAND6432:
8869    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8870                                               X86::AND32rr, X86::AND32rr,
8871                                               X86::AND32ri, X86::AND32ri,
8872                                               true);
8873  case X86::ATOMADD6432:
8874    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8875                                               X86::ADD32rr, X86::ADC32rr,
8876                                               X86::ADD32ri, X86::ADC32ri,
8877                                               false);
8878  case X86::ATOMSUB6432:
8879    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8880                                               X86::SUB32rr, X86::SBB32rr,
8881                                               X86::SUB32ri, X86::SBB32ri,
8882                                               false);
8883  case X86::ATOMSWAP6432:
8884    return EmitAtomicBit6432WithCustomInserter(MI, BB,
8885                                               X86::MOV32rr, X86::MOV32rr,
8886                                               X86::MOV32ri, X86::MOV32ri,
8887                                               false);
8888  case X86::VASTART_SAVE_XMM_REGS:
8889    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
8890  }
8891}
8892
8893//===----------------------------------------------------------------------===//
8894//                           X86 Optimization Hooks
8895//===----------------------------------------------------------------------===//
8896
8897void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
8898                                                       const APInt &Mask,
8899                                                       APInt &KnownZero,
8900                                                       APInt &KnownOne,
8901                                                       const SelectionDAG &DAG,
8902                                                       unsigned Depth) const {
8903  unsigned Opc = Op.getOpcode();
8904  assert((Opc >= ISD::BUILTIN_OP_END ||
8905          Opc == ISD::INTRINSIC_WO_CHAIN ||
8906          Opc == ISD::INTRINSIC_W_CHAIN ||
8907          Opc == ISD::INTRINSIC_VOID) &&
8908         "Should use MaskedValueIsZero if you don't know whether Op"
8909         " is a target node!");
8910
8911  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
8912  switch (Opc) {
8913  default: break;
8914  case X86ISD::ADD:
8915  case X86ISD::SUB:
8916  case X86ISD::SMUL:
8917  case X86ISD::UMUL:
8918  case X86ISD::INC:
8919  case X86ISD::DEC:
8920  case X86ISD::OR:
8921  case X86ISD::XOR:
8922  case X86ISD::AND:
8923    // These nodes' second result is a boolean.
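    // That boolean is materialized as 0 or 1, so (like X86ISD::SETCC below)
    // every bit above bit 0 is known to be zero.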
8924    if (Op.getResNo() == 0)
8925      break;
8926    // Fallthrough
8927  case X86ISD::SETCC:
8928    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
8929                                       Mask.getBitWidth() - 1);
8930    break;
8931  }
8932}
8933
8934/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
8935/// node is a GlobalAddress + offset.
8936bool X86TargetLowering::isGAPlusOffset(SDNode *N,
8937                                       const GlobalValue* &GA,
8938                                       int64_t &Offset) const {
8939  if (N->getOpcode() == X86ISD::Wrapper) {
8940    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
8941      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
8942      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
8943      return true;
8944    }
8945  }
8946  return TargetLowering::isGAPlusOffset(N, GA, Offset);
8947}
8948
8949/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
8950/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
8951/// if the load addresses are consecutive, non-overlapping, and in the right
8952/// order.
8953static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
8954                                     const TargetLowering &TLI) {
8955  DebugLoc dl = N->getDebugLoc();
8956  EVT VT = N->getValueType(0);
8957  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
8958
8959  if (VT.getSizeInBits() != 128)
8960    return SDValue();
8961
8962  SmallVector<SDValue, 16> Elts;
8963  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
8964    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
8965
8966  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
8967}
8968
8969  /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
8970  /// generation and convert it from a bunch of shuffles and extracts into a
8971/// store and scalar loads to extract the elements.
8972static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
8973                                                const TargetLowering &TLI) {
8974  SDValue InputVector = N->getOperand(0);
8975
8976  // Only operate on vectors of 4 elements, where the alternative shuffling
8977  // gets to be more expensive.
8978  if (InputVector.getValueType() != MVT::v4i32)
8979    return SDValue();
8980
8981  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
8982  // single use which is a sign-extend or zero-extend, and all elements are
8983  // used.
8984  SmallVector<SDNode *, 4> Uses;
8985  unsigned ExtractedElements = 0;
8986  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
8987       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
8988    if (UI.getUse().getResNo() != InputVector.getResNo())
8989      return SDValue();
8990
8991    SDNode *Extract = *UI;
8992    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8993      return SDValue();
8994
8995    if (Extract->getValueType(0) != MVT::i32)
8996      return SDValue();
8997    if (!Extract->hasOneUse())
8998      return SDValue();
8999    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
9000        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
9001      return SDValue();
9002    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
9003      return SDValue();
9004
9005    // Record which element was extracted.
9006    ExtractedElements |=
9007      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
9008
9009    Uses.push_back(Extract);
9010  }
9011
9012  // If not all the elements were used, this may not be worthwhile.
9013  if (ExtractedElements != 15)
9014    return SDValue();
9015
9016  // Ok, we've now decided to do the transformation.
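  // The transformation replaces, e.g.,
  //   (sext (extract_vector_elt v4i32:%v, i)) for i = 0..3
  // with a single store of %v to a stack temporary followed by four scalar
  // i32 loads at byte offsets 0, 4, 8 and 12 from that slot.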
9017  DebugLoc dl = InputVector.getDebugLoc();
9018
9019  // Store the value to a temporary stack slot.
9020  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
9021  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0,
9022                            false, false, 0);
9023
9024  // Replace each use (extract) with a load of the appropriate element.
9025  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
9026       UE = Uses.end(); UI != UE; ++UI) {
9027    SDNode *Extract = *UI;
9028
9029    // Compute the element's address.
9030    SDValue Idx = Extract->getOperand(1);
9031    unsigned EltSize =
9032        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
9033    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
9034    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
9035
9036    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr);
9037
9038    // Load the scalar.
9039    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr,
9040                          NULL, 0, false, false, 0);
9041
9042    // Replace the extract with the load.
9043    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
9044  }
9045
9046  // The replacement was made in place; don't return anything.
9047  return SDValue();
9048}
9049
9050/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
9051static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
9052                                    const X86Subtarget *Subtarget) {
9053  DebugLoc DL = N->getDebugLoc();
9054  SDValue Cond = N->getOperand(0);
9055  // Get the LHS/RHS of the select.
9056  SDValue LHS = N->getOperand(1);
9057  SDValue RHS = N->getOperand(2);
9058
9059  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
9060  // instructions match the semantics of the common C idiom x<y?x:y but not
9061  // x<=y?x:y, because of how they handle negative zero (which can be
9062  // ignored in unsafe-math mode).
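  // Concretely, minss/minsd compute "x < y ? x : y" and return the second
  // operand both when an input is NaN and when the operands compare equal
  // (e.g. -0.0 vs +0.0), which is why the cases below are so careful about
  // NaNs and signed zero.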
9063  if (Subtarget->hasSSE2() &&
9064      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
9065      Cond.getOpcode() == ISD::SETCC) {
9066    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
9067
9068    unsigned Opcode = 0;
9069    // Check for x CC y ? x : y.
9070    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
9071        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
9072      switch (CC) {
9073      default: break;
9074      case ISD::SETULT:
9075        // Converting this to a min would handle NaNs incorrectly, and swapping
9076        // the operands would cause it to handle comparisons between positive
9077        // and negative zero incorrectly.
9078        if (!FiniteOnlyFPMath() &&
9079            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
9080          if (!UnsafeFPMath &&
9081              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9082            break;
9083          std::swap(LHS, RHS);
9084        }
9085        Opcode = X86ISD::FMIN;
9086        break;
9087      case ISD::SETOLE:
9088        // Converting this to a min would handle comparisons between positive
9089        // and negative zero incorrectly.
9090        if (!UnsafeFPMath &&
9091            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9092          break;
9093        Opcode = X86ISD::FMIN;
9094        break;
9095      case ISD::SETULE:
9096        // Converting this to a min would handle both negative zeros and NaNs
9097        // incorrectly, but we can swap the operands to fix both.
9098        std::swap(LHS, RHS);
9099      case ISD::SETOLT:
9100      case ISD::SETLT:
9101      case ISD::SETLE:
9102        Opcode = X86ISD::FMIN;
9103        break;
9104
9105      case ISD::SETOGE:
9106        // Converting this to a max would handle comparisons between positive
9107        // and negative zero incorrectly.
9108        if (!UnsafeFPMath &&
9109            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9110          break;
9111        Opcode = X86ISD::FMAX;
9112        break;
9113      case ISD::SETUGT:
9114        // Converting this to a max would handle NaNs incorrectly, and swapping
9115        // the operands would cause it to handle comparisons between positive
9116        // and negative zero incorrectly.
9117        if (!FiniteOnlyFPMath() &&
9118            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
9119          if (!UnsafeFPMath &&
9120              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9121            break;
9122          std::swap(LHS, RHS);
9123        }
9124        Opcode = X86ISD::FMAX;
9125        break;
9126      case ISD::SETUGE:
9127        // Converting this to a max would handle both negative zeros and NaNs
9128        // incorrectly, but we can swap the operands to fix both.
9129        std::swap(LHS, RHS);
9130      case ISD::SETOGT:
9131      case ISD::SETGT:
9132      case ISD::SETGE:
9133        Opcode = X86ISD::FMAX;
9134        break;
9135      }
9136    // Check for x CC y ? y : x -- a min/max with reversed arms.
9137    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
9138               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
9139      switch (CC) {
9140      default: break;
9141      case ISD::SETOGE:
9142        // Converting this to a min would handle comparisons between positive
9143        // and negative zero incorrectly, and swapping the operands would
9144        // cause it to handle NaNs incorrectly.
9145        if (!UnsafeFPMath &&
9146            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
9147          if (!FiniteOnlyFPMath() &&
9148              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9149            break;
9150          std::swap(LHS, RHS);
9151        }
9152        Opcode = X86ISD::FMIN;
9153        break;
9154      case ISD::SETUGT:
9155        // Converting this to a min would handle NaNs incorrectly.
9156        if (!UnsafeFPMath &&
9157            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9158          break;
9159        Opcode = X86ISD::FMIN;
9160        break;
9161      case ISD::SETUGE:
9162        // Converting this to a min would handle both negative zeros and NaNs
9163        // incorrectly, but we can swap the operands to fix both.
9164        std::swap(LHS, RHS);
9165      case ISD::SETOGT:
9166      case ISD::SETGT:
9167      case ISD::SETGE:
9168        Opcode = X86ISD::FMIN;
9169        break;
9170
9171      case ISD::SETULT:
9172        // Converting this to a max would handle NaNs incorrectly.
9173        if (!FiniteOnlyFPMath() &&
9174            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9175          break;
9176        Opcode = X86ISD::FMAX;
9177        break;
9178      case ISD::SETOLE:
9179        // Converting this to a max would handle comparisons between positive
9180        // and negative zero incorrectly, and swapping the operands would
9181        // cause it to handle NaNs incorrectly.
9182        if (!UnsafeFPMath &&
9183            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
9184          if (!FiniteOnlyFPMath() &&
9185              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9186            break;
9187          std::swap(LHS, RHS);
9188        }
9189        Opcode = X86ISD::FMAX;
9190        break;
9191      case ISD::SETULE:
9192        // Converting this to a max would handle both negative zeros and NaNs
9193        // incorrectly, but we can swap the operands to fix both.
9194        std::swap(LHS, RHS);
9195      case ISD::SETOLT:
9196      case ISD::SETLT:
9197      case ISD::SETLE:
9198        Opcode = X86ISD::FMAX;
9199        break;
9200      }
9201    }
9202
9203    if (Opcode)
9204      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
9205  }
9206
9207  // If this is a select between two integer constants, try to do some
9208  // optimizations.
9209  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
9210    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
9211      // Don't do this for crazy integer types.
9212      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
9213        // If this is efficiently invertible, canonicalize the TrueC/FalseC values
9214        // so that TrueC (the true value) is larger than FalseC.
9215        bool NeedsCondInvert = false;
9216
9217        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
9218            // Efficiently invertible.
9219            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
9220             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
9221              isa<ConstantSDNode>(Cond.getOperand(1))))) {
9222          NeedsCondInvert = true;
9223          std::swap(TrueC, FalseC);
9224        }
9225
9226        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
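        // For instance (illustrative), with a condition C and an i32 result,
        // (select C, 8, 0) becomes roughly
        //   (shl (zero_extend C), 3)
        // avoiding a conditional move or branch entirely.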
9227        if (FalseC->getAPIntValue() == 0 &&
9228            TrueC->getAPIntValue().isPowerOf2()) {
9229          if (NeedsCondInvert) // Invert the condition if needed.
9230            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9231                               DAG.getConstant(1, Cond.getValueType()));
9232
9233          // Zero extend the condition if needed.
9234          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
9235
9236          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9237          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
9238                             DAG.getConstant(ShAmt, MVT::i8));
9239        }
9240
9241        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
9242        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9243          if (NeedsCondInvert) // Invert the condition if needed.
9244            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9245                               DAG.getConstant(1, Cond.getValueType()));
9246
9247          // Zero extend the condition if needed.
9248          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9249                             FalseC->getValueType(0), Cond);
9250          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9251                             SDValue(FalseC, 0));
9252        }
9253
9254        // Optimize cases that will turn into an LEA instruction.  This requires
9255        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9256        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9257          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9258          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9259
9260          bool isFastMultiplier = false;
9261          if (Diff < 10) {
9262            switch ((unsigned char)Diff) {
9263              default: break;
9264              case 1:  // result = add base, cond
9265              case 2:  // result = lea base(    , cond*2)
9266              case 3:  // result = lea base(cond, cond*2)
9267              case 4:  // result = lea base(    , cond*4)
9268              case 5:  // result = lea base(cond, cond*4)
9269              case 8:  // result = lea base(    , cond*8)
9270              case 9:  // result = lea base(cond, cond*8)
9271                isFastMultiplier = true;
9272                break;
9273            }
9274          }
9275
9276          if (isFastMultiplier) {
9277            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9278            if (NeedsCondInvert) // Invert the condition if needed.
9279              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9280                                 DAG.getConstant(1, Cond.getValueType()));
9281
9282            // Zero extend the condition if needed.
9283            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9284                               Cond);
9285            // Scale the condition by the difference.
9286            if (Diff != 1)
9287              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9288                                 DAG.getConstant(Diff, Cond.getValueType()));
9289
9290            // Add the base if non-zero.
9291            if (FalseC->getAPIntValue() != 0)
9292              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9293                                 SDValue(FalseC, 0));
9294            return Cond;
9295          }
9296        }
9297      }
9298  }
9299
9300  return SDValue();
9301}
9302
9303/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
9304static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
9305                                  TargetLowering::DAGCombinerInfo &DCI) {
9306  DebugLoc DL = N->getDebugLoc();
9307
9308  // If the flag operand isn't dead, don't touch this CMOV.
9309  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
9310    return SDValue();
9311
9312  // If this is a select between two integer constants, try to do some
9313  // optimizations.  Note that the operands are ordered the opposite of SELECT
9314  // operands.
9315  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
9316    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
9317      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
9318      // larger than FalseC (the false value).
9319      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
9320
9321      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
9322        CC = X86::GetOppositeBranchCondition(CC);
9323        std::swap(TrueC, FalseC);
9324      }
9325
9326      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
9327      // This is efficient for any integer data type (including i8/i16) and
9328      // shift amount.
9329      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
9330        SDValue Cond = N->getOperand(3);
9331        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9332                           DAG.getConstant(CC, MVT::i8), Cond);
9333
9334        // Zero extend the condition if needed.
9335        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
9336
9337        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9338        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
9339                           DAG.getConstant(ShAmt, MVT::i8));
9340        if (N->getNumValues() == 2)  // Dead flag value?
9341          return DCI.CombineTo(N, Cond, SDValue());
9342        return Cond;
9343      }
9344
9345      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
9346      // for any integer data type, including i8/i16.
9347      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9348        SDValue Cond = N->getOperand(3);
9349        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9350                           DAG.getConstant(CC, MVT::i8), Cond);
9351
9352        // Zero extend the condition if needed.
9353        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9354                           FalseC->getValueType(0), Cond);
9355        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9356                           SDValue(FalseC, 0));
9357
9358        if (N->getNumValues() == 2)  // Dead flag value?
9359          return DCI.CombineTo(N, Cond, SDValue());
9360        return Cond;
9361      }
9362
9363      // Optimize cases that will turn into an LEA instruction.  This requires
9364      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9365      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9366        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9367        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9368
9369        bool isFastMultiplier = false;
9370        if (Diff < 10) {
9371          switch ((unsigned char)Diff) {
9372          default: break;
9373          case 1:  // result = add base, cond
9374          case 2:  // result = lea base(    , cond*2)
9375          case 3:  // result = lea base(cond, cond*2)
9376          case 4:  // result = lea base(    , cond*4)
9377          case 5:  // result = lea base(cond, cond*4)
9378          case 8:  // result = lea base(    , cond*8)
9379          case 9:  // result = lea base(cond, cond*8)
9380            isFastMultiplier = true;
9381            break;
9382          }
9383        }
9384
9385        if (isFastMultiplier) {
9386          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9387          SDValue Cond = N->getOperand(3);
9388          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9389                             DAG.getConstant(CC, MVT::i8), Cond);
9390          // Zero extend the condition if needed.
9391          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9392                             Cond);
9393          // Scale the condition by the difference.
9394          if (Diff != 1)
9395            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9396                               DAG.getConstant(Diff, Cond.getValueType()));
9397
9398          // Add the base if non-zero.
9399          if (FalseC->getAPIntValue() != 0)
9400            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9401                               SDValue(FalseC, 0));
9402          if (N->getNumValues() == 2)  // Dead flag value?
9403            return DCI.CombineTo(N, Cond, SDValue());
9404          return Cond;
9405        }
9406      }
9407    }
9408  }
9409  return SDValue();
9410}
9411
9412
9413/// PerformMulCombine - Optimize a single multiply by a constant into two
9414/// multiplies in order to implement it with two cheaper instructions, e.g.
9415/// LEA + SHL, LEA + LEA.
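/// For example (a sketch of the intent, not an exhaustive list):
///   x * 45  ->  (x * 9) * 5        // two LEAs
///   x * 40  ->  (x << 3) * 5       // SHL + LEA
/// where multiplies by 3, 5 and 9 map onto LEA's scaled-index addressing.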
9416static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
9417                                 TargetLowering::DAGCombinerInfo &DCI) {
9418  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9419    return SDValue();
9420
9421  EVT VT = N->getValueType(0);
9422  if (VT != MVT::i64)
9423    return SDValue();
9424
9425  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9426  if (!C)
9427    return SDValue();
9428  uint64_t MulAmt = C->getZExtValue();
9429  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
9430    return SDValue();
9431
9432  uint64_t MulAmt1 = 0;
9433  uint64_t MulAmt2 = 0;
9434  if ((MulAmt % 9) == 0) {
9435    MulAmt1 = 9;
9436    MulAmt2 = MulAmt / 9;
9437  } else if ((MulAmt % 5) == 0) {
9438    MulAmt1 = 5;
9439    MulAmt2 = MulAmt / 5;
9440  } else if ((MulAmt % 3) == 0) {
9441    MulAmt1 = 3;
9442    MulAmt2 = MulAmt / 3;
9443  }
9444  if (MulAmt2 &&
9445      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
9446    DebugLoc DL = N->getDebugLoc();
9447
9448    if (isPowerOf2_64(MulAmt2) &&
9449        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
9450      // If second multiplier is pow2, issue it first. We want the multiply by
9451      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
9452      // is an add.
9453      std::swap(MulAmt1, MulAmt2);
9454
9455    SDValue NewMul;
9456    if (isPowerOf2_64(MulAmt1))
9457      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9458                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9459    else
9460      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9461                           DAG.getConstant(MulAmt1, VT));
9462
9463    if (isPowerOf2_64(MulAmt2))
9464      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9465                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9466    else
9467      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9468                           DAG.getConstant(MulAmt2, VT));
9469
9470    // Do not add new nodes to DAG combiner worklist.
9471    DCI.CombineTo(N, NewMul, false);
9472  }
9473  return SDValue();
9474}
9475
9476static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9477  SDValue N0 = N->getOperand(0);
9478  SDValue N1 = N->getOperand(1);
9479  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9480  EVT VT = N0.getValueType();
9481
9482  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9483  // since the result of setcc_c is all zeros or all ones.
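  // For example (illustrative), with a SETCC_CARRY input, which is known to
  // be all zeros or all ones:
  //   (shl (and (setcc_carry), 1), 3)  -->  (and (setcc_carry), 8)
  // i.e. the AND mask is simply pre-shifted.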
9484  if (N1C && N0.getOpcode() == ISD::AND &&
9485      N0.getOperand(1).getOpcode() == ISD::Constant) {
9486    SDValue N00 = N0.getOperand(0);
9487    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9488        ((N00.getOpcode() == ISD::ANY_EXTEND ||
9489          N00.getOpcode() == ISD::ZERO_EXTEND) &&
9490         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9491      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9492      APInt ShAmt = N1C->getAPIntValue();
9493      Mask = Mask.shl(ShAmt);
9494      if (Mask != 0)
9495        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9496                           N00, DAG.getConstant(Mask, VT));
9497    }
9498  }
9499
9500  return SDValue();
9501}
9502
9503/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9504///                       when possible.
9505static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9506                                   const X86Subtarget *Subtarget) {
9507  EVT VT = N->getValueType(0);
9508  if (!VT.isVector() && VT.isInteger() &&
9509      N->getOpcode() == ISD::SHL)
9510    return PerformSHLCombine(N, DAG);
9511
9512  // On X86 with SSE2 support, we can transform this to a vector shift if
9513  // all elements are shifted by the same amount.  We can't do this in legalize
9514  // because a constant vector is typically transformed to a constant pool
9515  // so we have no knowledge of the shift amount.
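  // For example (a sketch), a uniform shift such as
  //   (shl v4i32 %x, <i32 5, i32 5, i32 5, i32 5>)
  // is rewritten below into the x86_sse2_pslli_d intrinsic with a scalar
  // shift amount of 5 instead of being scalarized per element.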
9516  if (!Subtarget->hasSSE2())
9517    return SDValue();
9518
9519  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
9520    return SDValue();
9521
9522  SDValue ShAmtOp = N->getOperand(1);
9523  EVT EltVT = VT.getVectorElementType();
9524  DebugLoc DL = N->getDebugLoc();
9525  SDValue BaseShAmt = SDValue();
9526  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
9527    unsigned NumElts = VT.getVectorNumElements();
9528    unsigned i = 0;
9529    for (; i != NumElts; ++i) {
9530      SDValue Arg = ShAmtOp.getOperand(i);
9531      if (Arg.getOpcode() == ISD::UNDEF) continue;
9532      BaseShAmt = Arg;
9533      break;
9534    }
9535    for (; i != NumElts; ++i) {
9536      SDValue Arg = ShAmtOp.getOperand(i);
9537      if (Arg.getOpcode() == ISD::UNDEF) continue;
9538      if (Arg != BaseShAmt) {
9539        return SDValue();
9540      }
9541    }
9542  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
9543             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
9544    SDValue InVec = ShAmtOp.getOperand(0);
9545    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
9546      unsigned NumElts = InVec.getValueType().getVectorNumElements();
9547      unsigned i = 0;
9548      for (; i != NumElts; ++i) {
9549        SDValue Arg = InVec.getOperand(i);
9550        if (Arg.getOpcode() == ISD::UNDEF) continue;
9551        BaseShAmt = Arg;
9552        break;
9553      }
9554    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
9555       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
9556         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
9557         if (C->getZExtValue() == SplatIdx)
9558           BaseShAmt = InVec.getOperand(1);
9559       }
9560    }
9561    if (BaseShAmt.getNode() == 0)
9562      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
9563                              DAG.getIntPtrConstant(0));
9564  } else
9565    return SDValue();
9566
9567  // The shift amount is an i32.
9568  if (EltVT.bitsGT(MVT::i32))
9569    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
9570  else if (EltVT.bitsLT(MVT::i32))
9571    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
9572
9573  // The shift amount is identical so we can do a vector shift.
9574  SDValue ValOp = N->getOperand(0);
9575  switch (N->getOpcode()) {
9576  default:
9577    llvm_unreachable("Unknown shift opcode!");
9578    break;
9579  case ISD::SHL:
9580    if (VT == MVT::v2i64)
9581      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9582                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9583                         ValOp, BaseShAmt);
9584    if (VT == MVT::v4i32)
9585      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9586                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
9587                         ValOp, BaseShAmt);
9588    if (VT == MVT::v8i16)
9589      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9590                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
9591                         ValOp, BaseShAmt);
9592    break;
9593  case ISD::SRA:
9594    if (VT == MVT::v4i32)
9595      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9596                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
9597                         ValOp, BaseShAmt);
9598    if (VT == MVT::v8i16)
9599      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9600                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
9601                         ValOp, BaseShAmt);
9602    break;
9603  case ISD::SRL:
9604    if (VT == MVT::v2i64)
9605      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9606                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9607                         ValOp, BaseShAmt);
9608    if (VT == MVT::v4i32)
9609      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9610                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
9611                         ValOp, BaseShAmt);
9612    if (VT == MVT::v8i16)
9613      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9614                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
9615                         ValOp, BaseShAmt);
9616    break;
9617  }
9618  return SDValue();
9619}
9620
9621static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
9622                                TargetLowering::DAGCombinerInfo &DCI,
9623                                const X86Subtarget *Subtarget) {
9624  if (DCI.isBeforeLegalizeOps())
9625    return SDValue();
9626
9627  EVT VT = N->getValueType(0);
9628  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
9629    return SDValue();
9630
9631  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
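  // For example (illustrative), for i32 values x and y and an i8 amount c:
  //   (or (shl x, c), (srl y, (sub 32, c)))  -->  (shld x, y, c)
  // and the mirrored pattern is matched as SHRD instead.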
9632  SDValue N0 = N->getOperand(0);
9633  SDValue N1 = N->getOperand(1);
9634  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
9635    std::swap(N0, N1);
9636  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
9637    return SDValue();
9638  if (!N0.hasOneUse() || !N1.hasOneUse())
9639    return SDValue();
9640
9641  SDValue ShAmt0 = N0.getOperand(1);
9642  if (ShAmt0.getValueType() != MVT::i8)
9643    return SDValue();
9644  SDValue ShAmt1 = N1.getOperand(1);
9645  if (ShAmt1.getValueType() != MVT::i8)
9646    return SDValue();
9647  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
9648    ShAmt0 = ShAmt0.getOperand(0);
9649  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
9650    ShAmt1 = ShAmt1.getOperand(0);
9651
9652  DebugLoc DL = N->getDebugLoc();
9653  unsigned Opc = X86ISD::SHLD;
9654  SDValue Op0 = N0.getOperand(0);
9655  SDValue Op1 = N1.getOperand(0);
9656  if (ShAmt0.getOpcode() == ISD::SUB) {
9657    Opc = X86ISD::SHRD;
9658    std::swap(Op0, Op1);
9659    std::swap(ShAmt0, ShAmt1);
9660  }
9661
9662  unsigned Bits = VT.getSizeInBits();
9663  if (ShAmt1.getOpcode() == ISD::SUB) {
9664    SDValue Sum = ShAmt1.getOperand(0);
9665    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
9666      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
9667      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
9668        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
9669      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
9670        return DAG.getNode(Opc, DL, VT,
9671                           Op0, Op1,
9672                           DAG.getNode(ISD::TRUNCATE, DL,
9673                                       MVT::i8, ShAmt0));
9674    }
9675  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
9676    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
9677    if (ShAmt0C &&
9678        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
9679      return DAG.getNode(Opc, DL, VT,
9680                         N0.getOperand(0), N1.getOperand(0),
9681                         DAG.getNode(ISD::TRUNCATE, DL,
9682                                       MVT::i8, ShAmt0));
9683  }
9684
9685  return SDValue();
9686}
9687
9688/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
9689static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
9690                                   const X86Subtarget *Subtarget) {
9691  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
9692  // the FP state in cases where an emms may be missing.
9693  // A preferable solution to the general problem is to figure out the right
9694  // places to insert EMMS.  This qualifies as a quick hack.
9695
9696  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
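  // For example (a sketch): copying an i64 value through memory as a load
  // followed by a store can be done with a single movsd load/store pair on a
  // 32-bit target with SSE2 (or a single 64-bit integer load/store on
  // x86-64) instead of two 32-bit loads and stores.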
9697  StoreSDNode *St = cast<StoreSDNode>(N);
9698  EVT VT = St->getValue().getValueType();
9699  if (VT.getSizeInBits() != 64)
9700    return SDValue();
9701
9702  const Function *F = DAG.getMachineFunction().getFunction();
9703  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
9704  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
9705    && Subtarget->hasSSE2();
9706  if ((VT.isVector() ||
9707       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
9708      isa<LoadSDNode>(St->getValue()) &&
9709      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
9710      St->getChain().hasOneUse() && !St->isVolatile()) {
9711    SDNode* LdVal = St->getValue().getNode();
9712    LoadSDNode *Ld = 0;
9713    int TokenFactorIndex = -1;
9714    SmallVector<SDValue, 8> Ops;
9715    SDNode* ChainVal = St->getChain().getNode();
9716    // Must be a store of a load.  We currently handle two cases:  the load
9717    // is a direct child, or it's under an intervening TokenFactor.  It is
9718    // possible to dig deeper under nested TokenFactors.
9719    if (ChainVal == LdVal)
9720      Ld = cast<LoadSDNode>(St->getChain());
9721    else if (St->getValue().hasOneUse() &&
9722             ChainVal->getOpcode() == ISD::TokenFactor) {
9723      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
9724        if (ChainVal->getOperand(i).getNode() == LdVal) {
9725          TokenFactorIndex = i;
9726          Ld = cast<LoadSDNode>(St->getValue());
9727        } else
9728          Ops.push_back(ChainVal->getOperand(i));
9729      }
9730    }
9731
9732    if (!Ld || !ISD::isNormalLoad(Ld))
9733      return SDValue();
9734
9735    // If this is not the MMX case, i.e. we are just turning i64 load/store
9736    // into f64 load/store, avoid the transformation if there are multiple
9737    // uses of the loaded value.
9738    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
9739      return SDValue();
9740
9741    DebugLoc LdDL = Ld->getDebugLoc();
9742    DebugLoc StDL = N->getDebugLoc();
9743    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
9744    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
9745    // pair instead.
9746    if (Subtarget->is64Bit() || F64IsLegal) {
9747      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
9748      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
9749                                  Ld->getBasePtr(), Ld->getSrcValue(),
9750                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
9751                                  Ld->isNonTemporal(), Ld->getAlignment());
9752      SDValue NewChain = NewLd.getValue(1);
9753      if (TokenFactorIndex != -1) {
9754        Ops.push_back(NewChain);
9755        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9756                               Ops.size());
9757      }
9758      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
9759                          St->getSrcValue(), St->getSrcValueOffset(),
9760                          St->isVolatile(), St->isNonTemporal(),
9761                          St->getAlignment());
9762    }
9763
9764    // Otherwise, lower to two pairs of 32-bit loads / stores.
9765    SDValue LoAddr = Ld->getBasePtr();
9766    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
9767                                 DAG.getConstant(4, MVT::i32));
9768
9769    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
9770                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
9771                               Ld->isVolatile(), Ld->isNonTemporal(),
9772                               Ld->getAlignment());
9773    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
9774                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
9775                               Ld->isVolatile(), Ld->isNonTemporal(),
9776                               MinAlign(Ld->getAlignment(), 4));
9777
9778    SDValue NewChain = LoLd.getValue(1);
9779    if (TokenFactorIndex != -1) {
9780      Ops.push_back(LoLd);
9781      Ops.push_back(HiLd);
9782      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
9783                             Ops.size());
9784    }
9785
9786    LoAddr = St->getBasePtr();
9787    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
9788                         DAG.getConstant(4, MVT::i32));
9789
9790    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
9791                                St->getSrcValue(), St->getSrcValueOffset(),
9792                                St->isVolatile(), St->isNonTemporal(),
9793                                St->getAlignment());
9794    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
9795                                St->getSrcValue(),
9796                                St->getSrcValueOffset() + 4,
9797                                St->isVolatile(),
9798                                St->isNonTemporal(),
9799                                MinAlign(St->getAlignment(), 4));
9800    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
9801  }
9802  return SDValue();
9803}
9804
9805/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
9806/// X86ISD::FXOR nodes.
9807static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
9808  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
9809  // F[X]OR(0.0, x) -> x
9810  // F[X]OR(x, 0.0) -> x
9811  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9812    if (C->getValueAPF().isPosZero())
9813      return N->getOperand(1);
9814  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9815    if (C->getValueAPF().isPosZero())
9816      return N->getOperand(0);
9817  return SDValue();
9818}
9819
9820/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
9821static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
9822  // FAND(0.0, x) -> 0.0
9823  // FAND(x, 0.0) -> 0.0
9824  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
9825    if (C->getValueAPF().isPosZero())
9826      return N->getOperand(0);
9827  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
9828    if (C->getValueAPF().isPosZero())
9829      return N->getOperand(1);
9830  return SDValue();
9831}
9832
9833static SDValue PerformBTCombine(SDNode *N,
9834                                SelectionDAG &DAG,
9835                                TargetLowering::DAGCombinerInfo &DCI) {
9836  // BT ignores high bits in the bit index operand.
9837  SDValue Op1 = N->getOperand(1);
9838  if (Op1.hasOneUse()) {
9839    unsigned BitWidth = Op1.getValueSizeInBits();
9840    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
9841    APInt KnownZero, KnownOne;
9842    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
9843                                          !DCI.isBeforeLegalizeOps());
9844    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9845    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
9846        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
9847      DCI.CommitTargetLoweringOpt(TLO);
9848  }
9849  return SDValue();
9850}
9851
9852static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
9853  SDValue Op = N->getOperand(0);
9854  if (Op.getOpcode() == ISD::BIT_CONVERT)
9855    Op = Op.getOperand(0);
9856  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
9857  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
9858      VT.getVectorElementType().getSizeInBits() ==
9859      OpVT.getVectorElementType().getSizeInBits()) {
9860    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
9861  }
9862  return SDValue();
9863}
9864
9865static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
9866  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
9867  //           (and (i32 x86isd::setcc_carry), 1)
9868  // This eliminates the zext. This transformation is necessary because
9869  // ISD::SETCC is always legalized to i8.
9870  DebugLoc dl = N->getDebugLoc();
9871  SDValue N0 = N->getOperand(0);
9872  EVT VT = N->getValueType(0);
9873  if (N0.getOpcode() == ISD::AND &&
9874      N0.hasOneUse() &&
9875      N0.getOperand(0).hasOneUse()) {
9876    SDValue N00 = N0.getOperand(0);
9877    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
9878      return SDValue();
9879    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9880    if (!C || C->getZExtValue() != 1)
9881      return SDValue();
9882    return DAG.getNode(ISD::AND, dl, VT,
9883                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
9884                                   N00.getOperand(0), N00.getOperand(1)),
9885                       DAG.getConstant(1, VT));
9886  }
9887
9888  return SDValue();
9889}
9890
9891SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
9892                                             DAGCombinerInfo &DCI) const {
9893  SelectionDAG &DAG = DCI.DAG;
9894  switch (N->getOpcode()) {
9895  default: break;
9896  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
9897  case ISD::EXTRACT_VECTOR_ELT:
9898                        return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
9899  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
9900  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
9901  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
9902  case ISD::SHL:
9903  case ISD::SRA:
9904  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
9905  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
9906  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
9907  case X86ISD::FXOR:
9908  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
9909  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
9910  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
9911  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
9912  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
9913  }
9914
9915  return SDValue();
9916}
9917
9918/// isTypeDesirableForOp - Return true if the target has native support for
9919/// the specified value type and it is 'desirable' to use the type for the
9920/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
9921/// instruction encodings are longer and some i16 instructions are slow.
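/// For example, a 16-bit "addw" needs an operand-size (0x66) prefix, so for
/// the opcodes below it is usually better to let the combiner promote the
/// operation to i32.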
9922bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
9923  if (!isTypeLegal(VT))
9924    return false;
9925  if (VT != MVT::i16)
9926    return true;
9927
9928  switch (Opc) {
9929  default:
9930    return true;
9931  case ISD::LOAD:
9932  case ISD::SIGN_EXTEND:
9933  case ISD::ZERO_EXTEND:
9934  case ISD::ANY_EXTEND:
9935  case ISD::SHL:
9936  case ISD::SRL:
9937  case ISD::SUB:
9938  case ISD::ADD:
9939  case ISD::MUL:
9940  case ISD::AND:
9941  case ISD::OR:
9942  case ISD::XOR:
9943    return false;
9944  }
9945}
9946
9947static bool MayFoldLoad(SDValue Op) {
9948  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
9949}
9950
9951static bool MayFoldIntoStore(SDValue Op) {
9952  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
9953}
9954
9955/// IsDesirableToPromoteOp - This method queries the target whether it is
9956/// beneficial for dag combiner to promote the specified node. If true, it
9957/// should return the desired promotion type by reference.
9958bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
9959  EVT VT = Op.getValueType();
9960  if (VT != MVT::i16)
9961    return false;
9962
9963  bool Promote = false;
9964  bool Commute = false;
9965  switch (Op.getOpcode()) {
9966  default: break;
9967  case ISD::LOAD: {
9968    LoadSDNode *LD = cast<LoadSDNode>(Op);
9969    // If the non-extending load has a single use and it's not live out, then it
9970    // might be folded.
9971    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
9972                                                     Op.hasOneUse()*/) {
9973      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9974             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
9975        // The only case where we'd want to promote LOAD (rather than it being
9976        // promoted as an operand) is when its only use is a liveout.
9977        if (UI->getOpcode() != ISD::CopyToReg)
9978          return false;
9979      }
9980    }
9981    Promote = true;
9982    break;
9983  }
9984  case ISD::SIGN_EXTEND:
9985  case ISD::ZERO_EXTEND:
9986  case ISD::ANY_EXTEND:
9987    Promote = true;
9988    break;
9989  case ISD::SHL:
9990  case ISD::SRL: {
9991    SDValue N0 = Op.getOperand(0);
9992    // Look out for (store (shl (load), x)).
9993    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
9994      return false;
9995    Promote = true;
9996    break;
9997  }
9998  case ISD::ADD:
9999  case ISD::MUL:
10000  case ISD::AND:
10001  case ISD::OR:
10002  case ISD::XOR:
10003    Commute = true;
10004    // fallthrough
10005  case ISD::SUB: {
10006    SDValue N0 = Op.getOperand(0);
10007    SDValue N1 = Op.getOperand(1);
10008    if (!Commute && MayFoldLoad(N1))
10009      return false;
10010    // Avoid disabling potential load folding opportunities.
10011    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
10012      return false;
10013    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
10014      return false;
10015    Promote = true;
10016  }
10017  }
10018
10019  PVT = MVT::i32;
10020  return Promote;
10021}
10022
10023//===----------------------------------------------------------------------===//
10024//                           X86 Inline Assembly Support
10025//===----------------------------------------------------------------------===//
10026
10027static bool LowerToBSwap(CallInst *CI) {
10028  // FIXME: this should verify that we are targeting a 486 or better.  If not,
10029  // we will turn this bswap into something that will be lowered to logical ops
10030  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
10031  // so don't worry about this.
10032
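  // For example (illustrative), inline asm of the form
  //   asm("bswap $0" : "=r"(x) : "0"(x))
  // on an i32 value is replaced below by a call to llvm.bswap.i32, letting
  // the normal instruction selection pick the bswap instruction.
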
10033  // Verify this is a simple bswap.
10034  if (CI->getNumOperands() != 2 ||
10035      CI->getType() != CI->getArgOperand(0)->getType() ||
10036      !CI->getType()->isIntegerTy())
10037    return false;
10038
10039  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
10040  if (!Ty || Ty->getBitWidth() % 16 != 0)
10041    return false;
10042
10043  // Okay, we can do this xform, do so now.
10044  const Type *Tys[] = { Ty };
10045  Module *M = CI->getParent()->getParent()->getParent();
10046  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
10047
10048  Value *Op = CI->getArgOperand(0);
10049  Op = CallInst::Create(Int, Op, CI->getName(), CI);
10050
10051  CI->replaceAllUsesWith(Op);
10052  CI->eraseFromParent();
10053  return true;
10054}
10055
10056bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
10057  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
10058  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
10059
10060  std::string AsmStr = IA->getAsmString();
10061
10062  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
10063  SmallVector<StringRef, 4> AsmPieces;
10064  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
10065
10066  switch (AsmPieces.size()) {
10067  default: return false;
10068  case 1:
10069    AsmStr = AsmPieces[0];
10070    AsmPieces.clear();
10071    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
10072
10073    // bswap $0
10074    if (AsmPieces.size() == 2 &&
10075        (AsmPieces[0] == "bswap" ||
10076         AsmPieces[0] == "bswapq" ||
10077         AsmPieces[0] == "bswapl") &&
10078        (AsmPieces[1] == "$0" ||
10079         AsmPieces[1] == "${0:q}")) {
10080      // No need to check constraints, nothing other than the equivalent of
10081      // "=r,0" would be valid here.
10082      return LowerToBSwap(CI);
10083    }
10084    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
10085    if (CI->getType()->isIntegerTy(16) &&
10086        AsmPieces.size() == 3 &&
10087        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
10088        AsmPieces[1] == "$$8," &&
10089        AsmPieces[2] == "${0:w}" &&
10090        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
10091      AsmPieces.clear();
10092      const std::string &Constraints = IA->getConstraintString();
10093      SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
10094      std::sort(AsmPieces.begin(), AsmPieces.end());
10095      if (AsmPieces.size() == 4 &&
10096          AsmPieces[0] == "~{cc}" &&
10097          AsmPieces[1] == "~{dirflag}" &&
10098          AsmPieces[2] == "~{flags}" &&
10099          AsmPieces[3] == "~{fpsr}") {
10100        return LowerToBSwap(CI);
10101      }
10102    }
10103    break;
10104  case 3:
10105    if (CI->getType()->isIntegerTy(64) &&
10106        Constraints.size() >= 2 &&
10107        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
10108        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
10109      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
10110      SmallVector<StringRef, 4> Words;
10111      SplitString(AsmPieces[0], Words, " \t");
10112      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
10113        Words.clear();
10114        SplitString(AsmPieces[1], Words, " \t");
10115        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
10116          Words.clear();
10117          SplitString(AsmPieces[2], Words, " \t,");
10118          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
10119              Words[2] == "%edx") {
10120            return LowerToBSwap(CI);
10121          }
10122        }
10123      }
10124    }
10125    break;
10126  }
10127  return false;
10128}
10129
10130
10131
10132/// getConstraintType - Given a constraint letter, return the type of
10133/// constraint it is for this target.
10134X86TargetLowering::ConstraintType
10135X86TargetLowering::getConstraintType(const std::string &Constraint) const {
10136  if (Constraint.size() == 1) {
10137    switch (Constraint[0]) {
10138    case 'A':
10139      return C_Register;
10140    case 'f':
10141    case 'r':
10142    case 'R':
10143    case 'l':
10144    case 'q':
10145    case 'Q':
10146    case 'x':
10147    case 'y':
10148    case 'Y':
10149      return C_RegisterClass;
10150    case 'e':
10151    case 'Z':
10152      return C_Other;
10153    default:
10154      break;
10155    }
10156  }
10157  return TargetLowering::getConstraintType(Constraint);
10158}
10159
10160/// LowerXConstraint - try to replace an X constraint, which matches anything,
10161/// with another that has more specific requirements based on the type of the
10162/// corresponding operand.
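/// For example, an "X" constraint on a floating-point operand is narrowed to
/// "Y" (an SSE register) when SSE2 is available, or to "x" when only SSE1 is.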
10163const char *X86TargetLowering::
10164LowerXConstraint(EVT ConstraintVT) const {
10165  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
10166  // 'f' like normal targets.
10167  if (ConstraintVT.isFloatingPoint()) {
10168    if (Subtarget->hasSSE2())
10169      return "Y";
10170    if (Subtarget->hasSSE1())
10171      return "x";
10172  }
10173
10174  return TargetLowering::LowerXConstraint(ConstraintVT);
10175}
10176
10177/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
10178/// vector.  If it is invalid, don't add anything to Ops.
10179void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
10180                                                     char Constraint,
10181                                                     std::vector<SDValue>&Ops,
10182                                                     SelectionDAG &DAG) const {
10183  SDValue Result(0, 0);
10184
10185  switch (Constraint) {
10186  default: break;
10187  case 'I':
10188    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10189      if (C->getZExtValue() <= 31) {
10190        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10191        break;
10192      }
10193    }
10194    return;
10195  case 'J':
10196    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10197      if (C->getZExtValue() <= 63) {
10198        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10199        break;
10200      }
10201    }
10202    return;
10203  case 'K':
10204    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10205      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
10206        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10207        break;
10208      }
10209    }
10210    return;
10211  case 'N':
10212    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10213      if (C->getZExtValue() <= 255) {
10214        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10215        break;
10216      }
10217    }
10218    return;
10219  case 'e': {
10220    // 32-bit signed value
10221    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10222      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
10223                                           C->getSExtValue())) {
10224        // Widen to 64 bits here to get it sign extended.
10225        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
10226        break;
10227      }
10228    // FIXME gcc accepts some relocatable values here too, but only in certain
10229    // memory models; it's complicated.
10230    }
10231    return;
10232  }
10233  case 'Z': {
10234    // 32-bit unsigned value
10235    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
10236      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
10237                                           C->getZExtValue())) {
10238        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
10239        break;
10240      }
10241    }
10242    // FIXME gcc accepts some relocatable values here too, but only in certain
10243    // memory models; it's complicated.
10244    return;
10245  }
10246  case 'i': {
10247    // Literal immediates are always ok.
10248    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
10249      // Widen to 64 bits here to get it sign extended.
10250      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
10251      break;
10252    }
10253
10254    // In any sort of PIC mode addresses need to be computed at runtime by
10255    // adding in a register or some sort of table lookup.  These can't
10256    // be used as immediates.
10257    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC() ||
10258        Subtarget->isPICStyleRIPRel())
10259      return;
10260
10261    // If we are in non-pic codegen mode, we allow the address of a global (with
10262    // an optional displacement) to be used with 'i'.
10263    GlobalAddressSDNode *GA = 0;
10264    int64_t Offset = 0;
10265
10266    // Match either (GA), (GA+C), (GA+C1+C2), etc.
10267    while (1) {
10268      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
10269        Offset += GA->getOffset();
10270        break;
10271      } else if (Op.getOpcode() == ISD::ADD) {
10272        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
10273          Offset += C->getZExtValue();
10274          Op = Op.getOperand(0);
10275          continue;
10276        }
10277      } else if (Op.getOpcode() == ISD::SUB) {
10278        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
10279          Offset += -C->getZExtValue();
10280          Op = Op.getOperand(0);
10281          continue;
10282        }
10283      }
10284
10285      // Otherwise, this isn't something we can handle, reject it.
10286      return;
10287    }
10288
10289    const GlobalValue *GV = GA->getGlobal();
10290    // If we require an extra load to get this address, as in PIC mode, we
10291    // can't accept it.
10292    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
10293                                                        getTargetMachine())))
10294      return;
10295
10296    Result = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
10297    break;
10298  }
10299  }
10300
10301  if (Result.getNode()) {
10302    Ops.push_back(Result);
10303    return;
10304  }
10305  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
10306}
10307
10308std::vector<unsigned> X86TargetLowering::
10309getRegClassForInlineAsmConstraint(const std::string &Constraint,
10310                                  EVT VT) const {
10311  if (Constraint.size() == 1) {
10312    // FIXME: not handling fp-stack yet!
10313    switch (Constraint[0]) {      // GCC X86 Constraint Letters
10314    default: break;  // Unknown constraint letter
10315    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
10316      if (Subtarget->is64Bit()) {
10317        if (VT == MVT::i32)
10318          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
10319                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
10320                                       X86::R10D,X86::R11D,X86::R12D,
10321                                       X86::R13D,X86::R14D,X86::R15D,
10322                                       X86::EBP, X86::ESP, 0);
10323        else if (VT == MVT::i16)
10324          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
10325                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
10326                                       X86::R10W,X86::R11W,X86::R12W,
10327                                       X86::R13W,X86::R14W,X86::R15W,
10328                                       X86::BP,  X86::SP, 0);
10329        else if (VT == MVT::i8)
10330          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
10331                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
10332                                       X86::R10B,X86::R11B,X86::R12B,
10333                                       X86::R13B,X86::R14B,X86::R15B,
10334                                       X86::BPL, X86::SPL, 0);
10335
10336        else if (VT == MVT::i64)
10337          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
10338                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
10339                                       X86::R10, X86::R11, X86::R12,
10340                                       X86::R13, X86::R14, X86::R15,
10341                                       X86::RBP, X86::RSP, 0);
10342
10343        break;
10344      }
10345      // 32-bit fallthrough
10346    case 'Q':   // Q_REGS
10347      if (VT == MVT::i32)
10348        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
10349      else if (VT == MVT::i16)
10350        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
10351      else if (VT == MVT::i8)
10352        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
10353      else if (VT == MVT::i64)
10354        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
10355      break;
10356    }
10357  }
10358
10359  return std::vector<unsigned>();
10360}
10361
10362std::pair<unsigned, const TargetRegisterClass*>
10363X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
10364                                                EVT VT) const {
10365  // First, see if this is a constraint that directly corresponds to an LLVM
10366  // register class.
10367  if (Constraint.size() == 1) {
10368    // GCC Constraint Letters
10369    switch (Constraint[0]) {
10370    default: break;
10371    case 'r':   // GENERAL_REGS
10372    case 'l':   // INDEX_REGS
10373      if (VT == MVT::i8)
10374        return std::make_pair(0U, X86::GR8RegisterClass);
10375      if (VT == MVT::i16)
10376        return std::make_pair(0U, X86::GR16RegisterClass);
10377      if (VT == MVT::i32 || !Subtarget->is64Bit())
10378        return std::make_pair(0U, X86::GR32RegisterClass);
10379      return std::make_pair(0U, X86::GR64RegisterClass);
10380    case 'R':   // LEGACY_REGS
10381      if (VT == MVT::i8)
10382        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
10383      if (VT == MVT::i16)
10384        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
10385      if (VT == MVT::i32 || !Subtarget->is64Bit())
10386        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
10387      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
10388    case 'f':  // FP Stack registers.
10389      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
10390      // value to the correct fpstack register class.
10391      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
10392        return std::make_pair(0U, X86::RFP32RegisterClass);
10393      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
10394        return std::make_pair(0U, X86::RFP64RegisterClass);
10395      return std::make_pair(0U, X86::RFP80RegisterClass);
10396    case 'y':   // MMX_REGS if MMX allowed.
10397      if (!Subtarget->hasMMX()) break;
10398      return std::make_pair(0U, X86::VR64RegisterClass);
10399    case 'Y':   // SSE_REGS if SSE2 allowed
10400      if (!Subtarget->hasSSE2()) break;
10401      // FALL THROUGH.
10402    case 'x':   // SSE_REGS if SSE1 allowed
10403      if (!Subtarget->hasSSE1()) break;
10404
10405      switch (VT.getSimpleVT().SimpleTy) {
10406      default: break;
10407      // Scalar SSE types.
10408      case MVT::f32:
10409      case MVT::i32:
10410        return std::make_pair(0U, X86::FR32RegisterClass);
10411      case MVT::f64:
10412      case MVT::i64:
10413        return std::make_pair(0U, X86::FR64RegisterClass);
10414      // Vector types.
10415      case MVT::v16i8:
10416      case MVT::v8i16:
10417      case MVT::v4i32:
10418      case MVT::v2i64:
10419      case MVT::v4f32:
10420      case MVT::v2f64:
10421        return std::make_pair(0U, X86::VR128RegisterClass);
10422      }
10423      break;
10424    }
10425  }
10426
10427  // Use the default implementation in TargetLowering to convert the register
10428  // constraint into a member of a register class.
10429  std::pair<unsigned, const TargetRegisterClass*> Res;
10430  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
10431
10432  // Not found as a standard register?
10433  if (Res.second == 0) {
10434    // Map st(0) .. st(7) to ST0 .. ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
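    // For example, the classic asm("rdtsc" : "=A"(t)) idiom reads a 64-bit
    // timestamp into the EDX:EAX pair on 32-bit targets.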
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
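  // For example, "{ax}" used with an i32 operand is rewritten to EAX in GR32
  // here, and with an i64 operand to RAX in GR64.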
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
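    // e.g. "{xmm0}" with a scalar f32 operand is moved to the FR32 class so
    // the scalar SSE patterns apply.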
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
