SPUISelLowering.cpp revision 20bd5296cec8d8d597ab9db2aca7346a88e580c8
1//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
2//                     The LLVM Compiler Infrastructure
3//
4// This file is distributed under the University of Illinois Open Source
5// License. See LICENSE.TXT for details.
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the SPUTargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "SPUISelLowering.h"
14#include "SPUTargetMachine.h"
15#include "SPUFrameLowering.h"
16#include "SPUMachineFunction.h"
17#include "llvm/Constants.h"
18#include "llvm/Function.h"
19#include "llvm/Intrinsics.h"
20#include "llvm/CallingConv.h"
21#include "llvm/Type.h"
22#include "llvm/CodeGen/CallingConvLower.h"
23#include "llvm/CodeGen/MachineFrameInfo.h"
24#include "llvm/CodeGen/MachineFunction.h"
25#include "llvm/CodeGen/MachineInstrBuilder.h"
26#include "llvm/CodeGen/MachineRegisterInfo.h"
27#include "llvm/CodeGen/SelectionDAG.h"
28#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
29#include "llvm/Target/TargetOptions.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/ErrorHandling.h"
32#include "llvm/Support/MathExtras.h"
33#include "llvm/Support/raw_ostream.h"
34#include <map>
35
36using namespace llvm;
37
38// Used in getTargetNodeName() below
39namespace {
40  std::map<unsigned, const char *> node_names;
41
42  // Byte offset of the preferred slot (counted from the MSB)
43  int prefslotOffset(EVT VT) {
44    int retval=0;
45    if (VT==MVT::i1) retval=3;
46    if (VT==MVT::i8) retval=3;
47    if (VT==MVT::i16) retval=2;
48
49    return retval;
50  }
51
52  //! Expand a library call into an actual call DAG node
53  /*!
54   \note
55   This code is taken from SelectionDAGLegalize, since it is not exposed as
56   part of the LLVM SelectionDAG API.
57   */
58
  // Expand the node Op into a call to the runtime-library routine LC.
  //
  // LC       - which RTLIB libcall to emit.
  // Op       - the node being expanded; its operands become the call's
  //            arguments and its first result type becomes the return type.
  // isSigned - arguments are sign-extended when true, zero-extended otherwise.
  // Hi       - unused in this copy of the code (never read or written below);
  //            presumably kept for signature parity with the
  //            SelectionDAGLegalize original — TODO confirm before removing.
  // TLI      - target lowering info used for libcall name/convention lookup.
  //
  // Returns only the call's value result (CallInfo.first); the chain result
  // (CallInfo.second) is discarded by this helper.
  SDValue
  ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
                bool isSigned, SDValue &Hi, const SPUTargetLowering &TLI) {
    // The input chain to this libcall is the entry node of the function.
    // Legalizing the call will automatically add the previous call to the
    // dependence.
    SDValue InChain = DAG.getEntryNode();

    // Build the argument list: one entry per operand of Op, each tagged with
    // the requested extension behavior.
    TargetLowering::ArgListTy Args;
    TargetLowering::ArgListEntry Entry;
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      EVT ArgVT = Op.getOperand(i).getValueType();
      Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
      Entry.Node = Op.getOperand(i);
      Entry.Ty = ArgTy;
      Entry.isSExt = isSigned;
      Entry.isZExt = !isSigned;
      Args.push_back(Entry);
    }
    // The callee is the external symbol for the libcall's runtime name.
    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                           TLI.getPointerTy());

    // Splice the libcall in wherever FindInputOutputChains tells us to.
    Type *RetTy =
                Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
    std::pair<SDValue, SDValue> CallInfo =
            TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
                            0, TLI.getLibcallCallingConv(LC), false,
                            /*isReturnValueUsed=*/true,
                            Callee, Args, DAG, Op.getDebugLoc());

    return CallInfo.first;
  }
92}
93
/// Constructor - configures Cell SPU instruction lowering: register classes,
/// per-operation/per-type legalization actions, libcall names, DAG-combine
/// hooks, and scheduling preference. Note that setOperationAction is
/// last-call-wins, and a few actions set early in this constructor are
/// deliberately (or accidentally) overridden near the end — those spots are
/// flagged below.
SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  : TargetLowering(TM, new TargetLoweringObjectFileELF()),
    SPUTM(TM) {

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // Set RTLIB libcall names as used by SPU:
  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");

  // Set up the SPU's register classes:
  addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
  addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
  addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
  addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
  addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
  addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
  addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);

  // SPU has no sign or zero extended loads for i1, i8, i16:
  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);

  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);

  // Truncating stores from i128 must be expanded.
  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
  setTruncStoreAction(MVT::i128, MVT::i32, Expand);
  setTruncStoreAction(MVT::i128, MVT::i16, Expand);
  setTruncStoreAction(MVT::i128, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SPU constant load actions are custom lowered:
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  // SPU's loads and stores have to be custom lowered:
  // NOTE(review): the loop bound is strictly less than MVT::i128, so i128
  // itself is NOT given Custom load/store actions here — confirm that is
  // intentional (i128 truncstores are expanded separately above).
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);
    setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
    setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
    setLoadExtAction(ISD::SEXTLOAD, VT, Custom);

    // Expand truncating stores from VT down to every narrower integer type.
    // NOTE(review): the unsigned down-count assumes MVT::i8 > 0; if i8 were
    // ever enum value 0 this would wrap — TODO confirm against the MVT enum.
    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // NOTE(review): with f32 and f64 adjacent in the MVT enum this loop runs
  // for f32 only (the bound excludes f64), and its inner trunc-store loop
  // never executes — TODO confirm whether f64 was meant to be included.
  for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;

    setOperationAction(ISD::LOAD,   VT, Custom);
    setOperationAction(ISD::STORE,  VT, Custom);

    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
      setTruncStoreAction(VT, StoreVT, Expand);
    }
  }

  // Expand the jumptable branches
  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);

  // Custom lower SELECT_CC for most cases, but expand by default
  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);

  // SPU has no intrinsics for these particular operations:
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);

  // SPU has no division/remainder instructions
  setOperationAction(ISD::SREM,    MVT::i8,   Expand);
  setOperationAction(ISD::UREM,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
  setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
  setOperationAction(ISD::SREM,    MVT::i16,  Expand);
  setOperationAction(ISD::UREM,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
  setOperationAction(ISD::SREM,    MVT::i32,  Expand);
  setOperationAction(ISD::UREM,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
  setOperationAction(ISD::SREM,    MVT::i64,  Expand);
  setOperationAction(ISD::UREM,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
  setOperationAction(ISD::SREM,    MVT::i128, Expand);
  setOperationAction(ISD::UREM,    MVT::i128, Expand);
  setOperationAction(ISD::SDIV,    MVT::i128, Expand);
  setOperationAction(ISD::UDIV,    MVT::i128, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i128, Expand);

  // We don't support sin/cos/sqrt/fmod
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);

  // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
  // for f32!)
  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

  // SPU can do rotate right and left, so legalize it... but customize for i8
  // because instructions don't exist.

  // FIXME: Change from "expand" to appropriate type once ROTR is supported in
  //        .td files.
  setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);

  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
  setOperationAction(ISD::ROTL, MVT::i8,     Custom);

  // SPU has no native version of shift left/right for i8
  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRA,  MVT::i8,     Custom);

  // Make these operations legal and handle them during instruction selection:
  setOperationAction(ISD::SHL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRL,  MVT::i64,    Legal);
  setOperationAction(ISD::SRA,  MVT::i64,    Legal);

  // Custom lower i8, i32 and i64 multiplications
  // NOTE(review): only i8 is actually Custom here; i32/i64 are Legal —
  // the comment above overstates what is custom lowered.
  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
  setOperationAction(ISD::MUL,  MVT::i32,    Legal);
  setOperationAction(ISD::MUL,  MVT::i64,    Legal);

  // Expand double-width multiplication
  // FIXME: It would probably be reasonable to support some of these operations
  setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
  setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
  setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
  setOperationAction(ISD::MULHU,     MVT::i16, Expand);
  setOperationAction(ISD::MULHS,     MVT::i16, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::MULHU,     MVT::i64, Expand);
  setOperationAction(ISD::MULHS,     MVT::i64, Expand);

  // Need to custom handle (some) common i8, i64 math ops
  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
  setOperationAction(ISD::ADD,  MVT::i64,    Legal);
  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
  setOperationAction(ISD::SUB,  MVT::i64,    Legal);

  // SPU does not have BSWAP. It does have i32 support CTLZ.
  // CTPOP has to be custom lowered.
  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);

  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i128,  Expand);

  setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
  setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i8,    Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64,   Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i128,  Expand);

  setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
  setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
  setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
  setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8,    Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64,   Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i128,  Expand);

  // SPU has a version of select that implements (a&~c)|(b&c), just like
  // select ought to work:
  setOperationAction(ISD::SELECT, MVT::i8,   Legal);
  setOperationAction(ISD::SELECT, MVT::i16,  Legal);
  setOperationAction(ISD::SELECT, MVT::i32,  Legal);
  setOperationAction(ISD::SELECT, MVT::i64,  Legal);

  setOperationAction(ISD::SETCC, MVT::i8,    Legal);
  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
  setOperationAction(ISD::SETCC, MVT::i64,   Legal);
  setOperationAction(ISD::SETCC, MVT::f64,   Custom);

  // Custom lower i128 -> i64 truncates
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

  // Custom lower i32/i64 -> i128 sign extend
  setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);

  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
  // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
  // to expand to a libcall, hence the custom lowering:
  // NOTE(review): several of the FP_TO_SINT/FP_TO_UINT actions below
  // (i64, and FP_TO_UINT/i32) are set AGAIN near the end of this
  // constructor with different actions; setOperationAction is
  // last-call-wins, so the later settings are what take effect.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);

  // FDIV on SPU requires custom lowering
  setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall

  // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  setOperationAction(ISD::BITCAST, MVT::i32, Legal);
  setOperationAction(ISD::BITCAST, MVT::f32, Legal);
  setOperationAction(ISD::BITCAST, MVT::i64, Legal);
  setOperationAction(ISD::BITCAST, MVT::f64, Legal);

  // We cannot sextinreg(i1).  Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
       ++sctype) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::GlobalAddress,  VT, Custom);
    setOperationAction(ISD::ConstantPool,   VT, Custom);
    setOperationAction(ISD::JumpTable,      VT, Custom);
  }

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);

  // Use the default implementation.
  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);

  // Cell SPU has instructions for converting between i64 and fp.
  // NOTE(review): these OVERRIDE the Expand/Custom actions set for the same
  // operations earlier in this constructor (last call wins).
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
  // NOTE(review): this overrides the Custom action set for FP_TO_UINT/i32
  // earlier in this constructor.
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);

  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // First set operation action for all vector types to expand. Then we
  // will selectively turn on ones that can be effectively codegen'd.
  addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);

  for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT::SimpleValueType VT = (MVT::SimpleValueType)i;

    // Set operation actions to legal types only.
    if (!isTypeLegal(VT)) continue;

    // add/sub are legal for all supported vector VT's.
    setOperationAction(ISD::ADD,     VT, Legal);
    setOperationAction(ISD::SUB,     VT, Legal);
    // NOTE(review): the original comment here said "mul has to be custom
    // lowered", but the action actually set is Legal — confirm which is
    // intended before relying on either.
    setOperationAction(ISD::MUL,     VT, Legal);

    setOperationAction(ISD::AND,     VT, Legal);
    setOperationAction(ISD::OR,      VT, Legal);
    setOperationAction(ISD::XOR,     VT, Legal);
    setOperationAction(ISD::LOAD,    VT, Custom);
    setOperationAction(ISD::SELECT,  VT, Legal);
    setOperationAction(ISD::STORE,   VT, Custom);

    // These operations need to be expanded:
    setOperationAction(ISD::SDIV,    VT, Expand);
    setOperationAction(ISD::SREM,    VT, Expand);
    setOperationAction(ISD::UDIV,    VT, Expand);
    setOperationAction(ISD::UREM,    VT, Expand);

    // Expand all trunc stores
    for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
      MVT::SimpleValueType TargetVT = (MVT::SimpleValueType)j;
    setTruncStoreAction(VT, TargetVT, Expand);
    }

    // Custom lower build_vector, constant pool spills, insert and
    // extract vector elements:
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }

  setOperationAction(ISD::SHL, MVT::v2i64, Expand);

  setOperationAction(ISD::AND, MVT::v16i8, Custom);
  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);

  setOperationAction(ISD::FDIV, MVT::v4f32, Legal);

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct?

  setStackPointerRegisterToSaveRestore(SPU::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  // Functions are aligned to 2^3 = 8-byte boundaries.
  setMinFunctionAlignment(3);

  computeRegisterProperties();

  // Set pre-RA register scheduler default to BURR, which produces slightly
  // better code than the default (could also be TDRR, but TargetLowering.h
  // needs a mod to support that model):
  setSchedulingPreference(Sched::RegPressure);
}
482
483const char *
484SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
485{
486  if (node_names.empty()) {
487    node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
488    node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
489    node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
490    node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
491    node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
492    node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
493    node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
494    node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
495    node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
496    node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
497    node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
498    node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
499    node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
500    node_names[(unsigned) SPUISD::SHL_BITS] = "SPUISD::SHL_BITS";
501    node_names[(unsigned) SPUISD::SHL_BYTES] = "SPUISD::SHL_BYTES";
502    node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
503    node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
504    node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
505    node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
506            "SPUISD::ROTBYTES_LEFT_BITS";
507    node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
508    node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
509    node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
510    node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
511    node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
512  }
513
514  std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
515
516  return ((i != node_names.end()) ? i->second : 0);
517}
518
519//===----------------------------------------------------------------------===//
520// Return the Cell SPU's SETCC result type
521//===----------------------------------------------------------------------===//
522
523EVT SPUTargetLowering::getSetCCResultType(EVT VT) const {
524  // i8, i16 and i32 are valid SETCC result types
525  MVT::SimpleValueType retval;
526
527  switch(VT.getSimpleVT().SimpleTy){
528    case MVT::i1:
529    case MVT::i8:
530      retval = MVT::i8; break;
531    case MVT::i16:
532      retval = MVT::i16; break;
533    case MVT::i32:
534    default:
535      retval = MVT::i32;
536  }
537  return retval;
538}
539
540//===----------------------------------------------------------------------===//
541// Calling convention code:
542//===----------------------------------------------------------------------===//
543
544#include "SPUGenCallingConv.inc"
545
546//===----------------------------------------------------------------------===//
547//  LowerOperation implementation
548//===----------------------------------------------------------------------===//
549
550/// Custom lower loads for CellSPU
551/*!
552 All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
553 within a 16-byte block, we have to rotate to extract the requested element.
554
555 For extending loads, we also want to ensure that the following sequence is
556 emitted, e.g. for MVT::f32 extending load to MVT::f64:
557
558\verbatim
559%1  v16i8,ch = load
560%2  v16i8,ch = rotate %1
561%3  v4f8, ch = bitconvert %2
%4  f32      = vec2prefslot %3
563%5  f64      = fp_extend %4
564\endverbatim
565*/
566static SDValue
567LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
568  LoadSDNode *LN = cast<LoadSDNode>(Op);
569  SDValue the_chain = LN->getChain();
570  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
571  EVT InVT = LN->getMemoryVT();
572  EVT OutVT = Op.getValueType();
573  ISD::LoadExtType ExtType = LN->getExtensionType();
574  unsigned alignment = LN->getAlignment();
575  int pso = prefslotOffset(InVT);
576  DebugLoc dl = Op.getDebugLoc();
577  EVT vecVT = InVT.isVector()? InVT: EVT::getVectorVT(*DAG.getContext(), InVT,
578                                                  (128 / InVT.getSizeInBits()));
579
580  // two sanity checks
581  assert( LN->getAddressingMode() == ISD::UNINDEXED
582          && "we should get only UNINDEXED adresses");
583  // clean aligned loads can be selected as-is
584  if (InVT.getSizeInBits() == 128 && (alignment%16) == 0)
585    return SDValue();
586
587  // Get pointerinfos to the memory chunk(s) that contain the data to load
588  uint64_t mpi_offset = LN->getPointerInfo().Offset;
589  mpi_offset -= mpi_offset%16;
590  MachinePointerInfo lowMemPtr(LN->getPointerInfo().V, mpi_offset);
591  MachinePointerInfo highMemPtr(LN->getPointerInfo().V, mpi_offset+16);
592
593  SDValue result;
594  SDValue basePtr = LN->getBasePtr();
595  SDValue rotate;
596
597  if ((alignment%16) == 0) {
598    ConstantSDNode *CN;
599
600    // Special cases for a known aligned load to simplify the base pointer
601    // and the rotation amount:
602    if (basePtr.getOpcode() == ISD::ADD
603        && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
604      // Known offset into basePtr
605      int64_t offset = CN->getSExtValue();
606      int64_t rotamt = int64_t((offset & 0xf) - pso);
607
608      if (rotamt < 0)
609        rotamt += 16;
610
611      rotate = DAG.getConstant(rotamt, MVT::i16);
612
613      // Simplify the base pointer for this case:
614      basePtr = basePtr.getOperand(0);
615      if ((offset & ~0xf) > 0) {
616        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
617                              basePtr,
618                              DAG.getConstant((offset & ~0xf), PtrVT));
619      }
620    } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
621               || (basePtr.getOpcode() == SPUISD::IndirectAddr
622                   && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
623                   && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
624      // Plain aligned a-form address: rotate into preferred slot
625      // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
626      int64_t rotamt = -pso;
627      if (rotamt < 0)
628        rotamt += 16;
629      rotate = DAG.getConstant(rotamt, MVT::i16);
630    } else {
631      // Offset the rotate amount by the basePtr and the preferred slot
632      // byte offset
633      int64_t rotamt = -pso;
634      if (rotamt < 0)
635        rotamt += 16;
636      rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
637                           basePtr,
638                           DAG.getConstant(rotamt, PtrVT));
639    }
640  } else {
641    // Unaligned load: must be more pessimistic about addressing modes:
642    if (basePtr.getOpcode() == ISD::ADD) {
643      MachineFunction &MF = DAG.getMachineFunction();
644      MachineRegisterInfo &RegInfo = MF.getRegInfo();
645      unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
646      SDValue Flag;
647
648      SDValue Op0 = basePtr.getOperand(0);
649      SDValue Op1 = basePtr.getOperand(1);
650
651      if (isa<ConstantSDNode>(Op1)) {
652        // Convert the (add <ptr>, <const>) to an indirect address contained
653        // in a register. Note that this is done because we need to avoid
654        // creating a 0(reg) d-form address due to the SPU's block loads.
655        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
656        the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
657        basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
658      } else {
659        // Convert the (add <arg1>, <arg2>) to an indirect address, which
660        // will likely be lowered as a reg(reg) x-form address.
661        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
662      }
663    } else {
664      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
665                            basePtr,
666                            DAG.getConstant(0, PtrVT));
667   }
668
669    // Offset the rotate amount by the basePtr and the preferred slot
670    // byte offset
671    rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
672                         basePtr,
673                         DAG.getConstant(-pso, PtrVT));
674  }
675
676  // Do the load as a i128 to allow possible shifting
677  SDValue low = DAG.getLoad(MVT::i128, dl, the_chain, basePtr,
678                       lowMemPtr,
679                       LN->isVolatile(), LN->isNonTemporal(), false, 16);
680
681  // When the size is not greater than alignment we get all data with just
682  // one load
683  if (alignment >= InVT.getSizeInBits()/8) {
684    // Update the chain
685    the_chain = low.getValue(1);
686
687    // Rotate into the preferred slot:
688    result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::i128,
689                         low.getValue(0), rotate);
690
691    // Convert the loaded v16i8 vector to the appropriate vector type
692    // specified by the operand:
693    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
694                                 InVT, (128 / InVT.getSizeInBits()));
695    result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
696                         DAG.getNode(ISD::BITCAST, dl, vecVT, result));
697  }
698  // When alignment is less than the size, we might need (known only at
699  // run-time) two loads
700  // TODO: if the memory address is composed only from constants, we have
701  // extra kowledge, and might avoid the second load
702  else {
703    // storage position offset from lower 16 byte aligned memory chunk
704    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
705                                  basePtr, DAG.getConstant( 0xf, MVT::i32 ) );
706    // get a registerfull of ones. (this implementation is a workaround: LLVM
707    // cannot handle 128 bit signed int constants)
708    SDValue ones = DAG.getConstant(-1, MVT::v4i32 );
709    ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);
710
711    SDValue high = DAG.getLoad(MVT::i128, dl, the_chain,
712                               DAG.getNode(ISD::ADD, dl, PtrVT,
713                                           basePtr,
714                                           DAG.getConstant(16, PtrVT)),
715                               highMemPtr,
716                               LN->isVolatile(), LN->isNonTemporal(), false,
717                               16);
718
719    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
720                                                              high.getValue(1));
721
722    // Shift the (possible) high part right to compensate the misalignemnt.
723    // if there is no highpart (i.e. value is i64 and offset is 4), this
724    // will zero out the high value.
725    high = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, high,
726                                     DAG.getNode(ISD::SUB, dl, MVT::i32,
727                                                 DAG.getConstant( 16, MVT::i32),
728                                                 offset
729                                                ));
730
731    // Shift the low similarly
732    // TODO: add SPUISD::SHL_BYTES
733    low = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, low, offset );
734
735    // Merge the two parts
736    result = DAG.getNode(ISD::BITCAST, dl, vecVT,
737                          DAG.getNode(ISD::OR, dl, MVT::i128, low, high));
738
739    if (!InVT.isVector()) {
740      result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT, result );
741     }
742
743  }
744    // Handle extending loads by extending the scalar result:
745    if (ExtType == ISD::SEXTLOAD) {
746      result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
747    } else if (ExtType == ISD::ZEXTLOAD) {
748      result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
749    } else if (ExtType == ISD::EXTLOAD) {
750      unsigned NewOpc = ISD::ANY_EXTEND;
751
752      if (OutVT.isFloatingPoint())
753        NewOpc = ISD::FP_EXTEND;
754
755      result = DAG.getNode(NewOpc, dl, OutVT, result);
756    }
757
758    SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
759    SDValue retops[2] = {
760      result,
761      the_chain
762    };
763
764    result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
765                         retops, sizeof(retops) / sizeof(retops[0]));
766    return result;
767}
768
769/// Custom lower stores for CellSPU
770/*!
771 All CellSPU stores are aligned to 16-byte boundaries, so for elements
772 within a 16-byte block, we have to generate a shuffle to insert the
773 requested element into its place, then store the resulting block.
774 */
static SDValue
LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  StoreSDNode *SN = cast<StoreSDNode>(Op);
  SDValue Value = SN->getValue();
  EVT VT = Value.getValueType();
  // For truncating stores the in-memory type (StVT) is narrower than the
  // value type (VT); otherwise the two are identical.
  EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  DebugLoc dl = Op.getDebugLoc();
  unsigned alignment = SN->getAlignment();
  SDValue result;
  // Vector type spanning a full 16-byte quadword of StVT elements (used for
  // the shuffle-based insertion below).
  EVT vecVT = StVT.isVector()? StVT: EVT::getVectorVT(*DAG.getContext(), StVT,
                                                 (128 / StVT.getSizeInBits()));
  // Get pointerinfos to the memory chunk(s) that contain the data to load
  // (rounded down to the enclosing 16-byte chunk, plus the chunk after it).
  uint64_t mpi_offset = SN->getPointerInfo().Offset;
  mpi_offset -= mpi_offset%16;
  MachinePointerInfo lowMemPtr(SN->getPointerInfo().V, mpi_offset);
  MachinePointerInfo highMemPtr(SN->getPointerInfo().V, mpi_offset+16);


  // two sanity checks
  assert( SN->getAddressingMode() == ISD::UNINDEXED
          && "we should get only UNINDEXED adresses");
  // clean aligned loads can be selected as-is
  if (StVT.getSizeInBits() == 128 && (alignment%16) == 0)
    return SDValue();

  // NOTE(review): alignLoadVec is never assigned or read below — dead local.
  SDValue alignLoadVec;
  SDValue basePtr = SN->getBasePtr();
  SDValue the_chain = SN->getChain();
  SDValue insertEltOffs;

  if ((alignment%16) == 0) {
    ConstantSDNode *CN;
    // Special cases for a known aligned load to simplify the base pointer
    // and insertion byte:
    if (basePtr.getOpcode() == ISD::ADD
        && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
      // Known offset into basePtr
      int64_t offset = CN->getSExtValue();

      // Simplify the base pointer for this case:
      basePtr = basePtr.getOperand(0);
      // insertEltOffs keeps only the sub-quadword part of the offset; the
      // quadword-aligned remainder is folded back into basePtr below.
      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant((offset & 0xf), PtrVT));

      if ((offset & ~0xf) > 0) {
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                              basePtr,
                              DAG.getConstant((offset & ~0xf), PtrVT));
      }
    } else {
      // Otherwise, assume it's at byte 0 of basePtr
      insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant(0, PtrVT));
      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                  basePtr,
                                  DAG.getConstant(0, PtrVT));
    }
  } else {
    // Unaligned load: must be more pessimistic about addressing modes:
    if (basePtr.getOpcode() == ISD::ADD) {
      MachineFunction &MF = DAG.getMachineFunction();
      MachineRegisterInfo &RegInfo = MF.getRegInfo();
      unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
      SDValue Flag;

      SDValue Op0 = basePtr.getOperand(0);
      SDValue Op1 = basePtr.getOperand(1);

      if (isa<ConstantSDNode>(Op1)) {
        // Convert the (add <ptr>, <const>) to an indirect address contained
        // in a register. Note that this is done because we need to avoid
        // creating a 0(reg) d-form address due to the SPU's block loads.
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
        the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
        basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
      } else {
        // Convert the (add <arg1>, <arg2>) to an indirect address, which
        // will likely be lowered as a reg(reg) x-form address.
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
      }
    } else {
      basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                            basePtr,
                            DAG.getConstant(0, PtrVT));
    }

    // Insertion point is solely determined by basePtr's contents
    insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
                                basePtr,
                                DAG.getConstant(0, PtrVT));
  }

  // Load the lower part of the memory to which to store.
  SDValue low = DAG.getLoad(vecVT, dl, the_chain, basePtr,
                          lowMemPtr, SN->isVolatile(), SN->isNonTemporal(),
                            false, 16);

  // if we don't need to store over the 16 byte boundary, one store suffices
  if (alignment >= StVT.getSizeInBits()/8) {
    // Update the chain
    the_chain = low.getValue(1);

    // 'low' was produced by the DAG.getLoad just above, so this cast is safe.
    LoadSDNode *LN = cast<LoadSDNode>(low);
    SDValue theValue = SN->getValue();

    if (StVT != VT
        && (theValue.getOpcode() == ISD::AssertZext
            || theValue.getOpcode() == ISD::AssertSext)) {
      // Drill down and get the value for zero- and sign-extended
      // quantities
      theValue = theValue.getOperand(0);
    }

    // If the base pointer is already a D-form address, then just create
    // a new D-form address with a slot offset and the orignal base pointer.
    // Otherwise generate a D-form address with the slot offset relative
    // to the stack pointer, which is always aligned.
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "CellSPU LowerSTORE: basePtr = ";
        basePtr.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

    // Build a shuffle mask selecting the insertion byte(s), vectorize the
    // scalar value, and shuffle it into the loaded quadword.
    SDValue insertEltOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT,
                                      insertEltOffs);
    SDValue vectorizeOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT,
                                      theValue);

    result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
                         vectorizeOp, low,
                         DAG.getNode(ISD::BITCAST, dl,
                                     MVT::v4i32, insertEltOp));

    result = DAG.getStore(the_chain, dl, result, basePtr,
                          lowMemPtr,
                          LN->isVolatile(), LN->isNonTemporal(),
                          16);

  }
  // do the store when it might cross the 16 byte memory access boundary.
  else {
    // TODO issue a warning if SN->isVolatile()== true? This is likely not
    // what the user wanted.

    // address offset from nearest lower 16byte alinged address
    SDValue offset = DAG.getNode(ISD::AND, dl, MVT::i32,
                                    SN->getBasePtr(),
                                    DAG.getConstant(0xf, MVT::i32));
    // 16 - offset
    SDValue offset_compl = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                           DAG.getConstant( 16, MVT::i32),
                                           offset);
    // 16 - sizeof(Value)
    // NOTE(review): this uses VT (the value type), not StVT — for truncating
    // stores the two differ; confirm the wider size is intended here.
    SDValue surplus = DAG.getNode(ISD::SUB, dl, MVT::i32,
                                     DAG.getConstant( 16, MVT::i32),
                                     DAG.getConstant( VT.getSizeInBits()/8,
                                                      MVT::i32));
    // get a registerfull of ones
    SDValue ones = DAG.getConstant(-1, MVT::v4i32);
    ones = DAG.getNode(ISD::BITCAST, dl, MVT::i128, ones);

    // Create the 128 bit masks that have ones where the data to store is
    // located.
    SDValue lowmask, himask;
    // if the value to store don't fill up the an entire 128 bits, zero
    // out the last bits of the mask so that only the value we want to store
    // is masked.
    // this is e.g. in the case of store i32, align 2
    if (!VT.isVector()){
      Value = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, Value);
      // Clear the bytes of the mask beyond the value's size: shift the all-ones
      // mask right then left again by (16 - sizeof(Value)) bytes.
      lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, ones, surplus);
      lowmask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
                                                               surplus);
      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
      Value = DAG.getNode(ISD::AND, dl, MVT::i128, Value, lowmask);

    }
    else {
      lowmask = ones;
      Value = DAG.getNode(ISD::BITCAST, dl, MVT::i128, Value);
    }
    // this will zero, if there are no data that goes to the high quad
    himask = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, lowmask,
                                                            offset_compl);
    lowmask = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, lowmask,
                                                             offset);

    // Load in the old data and zero out the parts that will be overwritten with
    // the new data to store.
    SDValue hi = DAG.getLoad(MVT::i128, dl, the_chain,
                               DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                           DAG.getConstant( 16, PtrVT)),
                               highMemPtr,
                               SN->isVolatile(), SN->isNonTemporal(),
                               false, 16);
    // Join both load chains so neither load can be reordered past the stores.
    the_chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(1),
                                                              hi.getValue(1));

    low = DAG.getNode(ISD::AND, dl, MVT::i128,
                        DAG.getNode( ISD::BITCAST, dl, MVT::i128, low),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, lowmask, ones));
    hi = DAG.getNode(ISD::AND, dl, MVT::i128,
                        DAG.getNode( ISD::BITCAST, dl, MVT::i128, hi),
                        DAG.getNode( ISD::XOR, dl, MVT::i128, himask, ones));

    // Shift the Value to store into place. rlow contains the parts that go to
    // the lower memory chunk, rhi has the parts that go to the upper one.
    SDValue rlow = DAG.getNode(SPUISD::SRL_BYTES, dl, MVT::i128, Value, offset);
    rlow = DAG.getNode(ISD::AND, dl, MVT::i128, rlow, lowmask);
    SDValue rhi = DAG.getNode(SPUISD::SHL_BYTES, dl, MVT::i128, Value,
                                                            offset_compl);

    // Merge the old data and the new data and store the results
    // Need to convert vectors here to integer as 'OR'ing floats assert
    rlow = DAG.getNode(ISD::OR, dl, MVT::i128,
                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, low),
                          DAG.getNode(ISD::BITCAST, dl, MVT::i128, rlow));
    rhi = DAG.getNode(ISD::OR, dl, MVT::i128,
                         DAG.getNode(ISD::BITCAST, dl, MVT::i128, hi),
                         DAG.getNode(ISD::BITCAST, dl, MVT::i128, rhi));

    low = DAG.getStore(the_chain, dl, rlow, basePtr,
                          lowMemPtr,
                          SN->isVolatile(), SN->isNonTemporal(), 16);
    hi  = DAG.getStore(the_chain, dl, rhi,
                            DAG.getNode(ISD::ADD, dl, PtrVT, basePtr,
                                        DAG.getConstant( 16, PtrVT)),
                            highMemPtr,
                            SN->isVolatile(), SN->isNonTemporal(), 16);
    // The result of a store is its chain; merge both store chains.
    result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, low.getValue(0),
                                                           hi.getValue(0));
  }

  return result;
}
1015
1016//! Generate the address of a constant pool entry.
1017static SDValue
1018LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1019  EVT PtrVT = Op.getValueType();
1020  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
1021  const Constant *C = CP->getConstVal();
1022  SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
1023  SDValue Zero = DAG.getConstant(0, PtrVT);
1024  const TargetMachine &TM = DAG.getTarget();
1025  // FIXME there is no actual debug info here
1026  DebugLoc dl = Op.getDebugLoc();
1027
1028  if (TM.getRelocationModel() == Reloc::Static) {
1029    if (!ST->usingLargeMem()) {
1030      // Just return the SDValue with the constant pool address in it.
1031      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
1032    } else {
1033      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
1034      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
1035      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1036    }
1037  }
1038
1039  llvm_unreachable("LowerConstantPool: Relocation model other than static"
1040                   " not supported.");
1041}
1042
1043//! Alternate entry point for generating the address of a constant pool entry
1044SDValue
1045SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
1046  return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
1047}
1048
1049static SDValue
1050LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1051  EVT PtrVT = Op.getValueType();
1052  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
1053  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
1054  SDValue Zero = DAG.getConstant(0, PtrVT);
1055  const TargetMachine &TM = DAG.getTarget();
1056  // FIXME there is no actual debug info here
1057  DebugLoc dl = Op.getDebugLoc();
1058
1059  if (TM.getRelocationModel() == Reloc::Static) {
1060    if (!ST->usingLargeMem()) {
1061      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
1062    } else {
1063      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
1064      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
1065      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1066    }
1067  }
1068
1069  llvm_unreachable("LowerJumpTable: Relocation model other than static"
1070                   " not supported.");
1071}
1072
1073static SDValue
1074LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
1075  EVT PtrVT = Op.getValueType();
1076  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
1077  const GlobalValue *GV = GSDN->getGlobal();
1078  SDValue GA = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
1079                                          PtrVT, GSDN->getOffset());
1080  const TargetMachine &TM = DAG.getTarget();
1081  SDValue Zero = DAG.getConstant(0, PtrVT);
1082  // FIXME there is no actual debug info here
1083  DebugLoc dl = Op.getDebugLoc();
1084
1085  if (TM.getRelocationModel() == Reloc::Static) {
1086    if (!ST->usingLargeMem()) {
1087      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
1088    } else {
1089      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
1090      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
1091      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
1092    }
1093  } else {
1094    report_fatal_error("LowerGlobalAddress: Relocation model other than static"
1095                      "not supported.");
1096    /*NOTREACHED*/
1097  }
1098}
1099
1100//! Custom lower double precision floating point constants
1101static SDValue
1102LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
1103  EVT VT = Op.getValueType();
1104  // FIXME there is no actual debug info here
1105  DebugLoc dl = Op.getDebugLoc();
1106
1107  if (VT == MVT::f64) {
1108    ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
1109
1110    assert((FP != 0) &&
1111           "LowerConstantFP: Node is not ConstantFPSDNode");
1112
1113    uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
1114    SDValue T = DAG.getConstant(dbits, MVT::i64);
1115    SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T);
1116    return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
1117                       DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Tvec));
1118  }
1119
1120  return SDValue();
1121}
1122
1123SDValue
1124SPUTargetLowering::LowerFormalArguments(SDValue Chain,
1125                                        CallingConv::ID CallConv, bool isVarArg,
1126                                        const SmallVectorImpl<ISD::InputArg>
1127                                          &Ins,
1128                                        DebugLoc dl, SelectionDAG &DAG,
1129                                        SmallVectorImpl<SDValue> &InVals)
1130                                          const {
1131
1132  MachineFunction &MF = DAG.getMachineFunction();
1133  MachineFrameInfo *MFI = MF.getFrameInfo();
1134  MachineRegisterInfo &RegInfo = MF.getRegInfo();
1135  SPUFunctionInfo *FuncInfo = MF.getInfo<SPUFunctionInfo>();
1136
1137  unsigned ArgOffset = SPUFrameLowering::minStackSize();
1138  unsigned ArgRegIdx = 0;
1139  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();
1140
1141  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
1142
1143  SmallVector<CCValAssign, 16> ArgLocs;
1144  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1145		 getTargetMachine(), ArgLocs, *DAG.getContext());
1146  // FIXME: allow for other calling conventions
1147  CCInfo.AnalyzeFormalArguments(Ins, CCC_SPU);
1148
1149  // Add DAG nodes to load the arguments or copy them out of registers.
1150  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
1151    EVT ObjectVT = Ins[ArgNo].VT;
1152    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
1153    SDValue ArgVal;
1154    CCValAssign &VA = ArgLocs[ArgNo];
1155
1156    if (VA.isRegLoc()) {
1157      const TargetRegisterClass *ArgRegClass;
1158
1159      switch (ObjectVT.getSimpleVT().SimpleTy) {
1160      default:
1161        report_fatal_error("LowerFormalArguments Unhandled argument type: " +
1162                           Twine(ObjectVT.getEVTString()));
1163      case MVT::i8:
1164        ArgRegClass = &SPU::R8CRegClass;
1165        break;
1166      case MVT::i16:
1167        ArgRegClass = &SPU::R16CRegClass;
1168        break;
1169      case MVT::i32:
1170        ArgRegClass = &SPU::R32CRegClass;
1171        break;
1172      case MVT::i64:
1173        ArgRegClass = &SPU::R64CRegClass;
1174        break;
1175      case MVT::i128:
1176        ArgRegClass = &SPU::GPRCRegClass;
1177        break;
1178      case MVT::f32:
1179        ArgRegClass = &SPU::R32FPRegClass;
1180        break;
1181      case MVT::f64:
1182        ArgRegClass = &SPU::R64FPRegClass;
1183        break;
1184      case MVT::v2f64:
1185      case MVT::v4f32:
1186      case MVT::v2i64:
1187      case MVT::v4i32:
1188      case MVT::v8i16:
1189      case MVT::v16i8:
1190        ArgRegClass = &SPU::VECREGRegClass;
1191        break;
1192      }
1193
1194      unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
1195      RegInfo.addLiveIn(VA.getLocReg(), VReg);
1196      ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
1197      ++ArgRegIdx;
1198    } else {
1199      // We need to load the argument to a virtual register if we determined
1200      // above that we ran out of physical registers of the appropriate type
1201      // or we're forced to do vararg
1202      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true);
1203      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1204      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
1205                           false, false, false, 0);
1206      ArgOffset += StackSlotSize;
1207    }
1208
1209    InVals.push_back(ArgVal);
1210    // Update the chain
1211    Chain = ArgVal.getOperand(0);
1212  }
1213
1214  // vararg handling:
1215  if (isVarArg) {
1216    // FIXME: we should be able to query the argument registers from
1217    //        tablegen generated code.
1218    static const unsigned ArgRegs[] = {
1219      SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
1220      SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
1221      SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
1222      SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
1223      SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
1224      SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
1225      SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
1226      SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
1227      SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
1228      SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
1229      SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
1230    };
1231    // size of ArgRegs array
1232    unsigned NumArgRegs = 77;
1233
1234    // We will spill (79-3)+1 registers to the stack
1235    SmallVector<SDValue, 79-3+1> MemOps;
1236
1237    // Create the frame slot
1238    for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
1239      FuncInfo->setVarArgsFrameIndex(
1240        MFI->CreateFixedObject(StackSlotSize, ArgOffset, true));
1241      SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
1242      unsigned VReg = MF.addLiveIn(ArgRegs[ArgRegIdx], &SPU::VECREGRegClass);
1243      SDValue ArgVal = DAG.getRegister(VReg, MVT::v16i8);
1244      SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, MachinePointerInfo(),
1245                                   false, false, 0);
1246      Chain = Store.getOperand(0);
1247      MemOps.push_back(Store);
1248
1249      // Increment address by stack slot size for the next stored argument
1250      ArgOffset += StackSlotSize;
1251    }
1252    if (!MemOps.empty())
1253      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1254                          &MemOps[0], MemOps.size());
1255  }
1256
1257  return Chain;
1258}
1259
1260/// isLSAAddress - Return the immediate to use if the specified
1261/// value is representable as a LSA address.
1262static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
1263  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
1264  if (!C) return 0;
1265
1266  int Addr = C->getZExtValue();
1267  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
1268      (Addr << 14 >> 14) != Addr)
1269    return 0;  // Top 14 bits have to be sext of immediate.
1270
1271  return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
1272}
1273
SDValue
SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<SDValue> &OutVals,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const {
  // CellSPU target does not yet support tail call optimization.
  isTailCall = false;

  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  unsigned NumOps     = Outs.size();
  unsigned StackSlotSize = SPUFrameLowering::stackSlotSize();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());
  // FIXME: allow for other calling conventions
  CCInfo.AnalyzeCallOperands(Outs, CCC_SPU);

  // NOTE(review): this is the number of assigned argument locations from the
  // calling-convention analysis, not strictly a register count; it gates the
  // register-vs-stack decision below.
  const unsigned NumArgRegs = ArgLocs.size();


  // Handy pointer type
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory.
  unsigned ArgOffset = SPUFrameLowering::minStackSize(); // Just below [LR]
  unsigned ArgRegIdx = 0;

  // Keep track of registers passing arguments
  std::vector<std::pair<unsigned, SDValue> > RegsToPass;
  // And the arguments passed on the stack
  SmallVector<SDValue, 8> MemOpChains;

  for (; ArgRegIdx != NumOps; ++ArgRegIdx) {
    SDValue Arg = OutVals[ArgRegIdx];
    CCValAssign &VA = ArgLocs[ArgRegIdx];

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i8:
    case MVT::i16:
    case MVT::i32:
    case MVT::i64:
    case MVT::i128:
    case MVT::f32:
    case MVT::f64:
    case MVT::v2i64:
    case MVT::v2f64:
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      if (ArgRegIdx != NumArgRegs) {
        RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      } else {
        // Overflow argument: spill to the outgoing-argument area.
        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
                                           MachinePointerInfo(),
                                           false, false, 0));
        ArgOffset += StackSlotSize;
      }
      break;
    }
  }

  // Accumulate how many bytes are to be pushed on the stack, including the
  // linkage area, and parameter passing area.  According to the SPU ABI,
  // we minimally need space for [LR] and [SP].
  unsigned NumStackBytes = ArgOffset - SPUFrameLowering::minStackSize();

  // Insert a call sequence start
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
                                                            true));

  if (!MemOpChains.empty()) {
    // Adjust the stack pointer for the stack arguments.
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  SmallVector<SDValue, 8> Ops;
  unsigned CallOpc = SPUISD::CALL;

  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
  // node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    const GlobalValue *GV = G->getGlobal();
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue GA = DAG.getTargetGlobalAddress(GV, dl, CalleeVT);

    if (!ST->usingLargeMem()) {
      // Turn calls to targets that are defined (i.e., have bodies) into BRSL
      // style calls, otherwise, external symbols are BRASL calls. This assumes
      // that declared/defined symbols are in the same compilation unit and can
      // be reached through PC-relative jumps.
      //
      // NOTE:
      // This may be an unsafe assumption for JIT and really large compilation
      // units.
      if (GV->isDeclaration()) {
        Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
      } else {
        Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
      }
    } else {
      // "Large memory" mode: Turn all calls into indirect calls with a X-form
      // address pairs:
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    EVT CalleeVT = Callee.getValueType();
    SDValue Zero = DAG.getConstant(0, PtrVT);
    SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
        Callee.getValueType());

    if (!ST->usingLargeMem()) {
      Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
    } else {
      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
    }
  } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
    // If this is an absolute destination address that appears to be a legal
    // local store address, use the munged value.
    Callee = SDValue(Dest, 0);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  if (InFlag.getNode())
    Ops.push_back(InFlag);
  // Returns a chain and a flag for retval copy to use.
  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Glue),
                      &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
                             DAG.getIntPtrConstant(0, true), InFlag);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  // If the function returns void, just return the chain.
  if (Ins.empty())
    return Chain;

  // Now handle the return value(s)
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                    getTargetMachine(), RVLocs, *DAG.getContext());
  CCRetInfo.AnalyzeCallResult(Ins, CCC_SPU);


  // If the call has results, copy the values out of the ret val registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign VA = RVLocs[i];

    // Each CopyFromReg consumes the glue of the previous one so the copies
    // stay ordered immediately after the call.
    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
                                     InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);
    InVals.push_back(Val);
   }

  return Chain;
}
1470
1471SDValue
1472SPUTargetLowering::LowerReturn(SDValue Chain,
1473                               CallingConv::ID CallConv, bool isVarArg,
1474                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1475                               const SmallVectorImpl<SDValue> &OutVals,
1476                               DebugLoc dl, SelectionDAG &DAG) const {
1477
1478  SmallVector<CCValAssign, 16> RVLocs;
1479  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
1480		 getTargetMachine(), RVLocs, *DAG.getContext());
1481  CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
1482
1483  // If this is the first return lowered for this function, add the regs to the
1484  // liveout set for the function.
1485  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
1486    for (unsigned i = 0; i != RVLocs.size(); ++i)
1487      DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
1488  }
1489
1490  SDValue Flag;
1491
1492  // Copy the result values into the output registers.
1493  for (unsigned i = 0; i != RVLocs.size(); ++i) {
1494    CCValAssign &VA = RVLocs[i];
1495    assert(VA.isRegLoc() && "Can only return in registers!");
1496    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
1497                             OutVals[i], Flag);
1498    Flag = Chain.getValue(1);
1499  }
1500
1501  if (Flag.getNode())
1502    return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
1503  else
1504    return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
1505}
1506
1507
1508//===----------------------------------------------------------------------===//
1509// Vector related lowering:
1510//===----------------------------------------------------------------------===//
1511
1512static ConstantSDNode *
1513getVecImm(SDNode *N) {
1514  SDValue OpVal(0, 0);
1515
1516  // Check to see if this buildvec has a single non-undef value in its elements.
1517  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
1518    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
1519    if (OpVal.getNode() == 0)
1520      OpVal = N->getOperand(i);
1521    else if (OpVal != N->getOperand(i))
1522      return 0;
1523  }
1524
1525  if (OpVal.getNode() != 0) {
1526    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
1527      return CN;
1528    }
1529  }
1530
1531  return 0;
1532}
1533
1534/// get_vec_i18imm - Test if this vector is a vector filled with the same value
1535/// and the value fits into an unsigned 18-bit constant, and if so, return the
1536/// constant
1537SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
1538                              EVT ValueType) {
1539  if (ConstantSDNode *CN = getVecImm(N)) {
1540    uint64_t Value = CN->getZExtValue();
1541    if (ValueType == MVT::i64) {
1542      uint64_t UValue = CN->getZExtValue();
1543      uint32_t upper = uint32_t(UValue >> 32);
1544      uint32_t lower = uint32_t(UValue);
1545      if (upper != lower)
1546        return SDValue();
1547      Value = Value >> 32;
1548    }
1549    if (Value <= 0x3ffff)
1550      return DAG.getTargetConstant(Value, ValueType);
1551  }
1552
1553  return SDValue();
1554}
1555
1556/// get_vec_i16imm - Test if this vector is a vector filled with the same value
1557/// and the value fits into a signed 16-bit constant, and if so, return the
1558/// constant
1559SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
1560                              EVT ValueType) {
1561  if (ConstantSDNode *CN = getVecImm(N)) {
1562    int64_t Value = CN->getSExtValue();
1563    if (ValueType == MVT::i64) {
1564      uint64_t UValue = CN->getZExtValue();
1565      uint32_t upper = uint32_t(UValue >> 32);
1566      uint32_t lower = uint32_t(UValue);
1567      if (upper != lower)
1568        return SDValue();
1569      Value = Value >> 32;
1570    }
1571    if (Value >= -(1 << 15) && Value <= ((1 << 15) - 1)) {
1572      return DAG.getTargetConstant(Value, ValueType);
1573    }
1574  }
1575
1576  return SDValue();
1577}
1578
1579/// get_vec_i10imm - Test if this vector is a vector filled with the same value
1580/// and the value fits into a signed 10-bit constant, and if so, return the
1581/// constant
1582SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
1583                              EVT ValueType) {
1584  if (ConstantSDNode *CN = getVecImm(N)) {
1585    int64_t Value = CN->getSExtValue();
1586    if (ValueType == MVT::i64) {
1587      uint64_t UValue = CN->getZExtValue();
1588      uint32_t upper = uint32_t(UValue >> 32);
1589      uint32_t lower = uint32_t(UValue);
1590      if (upper != lower)
1591        return SDValue();
1592      Value = Value >> 32;
1593    }
1594    if (isInt<10>(Value))
1595      return DAG.getTargetConstant(Value, ValueType);
1596  }
1597
1598  return SDValue();
1599}
1600
/// get_vec_i8imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 8-bit constant, and if so, return the
/// constant.
///
/// @note: The incoming vector is v16i8 because that's the only way we can load
/// constant vectors. Thus, we test to see if the upper and lower bytes are the
/// same value.
SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
                             EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    int Value = (int) CN->getZExtValue();
    // i16 case: the splat byte appears doubled (high byte == low byte).
    // The arithmetic shift on a (short) sign-extends the high byte while
    // the mask zero-extends the low byte, so the comparison also rejects
    // byte values with the top bit set (e.g. 0xffff fails).
    // NOTE(review): presumably intentional for a "signed 8-bit" immediate,
    // but worth confirming against the instruction selector's expectations.
    if (ValueType == MVT::i16
        && Value <= 0xffff                 /* truncated from uint64_t */
        && ((short) Value >> 8) == ((short) Value & 0xff))
      return DAG.getTargetConstant(Value & 0xff, ValueType);
    // i8 case: accept any value that already fits in a single byte.
    else if (ValueType == MVT::i8
             && (Value & 0xff) == Value)
      return DAG.getTargetConstant(Value, ValueType);
  }

  return SDValue();
}
1623
/// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
/// and the value fits into a signed 16-bit constant, and if so, return the
/// constant.
///
/// The constant returned is the value shifted right 16 bits, i.e. the
/// halfword an ILHU-style (load-halfword-upper) immediate would carry.
SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
                               EVT ValueType) {
  if (ConstantSDNode *CN = getVecImm(N)) {
    uint64_t Value = CN->getZExtValue();
    // Accept only values whose low 16 bits are zero.
    // NOTE(review): in the i64 arm the mask 0xffff0000 is only 32 bits wide,
    // so only splat values with the upper 32 bits clear can ever match --
    // confirm this is the intended behavior for v2i64 splats (the other
    // get_vec_* helpers fold the two 32-bit halves first; this one doesn't).
    if ((ValueType == MVT::i32
          && ((unsigned) Value & 0xffff0000) == (unsigned) Value)
        || (ValueType == MVT::i64 && (Value & 0xffff0000) == Value))
      return DAG.getTargetConstant(Value >> 16, ValueType);
  }

  return SDValue();
}
1639
1640/// get_v4i32_imm - Catch-all for general 32-bit constant vectors
1641SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
1642  if (ConstantSDNode *CN = getVecImm(N)) {
1643    return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
1644  }
1645
1646  return SDValue();
1647}
1648
1649/// get_v4i32_imm - Catch-all for general 64-bit constant vectors
1650SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
1651  if (ConstantSDNode *CN = getVecImm(N)) {
1652    return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
1653  }
1654
1655  return SDValue();
1656}
1657
//! Lower a BUILD_VECTOR instruction creatively:
//!
//! Only constant-splat vectors are handled here. The splat value is
//! re-materialized as the BUILD_VECTOR form the instruction selector can
//! match for the element type; anything that isn't a constant splat (or
//! whose splat is wider than requested) returns an empty SDValue so the
//! caller can fall back to other lowering.
static SDValue
LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();
  BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
  unsigned minSplatBits = EltVT.getSizeInBits();

  // Never ask for a splat narrower than 16 bits; the v16i8 case below
  // expands 8-bit splats to 16-bit constants anyway.
  if (minSplatBits < 16)
    minSplatBits = 16;

  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;

  if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                            HasAnyUndefs, minSplatBits)
      || minSplatBits < SplatBitSize)
    return SDValue();   // Wasn't a constant vector or splat exceeded min

  uint64_t SplatBits = APSplatBits.getZExtValue();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    report_fatal_error("CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = " +
                       Twine(VT.getEVTString()));
    /*NOTREACHED*/
  case MVT::v4f32: {
    uint32_t Value32 = uint32_t(SplatBits);
    assert(SplatBitSize == 32
           && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(Value32, MVT::i32);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
  }
  case MVT::v2f64: {
    uint64_t f64val = uint64_t(SplatBits);
    assert(SplatBitSize == 64
           && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
    SDValue T = DAG.getConstant(f64val, MVT::i64);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v2f64,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
  }
  case MVT::v16i8: {
   // 8-bit constants have to be expanded to 16-bits
   unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
   SmallVector<SDValue, 8> Ops;

   Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
  }
  case MVT::v8i16: {
    unsigned short Value16 = SplatBits;
    SDValue T = DAG.getConstant(Value16, EltVT);
    SmallVector<SDValue, 8> Ops;

    Ops.assign(8, T);
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
  }
  case MVT::v4i32: {
    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
  }
  case MVT::v2i64: {
    // 64-bit splats may require a shuffle-based synthesis; handled separately.
    return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
  }
  }
}
1731
/*!
 Lower a 64-bit constant splat (SplatVal) to a v2i64 vector.

 If the two 32-bit halves of the splat value are equal, the constant is a
 plain v4i32 splat matchable by IL/ILA/etc. If a half is one of the
 "special" patterns a shuffle control word can synthesize by itself
 (0, 0xffffffff, 0x80000000), it is produced via SHUFB; if both halves
 are special, the whole constant is emitted as a BUILD_VECTOR (eventually
 a constant pool load).
 */
SDValue
SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
                     DebugLoc dl) {
  uint32_t upper = uint32_t(SplatVal >> 32);
  uint32_t lower = uint32_t(SplatVal);

  if (upper == lower) {
    // Magic constant that can be matched by IL, ILA, et. al.
    SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
    return DAG.getNode(ISD::BITCAST, dl, OpVT,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                   Val, Val, Val, Val));
  } else {
    bool upper_special, lower_special;

    // NOTE: This code creates common-case shuffle masks that can be easily
    // detected as common expressions. It is not attempting to create highly
    // specialized masks to replace any and all 0's, 0xff's and 0x80's.

    // Detect if the upper or lower half is a special shuffle mask pattern:
    upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
    lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);

    // Both upper and lower are special, lower to a constant pool load:
    if (lower_special && upper_special) {
      SDValue UpperVal = DAG.getConstant(upper, MVT::i32);
      SDValue LowerVal = DAG.getConstant(lower, MVT::i32);
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                         UpperVal, LowerVal, UpperVal, LowerVal);
      return DAG.getNode(ISD::BITCAST, dl, OpVT, BV);
    }

    SDValue LO32;
    SDValue HI32;
    SmallVector<SDValue, 16> ShufBytes;
    SDValue Result;              // NOTE(review): unused; left for fidelity

    // Create lower vector if not a special pattern
    if (!lower_special) {
      SDValue LO32C = DAG.getConstant(lower, MVT::i32);
      LO32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                     LO32C, LO32C, LO32C, LO32C));
    }

    // Create upper vector if not a special pattern
    if (!upper_special) {
      SDValue HI32C = DAG.getConstant(upper, MVT::i32);
      HI32 = DAG.getNode(ISD::BITCAST, dl, OpVT,
                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                     HI32C, HI32C, HI32C, HI32C));
    }

    // If either upper or lower are special, then the two input operands are
    // the same (basically, one of them is a "don't care")
    if (lower_special)
      LO32 = HI32;
    if (upper_special)
      HI32 = LO32;

    // Build the 16-byte shuffle control word one 32-bit word (i) at a time,
    // one byte (j) per inner iteration. Even words ((i & 1) == 0) correspond
    // to the upper half of each 64-bit element, odd words to the lower half.
    for (int i = 0; i < 4; ++i) {
      uint64_t val = 0;
      for (int j = 0; j < 4; ++j) {
        SDValue V;               // NOTE(review): unused; left for fidelity
        bool process_upper, process_lower;
        val <<= 8;
        process_upper = (upper_special && (i & 1) == 0);
        process_lower = (lower_special && (i & 1) == 1);

        if (process_upper || process_lower) {
          // Special halves are synthesized by the control byte itself
          // (0x80 / 0xc0 / 0xe0 families) rather than selecting a source
          // byte -- see the SPU shufb definition for the exact encodings.
          if ((process_upper && upper == 0)
                  || (process_lower && lower == 0))
            val |= 0x80;
          else if ((process_upper && upper == 0xffffffff)
                  || (process_lower && lower == 0xffffffff))
            val |= 0xc0;
          else if ((process_upper && upper == 0x80000000)
                  || (process_lower && lower == 0x80000000))
            val |= (j == 0 ? 0xe0 : 0x80);
        } else
          // Ordinary byte: select from the two shuffle inputs (bytes 0-15
          // address the first operand, 16-31 the second).
          val |= i * 4 + j + ((i & 1) * 16);
      }

      ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
    }

    return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                   &ShufBytes[0], ShufBytes.size()));
  }
}
1825
/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
/// which the Cell can operate. The code inspects V3 to ascertain whether the
/// permutation vector, V3, is monotonically increasing with one "exception"
/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
/// In either case, the net result is going to eventually invoke SHUFB to
/// permute/shuffle the bytes from V1 and V2.
/// \note
/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
/// control word for byte/halfword/word insertion. This takes care of a single
/// element move from V2 into V1.
/// \note
/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();

  // An undef second operand shuffles within V1 only.
  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;

  // If we have a single element being moved from V1 to V2, this can be handled
  // using the C*[DX] compute mask instructions, but the vector elements have
  // to be monotonically increasing with one exception element, and the source
  // slot of the element to move must be the same as the destination.
  EVT VecVT = V1.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  unsigned EltsFromV2 = 0;    // number of mask elements referencing V2
  unsigned V2EltOffset = 0;   // byte offset of the single element taken from V2
  unsigned V2EltIdx0 = 0;     // first mask index that refers to V2
  unsigned CurrElt = 0;       // expected next index for the monotonic pattern
  unsigned MaxElts = VecVT.getVectorNumElements();
  unsigned PrevElt = 0;
  bool monotonic = true;      // mask is 0,1,...,k',...,n-1 with one V2 element?
  bool rotate = true;         // mask is a pure byte rotation of V1?
  int rotamt=0;
  EVT maskVT;             // which of the c?d instructions to use

  if (EltVT == MVT::i8) {
    V2EltIdx0 = 16;
    maskVT = MVT::v16i8;
  } else if (EltVT == MVT::i16) {
    V2EltIdx0 = 8;
    maskVT = MVT::v8i16;
  } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
    V2EltIdx0 = 4;
    maskVT = MVT::v4i32;
  } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
    V2EltIdx0 = 2;
    maskVT = MVT::v2i64;
  } else
    llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");

  // Single pass over the mask, tracking both candidate patterns at once.
  for (unsigned i = 0; i != MaxElts; ++i) {
    if (SVN->getMaskElt(i) < 0)
      continue;     // undef lane: compatible with any pattern

    unsigned SrcElt = SVN->getMaskElt(i);

    if (monotonic) {
      if (SrcElt >= V2EltIdx0) {
        // Element taken from V2: only one is allowed, and it must land in
        // the slot it came from.
        // TODO: optimize for the monotonic case when several consecutive
        // elements are taken form V2. Do we ever get such a case?
        if (EltsFromV2 == 0 && CurrElt == (SrcElt - V2EltIdx0))
          V2EltOffset = (SrcElt - V2EltIdx0) * (EltVT.getSizeInBits()/8);
        else
          monotonic = false;
        ++EltsFromV2;
      } else if (CurrElt != SrcElt) {
        monotonic = false;
      }

      ++CurrElt;
    }

    if (rotate) {
      if (PrevElt > 0 && SrcElt < MaxElts) {
        // Continue an in-progress rotation: indices must increase by one,
        // wrapping from MaxElts-1 back to 0.
        if ((PrevElt == SrcElt - 1)
            || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
          PrevElt = SrcElt;
        } else {
          rotate = false;
        }
      } else if (i == 0 || (PrevElt==0 && SrcElt==1)) {
        // First time or after a "wrap around"
        // NOTE: rotamt may come out negative here; it is normalized below.
        rotamt = SrcElt-i;
        PrevElt = SrcElt;
      } else {
        // This isn't a rotation, takes elements from vector 2
        rotate = false;
      }
    }
  }

  if (EltsFromV2 == 1 && monotonic) {
    // Compute mask and shuffle
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

    // As SHUFFLE_MASK becomes a c?d instruction, feed it an address
    // R1 ($sp) is used here only as it is guaranteed to have last bits zero
    SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                DAG.getRegister(SPU::R1, PtrVT),
                                DAG.getConstant(V2EltOffset, MVT::i32));
    SDValue ShufMaskOp = DAG.getNode(SPUISD::SHUFFLE_MASK, dl,
                                     maskVT, Pointer);

    // Use shuffle mask in SHUFB synthetic instruction:
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
                       ShufMaskOp);
  } else if (rotate) {
    // Normalize a negative rotation, then convert elements -> bytes.
    if (rotamt < 0)
      rotamt +=MaxElts;
    rotamt *= EltVT.getSizeInBits()/8;
    return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
                       V1, DAG.getConstant(rotamt, MVT::i16));
  } else {
   // Convert the SHUFFLE_VECTOR mask's input element units to the
   // actual bytes.
    unsigned BytesPerElement = EltVT.getSizeInBits()/8;

    SmallVector<SDValue, 16> ResultMask;
    for (unsigned i = 0, e = MaxElts; i != e; ++i) {
      // Undef lanes pick byte 0 arbitrarily.
      unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);

      for (unsigned j = 0; j < BytesPerElement; ++j)
        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
    }
    SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
                                    &ResultMask[0], ResultMask.size());
    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
  }
}
1958
1959static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
1960  SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
1961  DebugLoc dl = Op.getDebugLoc();
1962
1963  if (Op0.getNode()->getOpcode() == ISD::Constant) {
1964    // For a constant, build the appropriate constant vector, which will
1965    // eventually simplify to a vector register load.
1966
1967    ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
1968    SmallVector<SDValue, 16> ConstVecValues;
1969    EVT VT;
1970    size_t n_copies;
1971
1972    // Create a constant vector:
1973    switch (Op.getValueType().getSimpleVT().SimpleTy) {
1974    default: llvm_unreachable("Unexpected constant value type in "
1975                              "LowerSCALAR_TO_VECTOR");
1976    case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
1977    case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
1978    case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
1979    case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
1980    case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
1981    case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
1982    }
1983
1984    SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
1985    for (size_t j = 0; j < n_copies; ++j)
1986      ConstVecValues.push_back(CValue);
1987
1988    return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
1989                       &ConstVecValues[0], ConstVecValues.size());
1990  } else {
1991    // Otherwise, copy the value from one register to another:
1992    switch (Op0.getValueType().getSimpleVT().SimpleTy) {
1993    default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
1994    case MVT::i8:
1995    case MVT::i16:
1996    case MVT::i32:
1997    case MVT::i64:
1998    case MVT::f32:
1999    case MVT::f64:
2000      return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
2001    }
2002  }
2003}
2004
//! Lower EXTRACT_VECTOR_ELT for Cell SPU.
//!
//! Constant index: build a SHUFB mask that moves the requested element into
//! the type's preferred slot (element 0 of an i32/i64 vector needs no
//! shuffle at all). Variable index: byte-shift the vector so the element
//! lands at byte 0, then replicate it across the register.
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDValue N = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  SDValue retval;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
    // Constant argument:
    int EltNo = (int) C->getZExtValue();

    // sanity checks:
    // NOTE(review): the messages read "> N" but the checks fire at ">= N";
    // the bounds themselves are correct for a 128-bit register.
    if (VT == MVT::i8 && EltNo >= 16)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
    else if (VT == MVT::i16 && EltNo >= 8)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
    else if (VT == MVT::i32 && EltNo >= 4)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
    else if (VT == MVT::i64 && EltNo >= 2)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");

    if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
      // i32 and i64: Element 0 is the preferred slot
      return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
    }

    // Need to generate shuffle mask and extract:
    int prefslot_begin = -1, prefslot_end = -1;  // preferred slot byte range
    int elt_byte = EltNo * VT.getSizeInBits() / 8;

    switch (VT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Invalid value type!");
    case MVT::i8: {
      prefslot_begin = prefslot_end = 3;
      break;
    }
    case MVT::i16: {
      prefslot_begin = 2; prefslot_end = 3;
      break;
    }
    case MVT::i32:
    case MVT::f32: {
      prefslot_begin = 0; prefslot_end = 3;
      break;
    }
    case MVT::i64:
    case MVT::f64: {
      prefslot_begin = 0; prefslot_end = 7;
      break;
    }
    }

    assert(prefslot_begin != -1 && prefslot_end != -1 &&
           "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");

    unsigned int ShufBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    for (int i = 0; i < 16; ++i) {
      // zero fill upper part of preferred slot, don't care about the
      // other slots:
      unsigned int mask_val;
      if (i <= prefslot_end) {
        mask_val =
          ((i < prefslot_begin)
           ? 0x80                                // control byte producing 0x00
           : elt_byte + (i - prefslot_begin));   // select the source byte

        ShufBytes[i] = mask_val;
      } else
        // Past the preferred slot: repeat the pattern (don't-care bytes).
        ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
    }

    // Pack the 16 control bytes into the four i32 words of the mask vector.
    SDValue ShufMask[4];
    for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
      unsigned bidx = i * 4;
      unsigned int bits = ((ShufBytes[bidx] << 24) |
                           (ShufBytes[bidx+1] << 16) |
                           (ShufBytes[bidx+2] << 8) |
                           ShufBytes[bidx+3]);
      ShufMask[i] = DAG.getConstant(bits, MVT::i32);
    }

    SDValue ShufMaskVec =
      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                  &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));

    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
                         DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
                                     N, N, ShufMaskVec));
  } else {
    // Variable index: Rotate the requested element into slot 0, then replicate
    // slot 0 across the vector
    EVT VecVT = N.getValueType();
    if (!VecVT.isSimple() || !VecVT.isVector()) {
      report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
                        "vector type!");
    }

    // Make life easier by making sure the index is zero-extended to i32
    if (Elt.getValueType() != MVT::i32)
      Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);

    // Scale the index to a bit/byte shift quantity
    // (bytes-per-element is a power of two, so logBase2 gives the shift).
    APInt scaleFactor =
            APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
    unsigned scaleShift = scaleFactor.logBase2();
    SDValue vecShift;

    if (scaleShift > 0) {
      // Scale the shift factor:
      Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
                        DAG.getConstant(scaleShift, MVT::i32));
    }

    vecShift = DAG.getNode(SPUISD::SHL_BYTES, dl, VecVT, N, Elt);

    // Replicate the bytes starting at byte 0 across the entire vector (for
    // consistency with the notion of a unified register set)
    SDValue replicate;

    switch (VT.getSimpleVT().SimpleTy) {
    default:
      report_fatal_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
                        "type");
      /*NOTREACHED*/
    case MVT::i8: {
      SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i16: {
      SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i32:
    case MVT::f32: {
      SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i64:
    case MVT::f64: {
      SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
      SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              loFactor, hiFactor, loFactor, hiFactor);
      break;
    }
    }

    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
                         DAG.getNode(SPUISD::SHUFB, dl, VecVT,
                                     vecShift, vecShift, replicate));
  }

  return retval;
}
2167
2168static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
2169  SDValue VecOp = Op.getOperand(0);
2170  SDValue ValOp = Op.getOperand(1);
2171  SDValue IdxOp = Op.getOperand(2);
2172  DebugLoc dl = Op.getDebugLoc();
2173  EVT VT = Op.getValueType();
2174  EVT eltVT = ValOp.getValueType();
2175
2176  // use 0 when the lane to insert to is 'undef'
2177  int64_t Offset=0;
2178  if (IdxOp.getOpcode() != ISD::UNDEF) {
2179    ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
2180    assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
2181    Offset = (CN->getSExtValue()) * eltVT.getSizeInBits()/8;
2182  }
2183
2184  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
2185  // Use $sp ($1) because it's always 16-byte aligned and it's available:
2186  SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
2187                                DAG.getRegister(SPU::R1, PtrVT),
2188                                DAG.getConstant(Offset, PtrVT));
2189  // widen the mask when dealing with half vectors
2190  EVT maskVT = EVT::getVectorVT(*(DAG.getContext()), VT.getVectorElementType(),
2191                                128/ VT.getVectorElementType().getSizeInBits());
2192  SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, maskVT, Pointer);
2193
2194  SDValue result =
2195    DAG.getNode(SPUISD::SHUFB, dl, VT,
2196                DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
2197                VecOp,
2198                DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ShufMask));
2199
2200  return result;
2201}
2202
2203static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
2204                           const TargetLowering &TLI)
2205{
2206  SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
2207  DebugLoc dl = Op.getDebugLoc();
2208  EVT ShiftVT = TLI.getShiftAmountTy(N0.getValueType());
2209
2210  assert(Op.getValueType() == MVT::i8);
2211  switch (Opc) {
2212  default:
2213    llvm_unreachable("Unhandled i8 math operator");
2214  case ISD::ADD: {
2215    // 8-bit addition: Promote the arguments up to 16-bits and truncate
2216    // the result:
2217    SDValue N1 = Op.getOperand(1);
2218    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2219    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2220    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2221                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2222
2223  }
2224
2225  case ISD::SUB: {
2226    // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
2227    // the result:
2228    SDValue N1 = Op.getOperand(1);
2229    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2230    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2231    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2232                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2233  }
2234  case ISD::ROTR:
2235  case ISD::ROTL: {
2236    SDValue N1 = Op.getOperand(1);
2237    EVT N1VT = N1.getValueType();
2238
2239    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2240    if (!N1VT.bitsEq(ShiftVT)) {
2241      unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
2242                       ? ISD::ZERO_EXTEND
2243                       : ISD::TRUNCATE;
2244      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2245    }
2246
2247    // Replicate lower 8-bits into upper 8:
2248    SDValue ExpandArg =
2249      DAG.getNode(ISD::OR, dl, MVT::i16, N0,
2250                  DAG.getNode(ISD::SHL, dl, MVT::i16,
2251                              N0, DAG.getConstant(8, MVT::i32)));
2252
2253    // Truncate back down to i8
2254    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2255                       DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
2256  }
2257  case ISD::SRL:
2258  case ISD::SHL: {
2259    SDValue N1 = Op.getOperand(1);
2260    EVT N1VT = N1.getValueType();
2261
2262    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
2263    if (!N1VT.bitsEq(ShiftVT)) {
2264      unsigned N1Opc = ISD::ZERO_EXTEND;
2265
2266      if (N1.getValueType().bitsGT(ShiftVT))
2267        N1Opc = ISD::TRUNCATE;
2268
2269      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2270    }
2271
2272    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2273                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2274  }
2275  case ISD::SRA: {
2276    SDValue N1 = Op.getOperand(1);
2277    EVT N1VT = N1.getValueType();
2278
2279    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2280    if (!N1VT.bitsEq(ShiftVT)) {
2281      unsigned N1Opc = ISD::SIGN_EXTEND;
2282
2283      if (N1VT.bitsGT(ShiftVT))
2284        N1Opc = ISD::TRUNCATE;
2285      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
2286    }
2287
2288    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2289                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2290  }
2291  case ISD::MUL: {
2292    SDValue N1 = Op.getOperand(1);
2293
2294    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
2295    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
2296    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
2297                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
2298  }
2299  }
2300}
2301
//! Lower byte immediate operations for v16i8 vectors:
/*!
  Tries to match (and/or/xor <vector>, <splat-constant>) so it can be selected
  as one of SPU's byte-immediate forms (ANDBI/ORBI/XORBI) by rebuilding the
  constant operand as a BUILD_VECTOR of 16 identical i8 target constants.
  If no splat constant is found, the original operation is returned unchanged.
 */
static SDValue
LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
  SDValue ConstVec;        // candidate splat-constant operand
  SDValue Arg;             // the other (non-constant) operand
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  // Locate the BUILD_VECTOR operand: it may be operand 0 or operand 1, and
  // may be hidden behind a BITCAST. If operand 0 is neither, the operands
  // are swapped and the same peeling is tried on operand 1.
  ConstVec = Op.getOperand(0);
  Arg = Op.getOperand(1);
  if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
    if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
      ConstVec = ConstVec.getOperand(0);
    } else {
      ConstVec = Op.getOperand(1);
      Arg = Op.getOperand(0);
      if (ConstVec.getNode()->getOpcode() == ISD::BITCAST) {
        ConstVec = ConstVec.getOperand(0);
      }
    }
  }

  if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
    BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
    assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");

    APInt APSplatBits, APSplatUndef;
    unsigned SplatBitSize;
    bool HasAnyUndefs;
    unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();

    // Only rewrite when the constant really is a splat at least as wide as
    // the vector's element type.
    if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                              HasAnyUndefs, minSplatBits)
        && minSplatBits <= SplatBitSize) {
      uint64_t SplatBits = APSplatBits.getZExtValue();
      // Keep only the low byte: the byte-immediate instructions replicate a
      // single byte across the register.
      SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);

      SmallVector<SDValue, 16> tcVec;
      tcVec.assign(16, tc);
      return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
                         DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
    }
  }

  // These operations (AND, OR, XOR) are legal, they just couldn't be custom
  // lowered.  Return the operation, rather than a null SDValue.
  return Op;
}
2350
2351//! Custom lowering for CTPOP (count population)
2352/*!
2353  Custom lowering code that counts the number ones in the input
2354  operand. SPU has such an instruction, but it counts the number of
2355  ones per byte, which then have to be accumulated.
2356*/
2357static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
2358  EVT VT = Op.getValueType();
2359  EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
2360                               VT, (128 / VT.getSizeInBits()));
2361  DebugLoc dl = Op.getDebugLoc();
2362
2363  switch (VT.getSimpleVT().SimpleTy) {
2364  default: llvm_unreachable("Invalid value type!");
2365  case MVT::i8: {
2366    SDValue N = Op.getOperand(0);
2367    SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2368
2369    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2370    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2371
2372    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
2373  }
2374
2375  case MVT::i16: {
2376    MachineFunction &MF = DAG.getMachineFunction();
2377    MachineRegisterInfo &RegInfo = MF.getRegInfo();
2378
2379    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);
2380
2381    SDValue N = Op.getOperand(0);
2382    SDValue Elt0 = DAG.getConstant(0, MVT::i16);
2383    SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
2384    SDValue Shift1 = DAG.getConstant(8, MVT::i32);
2385
2386    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2387    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2388
2389    // CNTB_result becomes the chain to which all of the virtual registers
2390    // CNTB_reg, SUM1_reg become associated:
2391    SDValue CNTB_result =
2392      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);
2393
2394    SDValue CNTB_rescopy =
2395      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2396
2397    SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);
2398
2399    return DAG.getNode(ISD::AND, dl, MVT::i16,
2400                       DAG.getNode(ISD::ADD, dl, MVT::i16,
2401                                   DAG.getNode(ISD::SRL, dl, MVT::i16,
2402                                               Tmp1, Shift1),
2403                                   Tmp1),
2404                       Mask0);
2405  }
2406
2407  case MVT::i32: {
2408    MachineFunction &MF = DAG.getMachineFunction();
2409    MachineRegisterInfo &RegInfo = MF.getRegInfo();
2410
2411    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2412    unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
2413
2414    SDValue N = Op.getOperand(0);
2415    SDValue Elt0 = DAG.getConstant(0, MVT::i32);
2416    SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
2417    SDValue Shift1 = DAG.getConstant(16, MVT::i32);
2418    SDValue Shift2 = DAG.getConstant(8, MVT::i32);
2419
2420    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
2421    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);
2422
2423    // CNTB_result becomes the chain to which all of the virtual registers
2424    // CNTB_reg, SUM1_reg become associated:
2425    SDValue CNTB_result =
2426      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);
2427
2428    SDValue CNTB_rescopy =
2429      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);
2430
2431    SDValue Comp1 =
2432      DAG.getNode(ISD::SRL, dl, MVT::i32,
2433                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
2434                  Shift1);
2435
2436    SDValue Sum1 =
2437      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
2438                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));
2439
2440    SDValue Sum1_rescopy =
2441      DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);
2442
2443    SDValue Comp2 =
2444      DAG.getNode(ISD::SRL, dl, MVT::i32,
2445                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
2446                  Shift2);
2447    SDValue Sum2 =
2448      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
2449                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));
2450
2451    return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
2452  }
2453
2454  case MVT::i64:
2455    break;
2456  }
2457
2458  return SDValue();
2459}
2460
2461//! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
2462/*!
2463 f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
2464 All conversions to i64 are expanded to a libcall.
2465 */
2466static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
2467                              const SPUTargetLowering &TLI) {
2468  EVT OpVT = Op.getValueType();
2469  SDValue Op0 = Op.getOperand(0);
2470  EVT Op0VT = Op0.getValueType();
2471
2472  if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
2473      || OpVT == MVT::i64) {
2474    // Convert f32 / f64 to i32 / i64 via libcall.
2475    RTLIB::Libcall LC =
2476            (Op.getOpcode() == ISD::FP_TO_SINT)
2477             ? RTLIB::getFPTOSINT(Op0VT, OpVT)
2478             : RTLIB::getFPTOUINT(Op0VT, OpVT);
2479    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
2480    SDValue Dummy;
2481    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2482  }
2483
2484  return Op;
2485}
2486
2487//! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
2488/*!
2489 i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
2490 All conversions from i64 are expanded to a libcall.
2491 */
2492static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
2493                              const SPUTargetLowering &TLI) {
2494  EVT OpVT = Op.getValueType();
2495  SDValue Op0 = Op.getOperand(0);
2496  EVT Op0VT = Op0.getValueType();
2497
2498  if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
2499      || Op0VT == MVT::i64) {
2500    // Convert i32, i64 to f64 via libcall:
2501    RTLIB::Libcall LC =
2502            (Op.getOpcode() == ISD::SINT_TO_FP)
2503             ? RTLIB::getSINTTOFP(Op0VT, OpVT)
2504             : RTLIB::getUINTTOFP(Op0VT, OpVT);
2505    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
2506    SDValue Dummy;
2507    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
2508  }
2509
2510  return Op;
2511}
2512
2513//! Lower ISD::SETCC
2514/*!
2515 This handles MVT::f64 (double floating point) condition lowering
2516 */
2517static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
2518                          const TargetLowering &TLI) {
2519  CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
2520  DebugLoc dl = Op.getDebugLoc();
2521  assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
2522
2523  SDValue lhs = Op.getOperand(0);
2524  SDValue rhs = Op.getOperand(1);
2525  EVT lhsVT = lhs.getValueType();
2526  assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
2527
2528  EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
2529  APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2530  EVT IntVT(MVT::i64);
2531
2532  // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
2533  // selected to a NOP:
2534  SDValue i64lhs = DAG.getNode(ISD::BITCAST, dl, IntVT, lhs);
2535  SDValue lhsHi32 =
2536          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2537                      DAG.getNode(ISD::SRL, dl, IntVT,
2538                                  i64lhs, DAG.getConstant(32, MVT::i32)));
2539  SDValue lhsHi32abs =
2540          DAG.getNode(ISD::AND, dl, MVT::i32,
2541                      lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
2542  SDValue lhsLo32 =
2543          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
2544
2545  // SETO and SETUO only use the lhs operand:
2546  if (CC->get() == ISD::SETO) {
2547    // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
2548    // SETUO
2549    APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
2550    return DAG.getNode(ISD::XOR, dl, ccResultVT,
2551                       DAG.getSetCC(dl, ccResultVT,
2552                                    lhs, DAG.getConstantFP(0.0, lhsVT),
2553                                    ISD::SETUO),
2554                       DAG.getConstant(ccResultAllOnes, ccResultVT));
2555  } else if (CC->get() == ISD::SETUO) {
2556    // Evaluates to true if Op0 is [SQ]NaN
2557    return DAG.getNode(ISD::AND, dl, ccResultVT,
2558                       DAG.getSetCC(dl, ccResultVT,
2559                                    lhsHi32abs,
2560                                    DAG.getConstant(0x7ff00000, MVT::i32),
2561                                    ISD::SETGE),
2562                       DAG.getSetCC(dl, ccResultVT,
2563                                    lhsLo32,
2564                                    DAG.getConstant(0, MVT::i32),
2565                                    ISD::SETGT));
2566  }
2567
2568  SDValue i64rhs = DAG.getNode(ISD::BITCAST, dl, IntVT, rhs);
2569  SDValue rhsHi32 =
2570          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
2571                      DAG.getNode(ISD::SRL, dl, IntVT,
2572                                  i64rhs, DAG.getConstant(32, MVT::i32)));
2573
2574  // If a value is negative, subtract from the sign magnitude constant:
2575  SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
2576
2577  // Convert the sign-magnitude representation into 2's complement:
2578  SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2579                                      lhsHi32, DAG.getConstant(31, MVT::i32));
2580  SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
2581  SDValue lhsSelect =
2582          DAG.getNode(ISD::SELECT, dl, IntVT,
2583                      lhsSelectMask, lhsSignMag2TC, i64lhs);
2584
2585  SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
2586                                      rhsHi32, DAG.getConstant(31, MVT::i32));
2587  SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
2588  SDValue rhsSelect =
2589          DAG.getNode(ISD::SELECT, dl, IntVT,
2590                      rhsSelectMask, rhsSignMag2TC, i64rhs);
2591
2592  unsigned compareOp;
2593
2594  switch (CC->get()) {
2595  case ISD::SETOEQ:
2596  case ISD::SETUEQ:
2597    compareOp = ISD::SETEQ; break;
2598  case ISD::SETOGT:
2599  case ISD::SETUGT:
2600    compareOp = ISD::SETGT; break;
2601  case ISD::SETOGE:
2602  case ISD::SETUGE:
2603    compareOp = ISD::SETGE; break;
2604  case ISD::SETOLT:
2605  case ISD::SETULT:
2606    compareOp = ISD::SETLT; break;
2607  case ISD::SETOLE:
2608  case ISD::SETULE:
2609    compareOp = ISD::SETLE; break;
2610  case ISD::SETUNE:
2611  case ISD::SETONE:
2612    compareOp = ISD::SETNE; break;
2613  default:
2614    report_fatal_error("CellSPU ISel Select: unimplemented f64 condition");
2615  }
2616
2617  SDValue result =
2618          DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
2619                       (ISD::CondCode) compareOp);
2620
2621  if ((CC->get() & 0x8) == 0) {
2622    // Ordered comparison:
2623    SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
2624                                  lhs, DAG.getConstantFP(0.0, MVT::f64),
2625                                  ISD::SETO);
2626    SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
2627                                  rhs, DAG.getConstantFP(0.0, MVT::f64),
2628                                  ISD::SETO);
2629    SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
2630
2631    result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
2632  }
2633
2634  return result;
2635}
2636
2637//! Lower ISD::SELECT_CC
2638/*!
2639  ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
2640  SELB instruction.
2641
2642  \note Need to revisit this in the future: if the code path through the true
2643  and false value computations is longer than the latency of a branch (6
2644  cycles), then it would be more advantageous to branch and insert a new basic
2645  block and branch on the condition. However, this code does not make that
2646  assumption, given the simplisitc uses so far.
2647 */
2648
2649static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
2650                              const TargetLowering &TLI) {
2651  EVT VT = Op.getValueType();
2652  SDValue lhs = Op.getOperand(0);
2653  SDValue rhs = Op.getOperand(1);
2654  SDValue trueval = Op.getOperand(2);
2655  SDValue falseval = Op.getOperand(3);
2656  SDValue condition = Op.getOperand(4);
2657  DebugLoc dl = Op.getDebugLoc();
2658
2659  // NOTE: SELB's arguments: $rA, $rB, $mask
2660  //
2661  // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
2662  // where bits in $mask are 1. CCond will be inverted, having 1s where the
2663  // condition was true and 0s where the condition was false. Hence, the
2664  // arguments to SELB get reversed.
2665
2666  // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
2667  // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
2668  // with another "cannot select select_cc" assert:
2669
2670  SDValue compare = DAG.getNode(ISD::SETCC, dl,
2671                                TLI.getSetCCResultType(Op.getValueType()),
2672                                lhs, rhs, condition);
2673  return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
2674}
2675
2676//! Custom lower ISD::TRUNCATE
2677static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
2678{
2679  // Type to truncate to
2680  EVT VT = Op.getValueType();
2681  MVT simpleVT = VT.getSimpleVT();
2682  EVT VecVT = EVT::getVectorVT(*DAG.getContext(),
2683                               VT, (128 / VT.getSizeInBits()));
2684  DebugLoc dl = Op.getDebugLoc();
2685
2686  // Type to truncate from
2687  SDValue Op0 = Op.getOperand(0);
2688  EVT Op0VT = Op0.getValueType();
2689
2690  if (Op0VT == MVT::i128 && simpleVT == MVT::i64) {
2691    // Create shuffle mask, least significant doubleword of quadword
2692    unsigned maskHigh = 0x08090a0b;
2693    unsigned maskLow = 0x0c0d0e0f;
2694    // Use a shuffle to perform the truncation
2695    SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
2696                                   DAG.getConstant(maskHigh, MVT::i32),
2697                                   DAG.getConstant(maskLow, MVT::i32),
2698                                   DAG.getConstant(maskHigh, MVT::i32),
2699                                   DAG.getConstant(maskLow, MVT::i32));
2700
2701    SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
2702                                       Op0, Op0, shufMask);
2703
2704    return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
2705  }
2706
2707  return SDValue();             // Leave the truncate unmolested
2708}
2709
/*!
 * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
 * algorithm is to duplicate the sign bit using rotmai to generate at
 * least one byte full of sign bits. Then propagate the "sign-byte" into
 * the leftmost words and the i64/i32 into the rightmost words using shufb.
 *
 * @param Op The sext operand
 * @param DAG The current DAG
 * @return The SDValue with the entire instruction sequence
 */
static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
{
  DebugLoc dl = Op.getDebugLoc();

  // Type to extend to
  MVT OpVT = Op.getValueType().getSimpleVT();

  // Type to extend from
  SDValue Op0 = Op.getOperand(0);
  MVT Op0VT = Op0.getValueType().getSimpleVT();

  // extend i8 & i16 via i32
  if (Op0VT == MVT::i8 || Op0VT == MVT::i16) {
    Op0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Op0);
    Op0VT = MVT::i32;
  }

  // The type to extend to needs to be a i128 and
  // the type to extend from needs to be i64 or i32.
  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
          "LowerSIGN_EXTEND: input and/or output operand have wrong size");
  (void)OpVT;

  // Create shuffle mask. 0x10101010 presumably selects the replicated
  // sign byte from the second SHUFB operand (sraVal); the other selectors
  // pick the source value's own bytes -- confirm against the SHUFB byte
  // selector encoding.
  unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask1, MVT::i32),
                                 DAG.getConstant(mask2, MVT::i32),
                                 DAG.getConstant(mask3, MVT::i32));

  // Word wise arithmetic right shift to generate at least one byte
  // that contains sign bits.
  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
  SDValue sraVal = DAG.getNode(ISD::SRA,
                 dl,
                 mvt,
                 DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
                 DAG.getConstant(31, MVT::i32));

  // reinterpret as a i128 (SHUFB requires it). This gets lowered away.
  // The COPY_TO_REGCLASS machine node retags the value into GPRC without
  // emitting any real instruction.
  SDValue extended = SDValue(DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                        dl, Op0VT, Op0,
                                        DAG.getTargetConstant(
                                                  SPU::GPRCRegClass.getID(),
                                                  MVT::i32)), 0);
  // Shuffle bytes - Copy the sign bits into the upper 64 bits
  // and the input value into the lower 64 bits.
  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
        extended, sraVal, shufMask);
  return DAG.getNode(ISD::BITCAST, dl, MVT::i128, extShuffle);
}
2774
2775//! Custom (target-specific) lowering entry point
2776/*!
2777  This is where LLVM's DAG selection process calls to do target-specific
2778  lowering of nodes.
2779 */
2780SDValue
2781SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
2782{
2783  unsigned Opc = (unsigned) Op.getOpcode();
2784  EVT VT = Op.getValueType();
2785
2786  switch (Opc) {
2787  default: {
2788#ifndef NDEBUG
2789    errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
2790    errs() << "Op.getOpcode() = " << Opc << "\n";
2791    errs() << "*Op.getNode():\n";
2792    Op.getNode()->dump();
2793#endif
2794    llvm_unreachable(0);
2795  }
2796  case ISD::LOAD:
2797  case ISD::EXTLOAD:
2798  case ISD::SEXTLOAD:
2799  case ISD::ZEXTLOAD:
2800    return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
2801  case ISD::STORE:
2802    return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
2803  case ISD::ConstantPool:
2804    return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
2805  case ISD::GlobalAddress:
2806    return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
2807  case ISD::JumpTable:
2808    return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
2809  case ISD::ConstantFP:
2810    return LowerConstantFP(Op, DAG);
2811
2812  // i8, i64 math ops:
2813  case ISD::ADD:
2814  case ISD::SUB:
2815  case ISD::ROTR:
2816  case ISD::ROTL:
2817  case ISD::SRL:
2818  case ISD::SHL:
2819  case ISD::SRA: {
2820    if (VT == MVT::i8)
2821      return LowerI8Math(Op, DAG, Opc, *this);
2822    break;
2823  }
2824
2825  case ISD::FP_TO_SINT:
2826  case ISD::FP_TO_UINT:
2827    return LowerFP_TO_INT(Op, DAG, *this);
2828
2829  case ISD::SINT_TO_FP:
2830  case ISD::UINT_TO_FP:
2831    return LowerINT_TO_FP(Op, DAG, *this);
2832
2833  // Vector-related lowering.
2834  case ISD::BUILD_VECTOR:
2835    return LowerBUILD_VECTOR(Op, DAG);
2836  case ISD::SCALAR_TO_VECTOR:
2837    return LowerSCALAR_TO_VECTOR(Op, DAG);
2838  case ISD::VECTOR_SHUFFLE:
2839    return LowerVECTOR_SHUFFLE(Op, DAG);
2840  case ISD::EXTRACT_VECTOR_ELT:
2841    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2842  case ISD::INSERT_VECTOR_ELT:
2843    return LowerINSERT_VECTOR_ELT(Op, DAG);
2844
2845  // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
2846  case ISD::AND:
2847  case ISD::OR:
2848  case ISD::XOR:
2849    return LowerByteImmed(Op, DAG);
2850
2851  // Vector and i8 multiply:
2852  case ISD::MUL:
2853    if (VT == MVT::i8)
2854      return LowerI8Math(Op, DAG, Opc, *this);
2855
2856  case ISD::CTPOP:
2857    return LowerCTPOP(Op, DAG);
2858
2859  case ISD::SELECT_CC:
2860    return LowerSELECT_CC(Op, DAG, *this);
2861
2862  case ISD::SETCC:
2863    return LowerSETCC(Op, DAG, *this);
2864
2865  case ISD::TRUNCATE:
2866    return LowerTRUNCATE(Op, DAG);
2867
2868  case ISD::SIGN_EXTEND:
2869    return LowerSIGN_EXTEND(Op, DAG);
2870  }
2871
2872  return SDValue();
2873}
2874
void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const
{
  // Intentionally a no-op: SPU does not custom-expand any illegal-typed
  // results here. The #if 0 block below is disabled debugging scaffolding,
  // kept as a template for when a node does need replacing.
#if 0
  unsigned Opc = (unsigned) N->getOpcode();
  EVT OpVT = N->getValueType(0);

  switch (Opc) {
  default: {
    errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
    errs() << "Op.getOpcode() = " << Opc << "\n";
    errs() << "*Op.getNode():\n";
    N->dump();
    abort();
    /*NOTREACHED*/
  }
  }
#endif

  /* Otherwise, return unchanged */
}
2897
2898//===----------------------------------------------------------------------===//
2899// Target Optimization Hooks
2900//===----------------------------------------------------------------------===//
2901
// Target-specific DAG combines for CellSPU. Folds offsets into
// SPUindirect addresses, removes redundant extend/prefslot round trips, and
// kills degenerate (zero-amount) vector shifts. Returns an empty SDValue
// when no combine applies.
SDValue
SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
{
#if 0
  TargetMachine &TM = getTargetMachine();
#endif
  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
  EVT NodeVT = N->getValueType(0);      // The node's value type
  EVT Op0VT = Op0.getValueType();       // The first operand's result
  SDValue Result;                       // Initially, empty result
  DebugLoc dl = N->getDebugLoc();

  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD: {
    SDValue Op1 = N->getOperand(1);

    // Fold constant additions into SPUindirect address arithmetic.
    if (Op0.getOpcode() == SPUISD::IndirectAddr
        || Op1.getOpcode() == SPUISD::IndirectAddr) {
      // Normalize the operands to reduce repeated code
      SDValue IndirectArg = Op0, AddArg = Op1;

      if (Op1.getOpcode() == SPUISD::IndirectAddr) {
        IndirectArg = Op1;
        AddArg = Op0;
      }

      if (isa<ConstantSDNode>(AddArg)) {
        ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
        SDValue IndOp1 = IndirectArg.getOperand(1);

        if (CN0->isNullValue()) {
          // (add (SPUindirect <arg>, <arg>), 0) ->
          // (SPUindirect <arg>, <arg>)

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return IndirectArg;
        } else if (isa<ConstantSDNode>(IndOp1)) {
          // (add (SPUindirect <arg>, <const>), <const>) ->
          // (SPUindirect <arg>, <const + const>)
          ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
          int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
          SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
                 << "), " << CN0->getSExtValue() << ")\n"
                 << "With:    (SPUindirect <arg>, "
                 << combinedConst << ")\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             IndirectArg, combinedValue);
        }
      }
    }
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: {
    // Extending a value just read from a vector's preferred slot is a no-op
    // when the types already agree.
    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
      // (any_extend (SPUextract_elt0 <arg>)) ->
      // (SPUextract_elt0 <arg>)
      // Types must match, however...
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "\nReplace: ";
        N->dump(&DAG);
        errs() << "\nWith:    ";
        Op0.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

      return Op0;
    }
    break;
  }
  case SPUISD::IndirectAddr: {
    // A-form addresses need no indirection when not using large memory.
    if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
      if (CN != 0 && CN->isNullValue()) {
        // (SPUindirect (SPUaform <addr>, 0), 0) ->
        // (SPUaform <addr>, 0)

        DEBUG(errs() << "Replace: ");
        DEBUG(N->dump(&DAG));
        DEBUG(errs() << "\nWith:    ");
        DEBUG(Op0.getNode()->dump(&DAG));
        DEBUG(errs() << "\n");

        return Op0;
      }
    } else if (Op0.getOpcode() == ISD::ADD) {
      SDValue Op1 = N->getOperand(1);
      if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
        // (SPUindirect (add <arg>, <arg>), 0) ->
        // (SPUindirect <arg>, <arg>)
        if (CN1->isNullValue()) {

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             Op0.getOperand(0), Op0.getOperand(1));
        }
      }
    }
    break;
  }
  case SPUISD::SHL_BITS:
  case SPUISD::SHL_BYTES:
  case SPUISD::ROTBYTES_LEFT: {
    SDValue Op1 = N->getOperand(1);

    // Kill degenerate vector shifts:
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
      if (CN->isNullValue()) {
        Result = Op0;
      }
    }
    break;
  }
  case SPUISD::PREFSLOT2VEC: {
    // Remove round trips through the preferred slot when the scalar came
    // out of a vector in the first place.
    switch (Op0.getOpcode()) {
    default:
      break;
    case ISD::ANY_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND: {
      // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
      // <arg>
      // but only if the SPUprefslot2vec and <arg> types match.
      SDValue Op00 = Op0.getOperand(0);
      if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
        SDValue Op000 = Op00.getOperand(0);
        if (Op000.getValueType() == NodeVT) {
          Result = Op000;
        }
      }
      break;
    }
    case SPUISD::VEC2PREFSLOT: {
      // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
      // <arg>
      Result = Op0.getOperand(0);
      break;
    }
    }
    break;
  }
  }

  // Otherwise, return unchanged.
#ifndef NDEBUG
  if (Result.getNode()) {
    DEBUG(errs() << "\nReplace.SPU: ");
    DEBUG(N->dump(&DAG));
    DEBUG(errs() << "\nWith:        ");
    DEBUG(Result.getNode()->dump(&DAG));
    DEBUG(errs() << "\n");
  }
#endif

  return Result;
}
3086
3087//===----------------------------------------------------------------------===//
3088// Inline Assembly Support
3089//===----------------------------------------------------------------------===//
3090
3091/// getConstraintType - Given a constraint letter, return the type of
3092/// constraint it is for this target.
3093SPUTargetLowering::ConstraintType
3094SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
3095  if (ConstraintLetter.size() == 1) {
3096    switch (ConstraintLetter[0]) {
3097    default: break;
3098    case 'b':
3099    case 'r':
3100    case 'f':
3101    case 'v':
3102    case 'y':
3103      return C_RegisterClass;
3104    }
3105  }
3106  return TargetLowering::getConstraintType(ConstraintLetter);
3107}
3108
3109/// Examine constraint type and operand type and determine a weight value.
3110/// This object must already have been set up with the operand type
3111/// and the current alternative constraint selected.
3112TargetLowering::ConstraintWeight
3113SPUTargetLowering::getSingleConstraintMatchWeight(
3114    AsmOperandInfo &info, const char *constraint) const {
3115  ConstraintWeight weight = CW_Invalid;
3116  Value *CallOperandVal = info.CallOperandVal;
3117    // If we don't have a value, we can't do a match,
3118    // but allow it at the lowest weight.
3119  if (CallOperandVal == NULL)
3120    return CW_Default;
3121  // Look at the constraint type.
3122  switch (*constraint) {
3123  default:
3124    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
3125    break;
3126    //FIXME: Seems like the supported constraint letters were just copied
3127    // from PPC, as the following doesn't correspond to the GCC docs.
3128    // I'm leaving it so until someone adds the corresponding lowering support.
3129  case 'b':
3130  case 'r':
3131  case 'f':
3132  case 'd':
3133  case 'v':
3134  case 'y':
3135    weight = CW_Register;
3136    break;
3137  }
3138  return weight;
3139}
3140
3141std::pair<unsigned, const TargetRegisterClass*>
3142SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
3143                                                EVT VT) const
3144{
3145  if (Constraint.size() == 1) {
3146    // GCC RS6000 Constraint Letters
3147    switch (Constraint[0]) {
3148    case 'b':   // R1-R31
3149    case 'r':   // R0-R31
3150      if (VT == MVT::i64)
3151        return std::make_pair(0U, SPU::R64CRegisterClass);
3152      return std::make_pair(0U, SPU::R32CRegisterClass);
3153    case 'f':
3154      if (VT == MVT::f32)
3155        return std::make_pair(0U, SPU::R32FPRegisterClass);
3156      else if (VT == MVT::f64)
3157        return std::make_pair(0U, SPU::R64FPRegisterClass);
3158      break;
3159    case 'v':
3160      return std::make_pair(0U, SPU::GPRCRegisterClass);
3161    }
3162  }
3163
3164  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
3165}
3166
3167//! Compute used/known bits for a SPU operand
3168void
3169SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
3170                                                  const APInt &Mask,
3171                                                  APInt &KnownZero,
3172                                                  APInt &KnownOne,
3173                                                  const SelectionDAG &DAG,
3174                                                  unsigned Depth ) const {
3175#if 0
3176  const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;
3177
3178  switch (Op.getOpcode()) {
3179  default:
3180    // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
3181    break;
3182  case CALL:
3183  case SHUFB:
3184  case SHUFFLE_MASK:
3185  case CNTB:
3186  case SPUISD::PREFSLOT2VEC:
3187  case SPUISD::LDRESULT:
3188  case SPUISD::VEC2PREFSLOT:
3189  case SPUISD::SHLQUAD_L_BITS:
3190  case SPUISD::SHLQUAD_L_BYTES:
3191  case SPUISD::VEC_ROTL:
3192  case SPUISD::VEC_ROTR:
3193  case SPUISD::ROTBYTES_LEFT:
3194  case SPUISD::SELECT_MASK:
3195  case SPUISD::SELB:
3196  }
3197#endif
3198}
3199
3200unsigned
3201SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
3202                                                   unsigned Depth) const {
3203  switch (Op.getOpcode()) {
3204  default:
3205    return 1;
3206
3207  case ISD::SETCC: {
3208    EVT VT = Op.getValueType();
3209
3210    if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
3211      VT = MVT::i32;
3212    }
3213    return VT.getSizeInBits();
3214  }
3215  }
3216}
3217
// LowerAsmOperandForConstraint - Lower the inline-asm operand Op for the
// given constraint string, appending any resulting operands to Ops.
void
SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                std::string &Constraint,
                                                std::vector<SDValue> &Ops,
                                                SelectionDAG &DAG) const {
  // Default, for the time being, to the base class handler; no SPU-specific
  // constraint codes are handled here.
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
3227
3228/// isLegalAddressImmediate - Return true if the integer value can be used
3229/// as the offset of the target addressing mode.
3230bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
3231                                                Type *Ty) const {
3232  // SPU's addresses are 256K:
3233  return (V > -(1 << 18) && V < (1 << 18) - 1);
3234}
3235
/// isLegalAddressImmediate - GlobalValue overload: a global is never usable
/// directly as an address immediate here, so always reject it.
bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
  return false;
}
3239
/// isOffsetFoldingLegal - Return true if folding a constant offset into the
/// given global address node would be legal; always false for this target.
bool
SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The SPU target isn't yet aware of offsets.
  return false;
}
3245
// isLegalICmpImmediate - can we compare to Imm without first writing it
// into a register?
bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  // ceqi, cgti, etc. all take a signed 10-bit (s10) immediate operand, so
  // any value representable in 10 signed bits can be compared directly.
  return isInt<10>(Imm);
}
3251
3252bool
3253SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
3254                                         Type * ) const{
3255
3256  // A-form: 18bit absolute address.
3257  if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
3258    return true;
3259
3260  // D-form: reg + 14bit offset
3261  if (AM.BaseGV ==0 && AM.HasBaseReg && AM.Scale == 0 && isInt<14>(AM.BaseOffs))
3262    return true;
3263
3264  // X-form: reg+reg
3265  if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 1 && AM.BaseOffs ==0)
3266    return true;
3267
3268  return false;
3269}
3270