1/*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "nv50_ir_target_nvc0.h"
24
25namespace nv50_ir {
26
27Target *getTargetNVC0(unsigned int chipset)
28{
29   return new TargetNVC0(chipset);
30}
31
32TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4)
33{
34   chipset = card;
35   initOpInfo();
36}
37
// BUILTINS / LIBRARY FUNCTIONS:
39
// laziness -> will just hardcode everything for the time being
41
42// Will probably make this nicer once we support subroutines properly,
43// i.e. when we have an input IR that provides function declarations.
44
45// TODO: separate version for nve4+ which doesn't like the 4-byte insn formats
// Pre-assembled machine code for the built-in library routines, stored as
// 32-bit words. Most instructions occupy two words (8 bytes); a few use the
// 4-byte short form and are listed one word per line, always in pairs so that
// every routine is a whole number of 8-byte slots.
//
// The routines are laid out back to back in this order:
//   DIV U32 (26 slots), DIV S32 (23 slots), RCP F64 (9 slots),
//   RSQ F64 (14 slots)
// nvc0_builtin_offsets hard-codes the start of each routine from these counts,
// so any change to a routine's length must be mirrored there. The size check
// below the array catches a mismatch at compile time.
//
// The trailing "// 0x........," remarks are kept from the original table; they
// appear to be the 4-byte short-form encodings of the same instructions.
static const uint32_t nvc0_builtin_code[] =
{
// DIV U32: slow unsigned integer division
//
// UNR recurrence (q = a / b):
// look for z such that 2^32 - b <= b * z < 2^32
// then q - 1 <= (a * z) / 2^32 <= q
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p1
// SIZE:    26 * 8 bytes (active variant) / 14 * 8 bytes (disabled #else variant)
//
#if 1
   0x04009c03, 0x78000000,
   0x7c209c82, 0x38000000, // 0x7c209cdd,
   0x0400dde2, 0x18000000, // 0x0010dd18,
   0x08309c03, 0x60000000,
   0x05205d04, 0x1c000000, // 0x05605c18,
   0x0810dc03, 0x50000000, // 0x0810dc2a,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0810dc03, 0x50000000,
   0x0c209c43, 0x20040000,
   0x0000dde4, 0x28000000,
   0x08001c43, 0x50000000,
   0x05209d04, 0x1c000000, // 0x05609c18,
   0x00105c03, 0x20060000, // 0x0010430d,
   0x0811dc03, 0x1b0e0000,
   0x08104103, 0x48000000,
   0x04000002, 0x08000000,
   0x0811c003, 0x1b0e0000,
   0x08104103, 0x48000000,
   0x04000002, 0x08000000, // 0x040000ac,
   0x00001de7, 0x90000000, // 0x90001dff,
#else
   // Alternative variant that mixes in 4-byte instruction forms. It is only
   // 14 slots long, so enabling it requires updating nvc0_builtin_offsets and
   // the size check below.
   0x0401dc03, 0x1b0e0000,
   0x00008003, 0x78000000,
   0x0400c003, 0x78000000,
   0x0c20c103, 0x48000000,
   0x0c108003, 0x60000000,
   0x00005c28,
   0x00001d18,
   0x0031c023, 0x1b0ec000,
   0xb000a1e7, 0x40000000,
   0x04000003, 0x6000c000,
   0x0813dc03, 0x1b000000,
   0x0420446c,
   0x040004bd,
   0x04208003, 0x5800c000,
   0x0430c103, 0x4800c000,
   0x0ffc5dff,
   0x90001dff,
#endif

// DIV S32: slow signed integer division
//
// INPUT:   $r0: dividend, $r1: divisor
// OUTPUT:  $r0: result, $r1: modulus
// CLOBBER: $r2 - $r3, $p0 - $p3
// SIZE:    23 * 8 bytes
//
   0xfc05dc23, 0x188e0000,
   0xfc17dc23, 0x18c40000,
   0x01201ec4, 0x1c000000, // 0x03301e18,
   0x05205ec4, 0x1c000000, // 0x07305e18,
   0x0401dc03, 0x1b0e0000,
   0x00008003, 0x78000000,
   0x0400c003, 0x78000000,
   0x0c20c103, 0x48000000,
   0x0c108003, 0x60000000,
   0x00005de4, 0x28000000, // 0x00005c28,
   0x00001de2, 0x18000000, // 0x00001d18,
   0x0031c023, 0x1b0ec000,
   0xe000a1e7, 0x40000000, // 0xb000a1e7, 0x40000000,
   0x04000003, 0x6000c000,
   0x0813dc03, 0x1b000000,
   0x04204603, 0x48000000, // 0x0420446c,
   0x04000442, 0x38000000, // 0x040004bd,
   0x04208003, 0x5800c000,
   0x0430c103, 0x4800c000,
   0xe0001de7, 0x4003fffe, // 0x0ffc5dff,
   0x01200f84, 0x1c000000, // 0x01700e18,
   0x05204b84, 0x1c000000, // 0x05704a18,
   0x00001de7, 0x90000000, // 0x90001dff,

// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rcp(x))
// CLOBBER: $r2 - $r7
// SIZE:    9 * 8 bytes
//
   0x9810dc08,
   0x00009c28,
   0x4001df18,
   0x00019d18,
   0x08011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08011e01, 0x200c0000,
   0x10201c01, 0x50000000,
   0x00001de7, 0x90000000,

// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
//
// INPUT:   $r0d (x)
// OUTPUT:  $r0d (rsqrt(x))
// CLOBBER: $r2 - $r7
// SIZE:    14 * 8 bytes
//
   0x9c10dc08,
   0x00009c28,
   0x00019d18,
   0x3fe1df18,
   0x18001c01, 0x50000000,
   0x0001dde2, 0x18ffe000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10209c01, 0x50000000,
   0x08211c01, 0x50000000,
   0x10011e01, 0x200c0000,
   0x10201c01, 0x50000000,
   0x00001de7, 0x90000000,
};

// Compile-time guard: the blob must contain exactly the
// 26 + 23 + 9 + 14 eight-byte slots that nvc0_builtin_offsets is built from.
// A negative array size makes the build fail if a routine is edited (or the
// #if above is flipped) without updating the offsets.
typedef char nvc0_builtin_code_size_check[
   (sizeof(nvc0_builtin_code) == 8 * (26 + 23 + 9 + 14)) ? 1 : -1];
180
181static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
182{
183   0,
184   8 * (26),
185   8 * (26 + 23),
186   8 * (26 + 23 + 9)
187};
188
189void
190TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const
191{
192   *code = &nvc0_builtin_code[0];
193   *size = sizeof(nvc0_builtin_code);
194}
195
196uint32_t
197TargetNVC0::getBuiltinOffset(int builtin) const
198{
199   assert(builtin < NVC0_BUILTIN_COUNT);
200   return nvc0_builtin_offsets[builtin];
201}
202
203struct opProperties
204{
205   operation op;
206   unsigned int mNeg   : 4;
207   unsigned int mAbs   : 4;
208   unsigned int mNot   : 4;
209   unsigned int mSat   : 4;
210   unsigned int fConst : 3;
211   unsigned int fImmd  : 4; // last bit indicates if full immediate is suppoted
212};
213
214static const struct opProperties _initProps[] =
215{
216   //           neg  abs  not  sat  c[]  imm
217   { OP_ADD,    0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 },
218   { OP_SUB,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 },
219   { OP_MUL,    0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 },
220   { OP_MAX,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
221   { OP_MIN,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
222   { OP_MAD,    0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint
223   { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
224   { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
225   { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
226   { OP_CEIL,   0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
227   { OP_FLOOR,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
228   { OP_TRUNC,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
229   { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
230   { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
231   { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
232   { OP_SHL,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
233   { OP_SHR,    0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
234   { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
235   { OP_SLCT,   0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint
236   { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
237   { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
238   { OP_COS,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
239   { OP_SIN,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
240   { OP_EX2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
241   { OP_LG2,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
242   { OP_RCP,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
243   { OP_RSQ,    0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
244   { OP_DFDX,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
245   { OP_DFDY,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
246   { OP_CALL,   0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
247   { OP_INSBF,  0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
248   { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
249   { OP_SET_OR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
250   { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
251   // saturate only:
252   { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
253   { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
254};
255
256void TargetNVC0::initOpInfo()
257{
258   unsigned int i, j;
259
260   static const uint32_t commutative[(OP_LAST + 31) / 32] =
261   {
262      // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN
263      0x0670ca00, 0x0000003f, 0x00000000
264   };
265
266   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
267   {
268      // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV
269      0x0670ca00, 0x00000000, 0x00000000
270   };
271
272   static const operation noDest[] =
273   {
274      OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
275      OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
276      OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
277      OP_QUADON, OP_QUADPOP, OP_TEXBAR
278   };
279
280   for (i = 0; i < DATA_FILE_COUNT; ++i)
281      nativeFileMap[i] = (DataFile)i;
282   nativeFileMap[FILE_ADDRESS] = FILE_GPR;
283
284   for (i = 0; i < OP_LAST; ++i) {
285      opInfo[i].variants = NULL;
286      opInfo[i].op = (operation)i;
287      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
288      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
289      opInfo[i].immdBits = 0;
290      opInfo[i].srcNr = operationSrcNr[i];
291
292      for (j = 0; j < opInfo[i].srcNr; ++j) {
293         opInfo[i].srcMods[j] = 0;
294         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
295      }
296      opInfo[i].dstMods = 0;
297      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
298
299      opInfo[i].hasDest = 1;
300      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
301      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
302      opInfo[i].pseudo = (i < OP_MOV);
303      opInfo[i].predicate = !opInfo[i].pseudo;
304      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
305      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
306   }
307   for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i)
308      opInfo[noDest[i]].hasDest = 0;
309
310   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
311      const struct opProperties *prop = &_initProps[i];
312
313      for (int s = 0; s < 3; ++s) {
314         if (prop->mNeg & (1 << s))
315            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
316         if (prop->mAbs & (1 << s))
317            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
318         if (prop->mNot & (1 << s))
319            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
320         if (prop->fConst & (1 << s))
321            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
322         if (prop->fImmd & (1 << s))
323            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
324         if (prop->fImmd & 8)
325            opInfo[prop->op].immdBits = 0xffffffff;
326      }
327      if (prop->mSat & 8)
328         opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
329   }
330}
331
332unsigned int
333TargetNVC0::getFileSize(DataFile file) const
334{
335   switch (file) {
336   case FILE_NULL:          return 0;
337   case FILE_GPR:           return 63;
338   case FILE_PREDICATE:     return 7;
339   case FILE_FLAGS:         return 1;
340   case FILE_ADDRESS:       return 0;
341   case FILE_IMMEDIATE:     return 0;
342   case FILE_MEMORY_CONST:  return 65536;
343   case FILE_SHADER_INPUT:  return 0x400;
344   case FILE_SHADER_OUTPUT: return 0x400;
345   case FILE_MEMORY_GLOBAL: return 0xffffffff;
346   case FILE_MEMORY_SHARED: return 16 << 10;
347   case FILE_MEMORY_LOCAL:  return 48 << 10;
348   case FILE_SYSTEM_VALUE:  return 32;
349   default:
350      assert(!"invalid file");
351      return 0;
352   }
353}
354
355unsigned int
356TargetNVC0::getFileUnit(DataFile file) const
357{
358   if (file == FILE_GPR || file == FILE_ADDRESS || file == FILE_SYSTEM_VALUE)
359      return 2;
360   return 0;
361}
362
363uint32_t
364TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
365{
366   const int idx = sym->reg.data.sv.index;
367   const SVSemantic sv = sym->reg.data.sv.sv;
368
369   const bool isInput = shaderFile == FILE_SHADER_INPUT;
370
371   switch (sv) {
372   case SV_POSITION:       return 0x070 + idx * 4;
373   case SV_INSTANCE_ID:    return 0x2f8;
374   case SV_VERTEX_ID:      return 0x2fc;
375   case SV_PRIMITIVE_ID:   return isInput ? 0x060 : 0x040;
376   case SV_LAYER:          return 0x064;
377   case SV_VIEWPORT_INDEX: return 0x068;
378   case SV_POINT_SIZE:     return 0x06c;
379   case SV_CLIP_DISTANCE:  return 0x2c0 + idx * 4;
380   case SV_POINT_COORD:    return 0x2e0 + idx * 4;
381   case SV_FACE:           return 0x3fc;
382   case SV_TESS_FACTOR:    return 0x000 + idx * 4;
383   case SV_TESS_COORD:     return 0x2f0 + idx * 4;
384   default:
385      return 0xffffffff;
386   }
387}
388
389bool
390TargetNVC0::insnCanLoad(const Instruction *i, int s,
391                        const Instruction *ld) const
392{
393   DataFile sf = ld->src(0).getFile();
394
395   // immediate 0 can be represented by GPR $r63
396   if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
397      return (!i->asTex() && i->op != OP_EXPORT && i->op != OP_STORE);
398
399   if (s >= opInfo[i->op].srcNr)
400      return false;
401   if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
402      return false;
403
404   // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
405   if (ld->src(0).isIndirect(0))
406      return false;
407
408   for (int k = 0; i->srcExists(k); ++k) {
409      if (i->src(k).getFile() == FILE_IMMEDIATE) {
410         if (i->getSrc(k)->reg.data.u64 != 0)
411            return false;
412      } else
413      if (i->src(k).getFile() != FILE_GPR &&
414          i->src(k).getFile() != FILE_PREDICATE) {
415         return false;
416      }
417   }
418
419   // not all instructions support full 32 bit immediates
420   if (sf == FILE_IMMEDIATE) {
421      Storage &reg = ld->getSrc(0)->asImm()->reg;
422
423      if (opInfo[i->op].immdBits != 0xffffffff) {
424         if (i->sType == TYPE_F32) {
425            if (reg.data.u32 & 0xfff)
426               return false;
427         } else
428         if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
429            // with u32, 0xfffff counts as 0xffffffff as well
430            if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
431               return false;
432         }
433      } else
434      if (i->op == OP_MAD || i->op == OP_FMA) {
435         // requires src == dst, cannot decide before RA
436         // (except if we implement more constraints)
437         if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff)
438            return false;
439      }
440   }
441
442   return true;
443}
444
445bool
446TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
447{
448   if (ty == TYPE_NONE)
449      return false;
450   if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ?
451      return typeSizeof(ty) <= 8;
452   if (ty == TYPE_B96)
453      return (file == FILE_SHADER_INPUT) || (file == FILE_SHADER_OUTPUT);
454   return true;
455}
456
457bool
458TargetNVC0::isOpSupported(operation op, DataType ty) const
459{
460   if ((op == OP_MAD || op == OP_FMA) && (ty != TYPE_F32))
461      return false;
462   if (op == OP_SAD && ty != TYPE_S32 && ty != TYPE_U32)
463      return false;
464   if (op == OP_POW || op == OP_SQRT || op == OP_DIV || op == OP_MOD)
465      return false;
466   return true;
467}
468
469bool
470TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
471{
472   if (!isFloatType(insn->dType)) {
473      switch (insn->op) {
474      case OP_ABS:
475      case OP_NEG:
476      case OP_CVT:
477      case OP_CEIL:
478      case OP_FLOOR:
479      case OP_TRUNC:
480      case OP_AND:
481      case OP_OR:
482      case OP_XOR:
483         break;
484      case OP_ADD:
485         if (mod.abs())
486            return false;
487         if (insn->src(s ? 0 : 1).mod.neg())
488            return false;
489         break;
490      case OP_SUB:
491         if (s == 0)
492            return insn->src(1).mod.neg() ? false : true;
493         break;
494      default:
495         return false;
496      }
497   }
498   if (s > 3)
499      return false;
500   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
501}
502
503bool
504TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const
505{
506   if (insn->getPredicate())
507      return false;
508   return opInfo[insn->op].predicate;
509}
510
511bool
512TargetNVC0::isSatSupported(const Instruction *insn) const
513{
514   if (insn->op == OP_CVT)
515      return true;
516   if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT))
517      return false;
518
519   if (insn->dType == TYPE_U32)
520      return (insn->op == OP_ADD) || (insn->op == OP_MAD);
521
522   return insn->dType == TYPE_F32;
523}
524
525bool
526TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const
527{
528   if (op != OP_MUL)
529      return false;
530   f = fabsf(f);
531   e = static_cast<int>(log2f(f));
532   if (e < -3 || e > 3)
533      return false;
534   return f == exp2f(static_cast<float>(e));
535}
536
537// TODO: better values
538// this could be more precise, e.g. depending on the issue-to-read/write delay
539// of the depending instruction, but it's good enough
540int TargetNVC0::getLatency(const Instruction *i) const
541{
542   if (chipset >= 0xe4) {
543      if (i->dType == TYPE_F64 || i->sType == TYPE_F64)
544         return 20;
545      switch (i->op) {
546      case OP_LINTERP:
547      case OP_PINTERP:
548         return 15;
549      case OP_LOAD:
550         if (i->src(0).getFile() == FILE_MEMORY_CONST)
551            return 9;
552         // fall through
553      case OP_VFETCH:
554         return 24;
555      default:
556         if (Target::getOpClass(i->op) == OPCLASS_TEXTURE)
557            return 17;
558         if (i->op == OP_MUL && i->dType != TYPE_F32)
559            return 15;
560         return 9;
561      }
562   } else {
563      if (i->op == OP_LOAD) {
564         if (i->cache == CACHE_CV)
565            return 700;
566         return 48;
567      }
568      return 24;
569   }
570   return 32;
571}
572
573// These are "inverse" throughput values, i.e. the number of cycles required
574// to issue a specific instruction for a full warp (32 threads).
575//
576// Assuming we have more than 1 warp in flight, a higher issue latency results
577// in a lower result latency since the MP will have spent more time with other
578// warps.
579// This also helps to determine the number of cycles between instructions in
580// a single warp.
581//
582int TargetNVC0::getThroughput(const Instruction *i) const
583{
584   // TODO: better values
585   if (i->dType == TYPE_F32) {
586      switch (i->op) {
587      case OP_ADD:
588      case OP_MUL:
589      case OP_MAD:
590      case OP_FMA:
591         return 1;
592      case OP_CVT:
593      case OP_CEIL:
594      case OP_FLOOR:
595      case OP_TRUNC:
596      case OP_SET:
597      case OP_SLCT:
598      case OP_MIN:
599      case OP_MAX:
600         return 2;
601      case OP_RCP:
602      case OP_RSQ:
603      case OP_LG2:
604      case OP_SIN:
605      case OP_COS:
606      case OP_PRESIN:
607      case OP_PREEX2:
608      default:
609         return 8;
610      }
611   } else
612   if (i->dType == TYPE_U32 || i->dType == TYPE_S32) {
613      switch (i->op) {
614      case OP_ADD:
615      case OP_AND:
616      case OP_OR:
617      case OP_XOR:
618      case OP_NOT:
619         return 1;
620      case OP_MUL:
621      case OP_MAD:
622      case OP_CVT:
623      case OP_SET:
624      case OP_SLCT:
625      case OP_SHL:
626      case OP_SHR:
627      case OP_NEG:
628      case OP_ABS:
629      case OP_MIN:
630      case OP_MAX:
631      default:
632         return 2;
633      }
634   } else
635   if (i->dType == TYPE_F64) {
636      return 2;
637   } else {
638      return 1;
639   }
640}
641
642bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const
643{
644   const OpClass clA = operationClass[a->op];
645   const OpClass clB = operationClass[b->op];
646
647   if (getChipset() >= 0xe4) {
648      // not texturing
649      // not if the 2nd instruction isn't necessarily executed
650      if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW)
651         return false;
652      // anything with MOV
653      if (a->op == OP_MOV || b->op == OP_MOV)
654         return true;
655      if (clA == clB) {
656         // only F32 arith or integer additions
657         if (clA != OPCLASS_ARITH)
658            return false;
659         return (a->dType == TYPE_F32 || a->op == OP_ADD ||
660                 b->dType == TYPE_F32 || b->op == OP_ADD);
661      }
662      // nothing with TEXBAR
663      if (a->op == OP_TEXBAR || b->op == OP_TEXBAR)
664         return false;
665      // no loads and stores accessing the the same space
666      if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) ||
667          (clB == OPCLASS_LOAD && clA == OPCLASS_STORE))
668         if (a->src(0).getFile() == b->src(0).getFile())
669            return false;
670      // no > 32-bit ops
671      if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 ||
672          typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4)
673         return false;
674      return true;
675   } else {
676      return false; // info not needed (yet)
677   }
678}
679
680} // namespace nv50_ir
681