1/* 2 * Copyright 2011 Christoph Bumiller 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "nv50_ir_target_nvc0.h" 24 25namespace nv50_ir { 26 27Target *getTargetNVC0(unsigned int chipset) 28{ 29 return new TargetNVC0(chipset); 30} 31 32TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4) 33{ 34 chipset = card; 35 initOpInfo(); 36} 37 38// BULTINS / LIBRARY FUNCTIONS: 39 40// lazyness -> will just hardcode everything for the time being 41 42// Will probably make this nicer once we support subroutines properly, 43// i.e. when we have an input IR that provides function declarations. 44 45// TODO: separate version for nve4+ which doesn't like the 4-byte insn formats 46static const uint32_t nvc0_builtin_code[] = 47{ 48// DIV U32: slow unsigned integer division 49// 50// UNR recurrence (q = a / b): 51// look for z such that 2^32 - b <= b * z < 2^32 52// then q - 1 <= (a * z) / 2^32 <= q 53// 54// INPUT: $r0: dividend, $r1: divisor 55// OUTPUT: $r0: result, $r1: modulus 56// CLOBBER: $r2 - $r3, $p0 - $p1 57// SIZE: 22 / 14 * 8 bytes 58// 59#if 1 60 0x04009c03, 0x78000000, 61 0x7c209c82, 0x38000000, // 0x7c209cdd, 62 0x0400dde2, 0x18000000, // 0x0010dd18, 63 0x08309c03, 0x60000000, 64 0x05205d04, 0x1c000000, // 0x05605c18, 65 0x0810dc03, 0x50000000, // 0x0810dc2a, 66 0x0c209c43, 0x20040000, 67 0x0810dc03, 0x50000000, 68 0x0c209c43, 0x20040000, 69 0x0810dc03, 0x50000000, 70 0x0c209c43, 0x20040000, 71 0x0810dc03, 0x50000000, 72 0x0c209c43, 0x20040000, 73 0x0810dc03, 0x50000000, 74 0x0c209c43, 0x20040000, 75 0x0000dde4, 0x28000000, 76 0x08001c43, 0x50000000, 77 0x05209d04, 0x1c000000, // 0x05609c18, 78 0x00105c03, 0x20060000, // 0x0010430d, 79 0x0811dc03, 0x1b0e0000, 80 0x08104103, 0x48000000, 81 0x04000002, 0x08000000, 82 0x0811c003, 0x1b0e0000, 83 0x08104103, 0x48000000, 84 0x04000002, 0x08000000, // 0x040000ac, 85 0x00001de7, 0x90000000, // 0x90001dff, 86#else 87 0x0401dc03, 0x1b0e0000, 88 0x00008003, 0x78000000, 89 0x0400c003, 0x78000000, 90 0x0c20c103, 0x48000000, 91 0x0c108003, 0x60000000, 92 0x00005c28, 93 0x00001d18, 94 0x0031c023, 0x1b0ec000, 95 0xb000a1e7, 0x40000000, 96 0x04000003, 0x6000c000, 97 0x0813dc03, 0x1b000000, 98 0x0420446c, 99 0x040004bd, 100 0x04208003, 0x5800c000, 101 0x0430c103, 0x4800c000, 102 0x0ffc5dff, 103 0x90001dff, 104#endif 105 106// DIV S32: slow signed integer division 107// 108// INPUT: $r0: dividend, $r1: divisor 109// OUTPUT: $r0: result, $r1: modulus 110// CLOBBER: $r2 - $r3, $p0 - $p3 111// SIZE: 18 * 8 bytes 112// 113 0xfc05dc23, 0x188e0000, 114 0xfc17dc23, 0x18c40000, 115 0x01201ec4, 0x1c000000, // 0x03301e18, 116 0x05205ec4, 0x1c000000, // 0x07305e18, 117 0x0401dc03, 0x1b0e0000, 118 0x00008003, 0x78000000, 119 0x0400c003, 0x78000000, 120 0x0c20c103, 0x48000000, 121 0x0c108003, 0x60000000, 122 0x00005de4, 0x28000000, // 0x00005c28, 123 0x00001de2, 0x18000000, // 0x00001d18, 124 0x0031c023, 0x1b0ec000, 125 0xe000a1e7, 0x40000000, // 0xb000a1e7, 0x40000000, 126 0x04000003, 0x6000c000, 127 0x0813dc03, 0x1b000000, 128 0x04204603, 0x48000000, // 0x0420446c, 129 0x04000442, 0x38000000, // 0x040004bd, 130 0x04208003, 0x5800c000, 131 0x0430c103, 0x4800c000, 132 0xe0001de7, 0x4003fffe, // 0x0ffc5dff, 133 0x01200f84, 0x1c000000, // 0x01700e18, 134 0x05204b84, 0x1c000000, // 0x05704a18, 135 0x00001de7, 0x90000000, // 0x90001dff, 136 137// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) 138// 139// INPUT: $r0d (x) 140// OUTPUT: $r0d (rcp(x)) 141// CLOBBER: $r2 - $r7 142// SIZE: 9 * 8 bytes 143// 144 0x9810dc08, 145 0x00009c28, 146 0x4001df18, 147 0x00019d18, 148 0x08011e01, 0x200c0000, 149 0x10209c01, 0x50000000, 150 0x08011e01, 0x200c0000, 151 0x10209c01, 0x50000000, 152 0x08011e01, 0x200c0000, 153 0x10201c01, 0x50000000, 154 0x00001de7, 0x90000000, 155 156// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) 157// 158// INPUT: $r0d (x) 159// OUTPUT: $r0d (rsqrt(x)) 160// CLOBBER: $r2 - $r7 161// SIZE: 14 * 8 bytes 162// 163 0x9c10dc08, 164 0x00009c28, 165 0x00019d18, 166 0x3fe1df18, 167 0x18001c01, 0x50000000, 168 0x0001dde2, 0x18ffe000, 169 0x08211c01, 0x50000000, 170 0x10011e01, 0x200c0000, 171 0x10209c01, 0x50000000, 172 0x08211c01, 0x50000000, 173 0x10011e01, 0x200c0000, 174 0x10209c01, 0x50000000, 175 0x08211c01, 0x50000000, 176 0x10011e01, 0x200c0000, 177 0x10201c01, 0x50000000, 178 0x00001de7, 0x90000000, 179}; 180 181static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] = 182{ 183 0, 184 8 * (26), 185 8 * (26 + 23), 186 8 * (26 + 23 + 9) 187}; 188 189void 190TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const 191{ 192 *code = &nvc0_builtin_code[0]; 193 *size = sizeof(nvc0_builtin_code); 194} 195 196uint32_t 197TargetNVC0::getBuiltinOffset(int builtin) const 198{ 199 assert(builtin < NVC0_BUILTIN_COUNT); 200 return nvc0_builtin_offsets[builtin]; 201} 202 203struct opProperties 204{ 205 operation op; 206 unsigned int mNeg : 4; 207 unsigned int mAbs : 4; 208 unsigned int mNot : 4; 209 unsigned int mSat : 4; 210 unsigned int fConst : 3; 211 unsigned int fImmd : 4; // last bit indicates if full immediate is suppoted 212}; 213 214static const struct opProperties _initProps[] = 215{ 216 // neg abs not sat c[] imm 217 { OP_ADD, 0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 }, 218 { OP_SUB, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 }, 219 { OP_MUL, 0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 }, 220 { OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, 221 { OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, 222 { OP_MAD, 0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint 223 { OP_ABS, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 }, 224 { OP_NEG, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0 }, 225 { OP_CVT, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, 226 { OP_CEIL, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, 227 { OP_FLOOR, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, 228 { OP_TRUNC, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, 229 { OP_AND, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, 230 { OP_OR, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, 231 { OP_XOR, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, 232 { OP_SHL, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 }, 233 { OP_SHR, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 }, 234 { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, 235 { OP_SLCT, 0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint 236 { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 }, 237 { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 }, 238 { OP_COS, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, 239 { OP_SIN, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, 240 { OP_EX2, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, 241 { OP_LG2, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, 242 { OP_RCP, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, 243 { OP_RSQ, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, 244 { OP_DFDX, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 }, 245 { OP_DFDY, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 }, 246 { OP_CALL, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 }, 247 { OP_INSBF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4 }, 248 { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, 249 { OP_SET_OR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, 250 { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, 251 // saturate only: 252 { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 }, 253 { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 }, 254}; 255 256void TargetNVC0::initOpInfo() 257{ 258 unsigned int i, j; 259 260 static const uint32_t commutative[(OP_LAST + 31) / 32] = 261 { 262 // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN 263 0x0670ca00, 0x0000003f, 0x00000000 264 }; 265 266 static const uint32_t shortForm[(OP_LAST + 31) / 32] = 267 { 268 // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV 269 0x0670ca00, 0x00000000, 0x00000000 270 }; 271 272 static const operation noDest[] = 273 { 274 OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT, 275 OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET, 276 OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART, 277 OP_QUADON, OP_QUADPOP, OP_TEXBAR 278 }; 279 280 for (i = 0; i < DATA_FILE_COUNT; ++i) 281 nativeFileMap[i] = (DataFile)i; 282 nativeFileMap[FILE_ADDRESS] = FILE_GPR; 283 284 for (i = 0; i < OP_LAST; ++i) { 285 opInfo[i].variants = NULL; 286 opInfo[i].op = (operation)i; 287 opInfo[i].srcTypes = 1 << (int)TYPE_F32; 288 opInfo[i].dstTypes = 1 << (int)TYPE_F32; 289 opInfo[i].immdBits = 0; 290 opInfo[i].srcNr = operationSrcNr[i]; 291 292 for (j = 0; j < opInfo[i].srcNr; ++j) { 293 opInfo[i].srcMods[j] = 0; 294 opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR; 295 } 296 opInfo[i].dstMods = 0; 297 opInfo[i].dstFiles = 1 << (int)FILE_GPR; 298 299 opInfo[i].hasDest = 1; 300 opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA); 301 opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1; 302 opInfo[i].pseudo = (i < OP_MOV); 303 opInfo[i].predicate = !opInfo[i].pseudo; 304 opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN); 305 opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8; 306 } 307 for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i) 308 opInfo[noDest[i]].hasDest = 0; 309 310 for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) { 311 const struct opProperties *prop = &_initProps[i]; 312 313 for (int s = 0; s < 3; ++s) { 314 if (prop->mNeg & (1 << s)) 315 opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG; 316 if (prop->mAbs & (1 << s)) 317 opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS; 318 if (prop->mNot & (1 << s)) 319 opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT; 320 if (prop->fConst & (1 << s)) 321 opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST; 322 if (prop->fImmd & (1 << s)) 323 opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE; 324 if (prop->fImmd & 8) 325 opInfo[prop->op].immdBits = 0xffffffff; 326 } 327 if (prop->mSat & 8) 328 opInfo[prop->op].dstMods = NV50_IR_MOD_SAT; 329 } 330} 331 332unsigned int 333TargetNVC0::getFileSize(DataFile file) const 334{ 335 switch (file) { 336 case FILE_NULL: return 0; 337 case FILE_GPR: return 63; 338 case FILE_PREDICATE: return 7; 339 case FILE_FLAGS: return 1; 340 case FILE_ADDRESS: return 0; 341 case FILE_IMMEDIATE: return 0; 342 case FILE_MEMORY_CONST: return 65536; 343 case FILE_SHADER_INPUT: return 0x400; 344 case FILE_SHADER_OUTPUT: return 0x400; 345 case FILE_MEMORY_GLOBAL: return 0xffffffff; 346 case FILE_MEMORY_SHARED: return 16 << 10; 347 case FILE_MEMORY_LOCAL: return 48 << 10; 348 case FILE_SYSTEM_VALUE: return 32; 349 default: 350 assert(!"invalid file"); 351 return 0; 352 } 353} 354 355unsigned int 356TargetNVC0::getFileUnit(DataFile file) const 357{ 358 if (file == FILE_GPR || file == FILE_ADDRESS || file == FILE_SYSTEM_VALUE) 359 return 2; 360 return 0; 361} 362 363uint32_t 364TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const 365{ 366 const int idx = sym->reg.data.sv.index; 367 const SVSemantic sv = sym->reg.data.sv.sv; 368 369 const bool isInput = shaderFile == FILE_SHADER_INPUT; 370 371 switch (sv) { 372 case SV_POSITION: return 0x070 + idx * 4; 373 case SV_INSTANCE_ID: return 0x2f8; 374 case SV_VERTEX_ID: return 0x2fc; 375 case SV_PRIMITIVE_ID: return isInput ? 0x060 : 0x040; 376 case SV_LAYER: return 0x064; 377 case SV_VIEWPORT_INDEX: return 0x068; 378 case SV_POINT_SIZE: return 0x06c; 379 case SV_CLIP_DISTANCE: return 0x2c0 + idx * 4; 380 case SV_POINT_COORD: return 0x2e0 + idx * 4; 381 case SV_FACE: return 0x3fc; 382 case SV_TESS_FACTOR: return 0x000 + idx * 4; 383 case SV_TESS_COORD: return 0x2f0 + idx * 4; 384 default: 385 return 0xffffffff; 386 } 387} 388 389bool 390TargetNVC0::insnCanLoad(const Instruction *i, int s, 391 const Instruction *ld) const 392{ 393 DataFile sf = ld->src(0).getFile(); 394 395 // immediate 0 can be represented by GPR $r63 396 if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0) 397 return (!i->asTex() && i->op != OP_EXPORT && i->op != OP_STORE); 398 399 if (s >= opInfo[i->op].srcNr) 400 return false; 401 if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf))) 402 return false; 403 404 // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0 405 if (ld->src(0).isIndirect(0)) 406 return false; 407 408 for (int k = 0; i->srcExists(k); ++k) { 409 if (i->src(k).getFile() == FILE_IMMEDIATE) { 410 if (i->getSrc(k)->reg.data.u64 != 0) 411 return false; 412 } else 413 if (i->src(k).getFile() != FILE_GPR && 414 i->src(k).getFile() != FILE_PREDICATE) { 415 return false; 416 } 417 } 418 419 // not all instructions support full 32 bit immediates 420 if (sf == FILE_IMMEDIATE) { 421 Storage ® = ld->getSrc(0)->asImm()->reg; 422 423 if (opInfo[i->op].immdBits != 0xffffffff) { 424 if (i->sType == TYPE_F32) { 425 if (reg.data.u32 & 0xfff) 426 return false; 427 } else 428 if (i->sType == TYPE_S32 || i->sType == TYPE_U32) { 429 // with u32, 0xfffff counts as 0xffffffff as well 430 if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000) 431 return false; 432 } 433 } else 434 if (i->op == OP_MAD || i->op == OP_FMA) { 435 // requires src == dst, cannot decide before RA 436 // (except if we implement more constraints) 437 if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff) 438 return false; 439 } 440 } 441 442 return true; 443} 444 445bool 446TargetNVC0::isAccessSupported(DataFile file, DataType ty) const 447{ 448 if (ty == TYPE_NONE) 449 return false; 450 if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ? 451 return typeSizeof(ty) <= 8; 452 if (ty == TYPE_B96) 453 return (file == FILE_SHADER_INPUT) || (file == FILE_SHADER_OUTPUT); 454 return true; 455} 456 457bool 458TargetNVC0::isOpSupported(operation op, DataType ty) const 459{ 460 if ((op == OP_MAD || op == OP_FMA) && (ty != TYPE_F32)) 461 return false; 462 if (op == OP_SAD && ty != TYPE_S32 && ty != TYPE_U32) 463 return false; 464 if (op == OP_POW || op == OP_SQRT || op == OP_DIV || op == OP_MOD) 465 return false; 466 return true; 467} 468 469bool 470TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const 471{ 472 if (!isFloatType(insn->dType)) { 473 switch (insn->op) { 474 case OP_ABS: 475 case OP_NEG: 476 case OP_CVT: 477 case OP_CEIL: 478 case OP_FLOOR: 479 case OP_TRUNC: 480 case OP_AND: 481 case OP_OR: 482 case OP_XOR: 483 break; 484 case OP_ADD: 485 if (mod.abs()) 486 return false; 487 if (insn->src(s ? 0 : 1).mod.neg()) 488 return false; 489 break; 490 case OP_SUB: 491 if (s == 0) 492 return insn->src(1).mod.neg() ? false : true; 493 break; 494 default: 495 return false; 496 } 497 } 498 if (s > 3) 499 return false; 500 return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod; 501} 502 503bool 504TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const 505{ 506 if (insn->getPredicate()) 507 return false; 508 return opInfo[insn->op].predicate; 509} 510 511bool 512TargetNVC0::isSatSupported(const Instruction *insn) const 513{ 514 if (insn->op == OP_CVT) 515 return true; 516 if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT)) 517 return false; 518 519 if (insn->dType == TYPE_U32) 520 return (insn->op == OP_ADD) || (insn->op == OP_MAD); 521 522 return insn->dType == TYPE_F32; 523} 524 525bool 526TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const 527{ 528 if (op != OP_MUL) 529 return false; 530 f = fabsf(f); 531 e = static_cast<int>(log2f(f)); 532 if (e < -3 || e > 3) 533 return false; 534 return f == exp2f(static_cast<float>(e)); 535} 536 537// TODO: better values 538// this could be more precise, e.g. depending on the issue-to-read/write delay 539// of the depending instruction, but it's good enough 540int TargetNVC0::getLatency(const Instruction *i) const 541{ 542 if (chipset >= 0xe4) { 543 if (i->dType == TYPE_F64 || i->sType == TYPE_F64) 544 return 20; 545 switch (i->op) { 546 case OP_LINTERP: 547 case OP_PINTERP: 548 return 15; 549 case OP_LOAD: 550 if (i->src(0).getFile() == FILE_MEMORY_CONST) 551 return 9; 552 // fall through 553 case OP_VFETCH: 554 return 24; 555 default: 556 if (Target::getOpClass(i->op) == OPCLASS_TEXTURE) 557 return 17; 558 if (i->op == OP_MUL && i->dType != TYPE_F32) 559 return 15; 560 return 9; 561 } 562 } else { 563 if (i->op == OP_LOAD) { 564 if (i->cache == CACHE_CV) 565 return 700; 566 return 48; 567 } 568 return 24; 569 } 570 return 32; 571} 572 573// These are "inverse" throughput values, i.e. the number of cycles required 574// to issue a specific instruction for a full warp (32 threads). 575// 576// Assuming we have more than 1 warp in flight, a higher issue latency results 577// in a lower result latency since the MP will have spent more time with other 578// warps. 579// This also helps to determine the number of cycles between instructions in 580// a single warp. 581// 582int TargetNVC0::getThroughput(const Instruction *i) const 583{ 584 // TODO: better values 585 if (i->dType == TYPE_F32) { 586 switch (i->op) { 587 case OP_ADD: 588 case OP_MUL: 589 case OP_MAD: 590 case OP_FMA: 591 return 1; 592 case OP_CVT: 593 case OP_CEIL: 594 case OP_FLOOR: 595 case OP_TRUNC: 596 case OP_SET: 597 case OP_SLCT: 598 case OP_MIN: 599 case OP_MAX: 600 return 2; 601 case OP_RCP: 602 case OP_RSQ: 603 case OP_LG2: 604 case OP_SIN: 605 case OP_COS: 606 case OP_PRESIN: 607 case OP_PREEX2: 608 default: 609 return 8; 610 } 611 } else 612 if (i->dType == TYPE_U32 || i->dType == TYPE_S32) { 613 switch (i->op) { 614 case OP_ADD: 615 case OP_AND: 616 case OP_OR: 617 case OP_XOR: 618 case OP_NOT: 619 return 1; 620 case OP_MUL: 621 case OP_MAD: 622 case OP_CVT: 623 case OP_SET: 624 case OP_SLCT: 625 case OP_SHL: 626 case OP_SHR: 627 case OP_NEG: 628 case OP_ABS: 629 case OP_MIN: 630 case OP_MAX: 631 default: 632 return 2; 633 } 634 } else 635 if (i->dType == TYPE_F64) { 636 return 2; 637 } else { 638 return 1; 639 } 640} 641 642bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const 643{ 644 const OpClass clA = operationClass[a->op]; 645 const OpClass clB = operationClass[b->op]; 646 647 if (getChipset() >= 0xe4) { 648 // not texturing 649 // not if the 2nd instruction isn't necessarily executed 650 if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW) 651 return false; 652 // anything with MOV 653 if (a->op == OP_MOV || b->op == OP_MOV) 654 return true; 655 if (clA == clB) { 656 // only F32 arith or integer additions 657 if (clA != OPCLASS_ARITH) 658 return false; 659 return (a->dType == TYPE_F32 || a->op == OP_ADD || 660 b->dType == TYPE_F32 || b->op == OP_ADD); 661 } 662 // nothing with TEXBAR 663 if (a->op == OP_TEXBAR || b->op == OP_TEXBAR) 664 return false; 665 // no loads and stores accessing the the same space 666 if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) || 667 (clB == OPCLASS_LOAD && clA == OPCLASS_STORE)) 668 if (a->src(0).getFile() == b->src(0).getFile()) 669 return false; 670 // no > 32-bit ops 671 if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 || 672 typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4) 673 return false; 674 return true; 675 } else { 676 return false; // info not needed (yet) 677 } 678} 679 680} // namespace nv50_ir 681