nv50_program.c revision 2b963f5c723401aa2646bd48eefe065cd335e280
19681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd/* 29681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * Copyright 2008 Ben Skeggs 3bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 49681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * Permission is hereby granted, free of charge, to any person obtaining a 59681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * copy of this software and associated documentation files (the "Software"), 69681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * to deal in the Software without restriction, including without limitation 79681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * and/or sell copies of the Software, and to permit persons to whom the 99681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * Software is furnished to do so, subject to the following conditions: 109681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * 119681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * The above copyright notice and this permission notice shall be included in 129681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * all copies or substantial portions of the Software. 13bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 149681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 159681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 169681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 179681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 189681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 199681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 201a0fb70d743f900859d9278c6ae01cfc2a993dadLennart Poettering * SOFTWARE. 211a0fb70d743f900859d9278c6ae01cfc2a993dadLennart Poettering */ 221a0fb70d743f900859d9278c6ae01cfc2a993dadLennart Poettering 231a0fb70d743f900859d9278c6ae01cfc2a993dadLennart Poettering#include "pipe/p_context.h" 249681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd#include "pipe/p_defines.h" 25bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#include "pipe/p_state.h" 269681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd#include "pipe/p_inlines.h" 279681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd 28bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#include "pipe/p_shader_tokens.h" 299681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd#include "tgsi/tgsi_parse.h" 30bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#include "tgsi/tgsi_util.h" 31bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering 32bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#include "nv50_context.h" 33bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering 34bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering#define NV50_SU_MAX_TEMP 64 35bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering//#define NV50_PROGRAM_DUMP 36bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering 37bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering/* ARL - gallium craps itself on progs/vp/arl.txt 38bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 39bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * MSB - Like MAD, but MUL+SUB 40bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * - Fuck it off, introduce a way to negate args for ops that 41bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * support it. 42bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 43bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * Look into inlining IMMD for ops other than MOV (make it general?) 44bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, 45bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * but can emit to P_TEMP first - then MOV later. NVIDIA does this 46bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 47bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * In ops such as ADD it's possible to construct a bad opcode in the !is_long() 48bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * case, if the emit_src() causes the inst to suddenly become long. 49bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 50bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * Verify half-insns work where expected - and force disable them where they 51bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * don't work - MUL has it forcibly disabled atm as it fixes POW.. 52bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 53bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * FUCK! watch dst==src vectors, can overwrite components that are needed. 54ccfcd5c42c68752fbd6de318fe5ce4269f5a7c06Lennart Poettering * ie. SUB R0, R0.yzxw, R0 55bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 56bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * Things to check with renouveau: 57bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * FP attr/result assignment - how? 58bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * attrib 59bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * - 0x16bc maps vp output onto fp hpos 60bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * - 0x16c0 maps vp output onto fp col0 61bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * result 62bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * - colr always 0-3 63bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * - depr always 4 64bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x16bc->0x16e8 --> some binding between vp/fp regs 65bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x16b8 --> VP output count 66bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 67bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 68bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * "MOV rcol.x, fcol.y" = 0x00000004 69bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x19a8 --> as above but 0x00000100 and 0x00000000 70bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * - 0x00100000 used when KIL used 71bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x196c --> as above but 0x00000011 and 0x00000000 72bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 73bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * 0x1988 --> 0xXXNNNNNN 74bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering * - XX == FP high something 75bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering */ 76bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poetteringstruct nv50_reg { 77bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering enum { 78bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering P_TEMP, 79bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering P_ATTR, 80bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering P_RESULT, 81bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering P_CONST, 82bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering P_IMMD 83bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering } type; 84bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering int index; 85bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering 86bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering int hw; 87bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering int neg; 88bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering 899681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd int rhw; /* result hw for FP outputs, or interpolant index */ 909681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd int acc; /* instruction where this reg is last read (first insn == 1) */ 91bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering}; 92bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering 939681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloydstruct nv50_pc { 94bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering struct nv50_program *p; 959681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd 96bb8bd5490a71bc77570653cf53be88edd37679e3Lennart Poettering /* hw resources */ 979681c6175cda1ab1bb3bf5b0ffe326f0b80823deTrent Lloyd struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; 98 99 /* tgsi resources */ 100 struct nv50_reg *temp; 101 int temp_nr; 102 struct nv50_reg *attr; 103 int attr_nr; 104 struct nv50_reg *result; 105 int result_nr; 106 struct nv50_reg *param; 107 int param_nr; 108 struct nv50_reg *immd; 109 float *immd_buf; 110 int immd_nr; 111 112 struct nv50_reg *temp_temp[16]; 113 unsigned temp_temp_nr; 114 115 /* broadcast and destination replacement regs */ 116 struct nv50_reg *r_brdc; 117 struct nv50_reg *r_dst[4]; 118 119 unsigned interp_mode[32]; 120 /* perspective interpolation registers */ 121 struct nv50_reg *iv_p; 122 struct nv50_reg *iv_c; 123 124 /* current instruction and total number of insns */ 125 unsigned insn_cur; 126 unsigned insn_nr; 127 128 boolean allow32; 129}; 130 131static void 132alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) 133{ 134 int i = 0; 135 136 if (reg->type == P_RESULT) { 137 if (pc->p->cfg.high_result < (reg->hw + 1)) 138 pc->p->cfg.high_result = reg->hw + 1; 139 } 140 141 if (reg->type != P_TEMP) 142 return; 143 144 if (reg->hw >= 0) { 145 /*XXX: do this here too to catch FP temp-as-attr usage.. 146 * not clean, but works */ 147 if (pc->p->cfg.high_temp < (reg->hw + 1)) 148 pc->p->cfg.high_temp = reg->hw + 1; 149 return; 150 } 151 152 if (reg->rhw != -1) { 153 /* try to allocate temporary with index rhw first */ 154 if (!(pc->r_temp[reg->rhw])) { 155 pc->r_temp[reg->rhw] = reg; 156 reg->hw = reg->rhw; 157 if (pc->p->cfg.high_temp < (reg->rhw + 1)) 158 pc->p->cfg.high_temp = reg->rhw + 1; 159 return; 160 } 161 /* make sure we don't get things like $r0 needs to go 162 * in $r1 and $r1 in $r0 163 */ 164 i = pc->result_nr * 4; 165 } 166 167 for (; i < NV50_SU_MAX_TEMP; i++) { 168 if (!(pc->r_temp[i])) { 169 pc->r_temp[i] = reg; 170 reg->hw = i; 171 if (pc->p->cfg.high_temp < (i + 1)) 172 pc->p->cfg.high_temp = i + 1; 173 return; 174 } 175 } 176 177 assert(0); 178} 179 180static struct nv50_reg * 181alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) 182{ 183 struct nv50_reg *r; 184 int i; 185 186 if (dst && dst->type == P_TEMP && dst->hw == -1) 187 return dst; 188 189 for (i = 0; i < NV50_SU_MAX_TEMP; i++) { 190 if (!pc->r_temp[i]) { 191 r = CALLOC_STRUCT(nv50_reg); 192 r->type = P_TEMP; 193 r->index = -1; 194 r->hw = i; 195 r->rhw = -1; 196 pc->r_temp[i] = r; 197 return r; 198 } 199 } 200 201 assert(0); 202 return NULL; 203} 204 205/* Assign the hw of the discarded temporary register src 206 * to the tgsi register dst and free src. 207 */ 208static void 209assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 210{ 211 assert(src->index == -1 && src->hw != -1); 212 213 if (dst->hw != -1) 214 pc->r_temp[dst->hw] = NULL; 215 pc->r_temp[src->hw] = dst; 216 dst->hw = src->hw; 217 218 FREE(src); 219} 220 221/* release the hardware resource held by r */ 222static void 223release_hw(struct nv50_pc *pc, struct nv50_reg *r) 224{ 225 assert(r->type == P_TEMP); 226 if (r->hw == -1) 227 return; 228 229 assert(pc->r_temp[r->hw] == r); 230 pc->r_temp[r->hw] = NULL; 231 232 r->acc = 0; 233 if (r->index == -1) 234 FREE(r); 235} 236 237static void 238free_temp(struct nv50_pc *pc, struct nv50_reg *r) 239{ 240 if (r->index == -1) { 241 unsigned hw = r->hw; 242 243 FREE(pc->r_temp[hw]); 244 pc->r_temp[hw] = NULL; 245 } 246} 247 248static int 249alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) 250{ 251 int i; 252 253 if ((idx + 4) >= NV50_SU_MAX_TEMP) 254 return 1; 255 256 if (pc->r_temp[idx] || pc->r_temp[idx + 1] || 257 pc->r_temp[idx + 2] || pc->r_temp[idx + 3]) 258 return alloc_temp4(pc, dst, idx + 4); 259 260 for (i = 0; i < 4; i++) { 261 dst[i] = CALLOC_STRUCT(nv50_reg); 262 dst[i]->type = P_TEMP; 263 dst[i]->index = -1; 264 dst[i]->hw = idx + i; 265 pc->r_temp[idx + i] = dst[i]; 266 } 267 268 return 0; 269} 270 271static void 272free_temp4(struct nv50_pc *pc, struct nv50_reg *reg[4]) 273{ 274 int i; 275 276 for (i = 0; i < 4; i++) 277 free_temp(pc, reg[i]); 278} 279 280static struct nv50_reg * 281temp_temp(struct nv50_pc *pc) 282{ 283 if (pc->temp_temp_nr >= 16) 284 assert(0); 285 286 pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); 287 return pc->temp_temp[pc->temp_temp_nr++]; 288} 289 290static void 291kill_temp_temp(struct nv50_pc *pc) 292{ 293 int i; 294 295 for (i = 0; i < pc->temp_temp_nr; i++) 296 free_temp(pc, pc->temp_temp[i]); 297 pc->temp_temp_nr = 0; 298} 299 300static int 301ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) 302{ 303 pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * 4 * sizeof(float)), 304 (pc->immd_nr + 1) * 4 * sizeof(float)); 305 pc->immd_buf[(pc->immd_nr * 4) + 0] = x; 306 pc->immd_buf[(pc->immd_nr * 4) + 1] = y; 307 pc->immd_buf[(pc->immd_nr * 4) + 2] = z; 308 pc->immd_buf[(pc->immd_nr * 4) + 3] = w; 309 310 return pc->immd_nr++; 311} 312 313static struct nv50_reg * 314alloc_immd(struct nv50_pc *pc, float f) 315{ 316 struct nv50_reg *r = CALLOC_STRUCT(nv50_reg); 317 unsigned hw; 318 319 for (hw = 0; hw < pc->immd_nr * 4; hw++) 320 if (pc->immd_buf[hw] == f) 321 break; 322 323 if (hw == pc->immd_nr * 4) 324 hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4; 325 326 r->type = P_IMMD; 327 r->hw = hw; 328 r->index = -1; 329 return r; 330} 331 332static struct nv50_program_exec * 333exec(struct nv50_pc *pc) 334{ 335 struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); 336 337 e->param.index = -1; 338 return e; 339} 340 341static void 342emit(struct nv50_pc *pc, struct nv50_program_exec *e) 343{ 344 struct nv50_program *p = pc->p; 345 346 if (p->exec_tail) 347 p->exec_tail->next = e; 348 if (!p->exec_head) 349 p->exec_head = e; 350 p->exec_tail = e; 351 p->exec_size += (e->inst[0] & 1) ? 2 : 1; 352} 353 354static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); 355 356static boolean 357is_long(struct nv50_program_exec *e) 358{ 359 if (e->inst[0] & 1) 360 return TRUE; 361 return FALSE; 362} 363 364static boolean 365is_immd(struct nv50_program_exec *e) 366{ 367 if (is_long(e) && (e->inst[1] & 3) == 3) 368 return TRUE; 369 return FALSE; 370} 371 372static INLINE void 373set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, 374 struct nv50_program_exec *e) 375{ 376 set_long(pc, e); 377 e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); 378 e->inst[1] |= (pred << 7) | (idx << 12); 379} 380 381static INLINE void 382set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, 383 struct nv50_program_exec *e) 384{ 385 set_long(pc, e); 386 e->inst[1] &= ~((0x3 << 4) | (1 << 6)); 387 e->inst[1] |= (idx << 4) | (on << 6); 388} 389 390static INLINE void 391set_long(struct nv50_pc *pc, struct nv50_program_exec *e) 392{ 393 if (is_long(e)) 394 return; 395 396 e->inst[0] |= 1; 397 set_pred(pc, 0xf, 0, e); 398 set_pred_wr(pc, 0, 0, e); 399} 400 401static INLINE void 402set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) 403{ 404 if (dst->type == P_RESULT) { 405 set_long(pc, e); 406 e->inst[1] |= 0x00000008; 407 } 408 409 alloc_reg(pc, dst); 410 e->inst[0] |= (dst->hw << 2); 411} 412 413static INLINE void 414set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) 415{ 416 float f = pc->immd_buf[imm->hw]; 417 unsigned val = fui(imm->neg ? -f : f); 418 419 set_long(pc, e); 420 /*XXX: can't be predicated - bits overlap.. catch cases where both 421 * are required and avoid them. */ 422 set_pred(pc, 0, 0, e); 423 set_pred_wr(pc, 0, 0, e); 424 425 e->inst[1] |= 0x00000002 | 0x00000001; 426 e->inst[0] |= (val & 0x3f) << 16; 427 e->inst[1] |= (val >> 6) << 2; 428} 429 430 431#define INTERP_LINEAR 0 432#define INTERP_FLAT 1 433#define INTERP_PERSPECTIVE 2 434#define INTERP_CENTROID 4 435 436/* interpolant index has been stored in dst->rhw */ 437static void 438emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, 439 unsigned mode) 440{ 441 assert(dst->rhw != -1); 442 struct nv50_program_exec *e = exec(pc); 443 444 e->inst[0] |= 0x80000000; 445 set_dst(pc, dst, e); 446 e->inst[0] |= (dst->rhw << 16); 447 448 if (mode & INTERP_FLAT) { 449 e->inst[0] |= (1 << 8); 450 } else { 451 if (mode & INTERP_PERSPECTIVE) { 452 e->inst[0] |= (1 << 25); 453 alloc_reg(pc, iv); 454 e->inst[0] |= (iv->hw << 9); 455 } 456 457 if (mode & INTERP_CENTROID) 458 e->inst[0] |= (1 << 24); 459 } 460 461 emit(pc, e); 462} 463 464static void 465set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, 466 struct nv50_program_exec *e) 467{ 468 set_long(pc, e); 469 470 e->param.index = src->hw; 471 e->param.shift = s; 472 e->param.mask = m << (s % 32); 473 474 e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22); 475} 476 477static void 478emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 479{ 480 struct nv50_program_exec *e = exec(pc); 481 482 e->inst[0] |= 0x10000000; 483 484 set_dst(pc, dst, e); 485 486 if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) { 487 set_immd(pc, src, e); 488 /*XXX: 32-bit, but steals part of "half" reg space - need to 489 * catch and handle this case if/when we do half-regs 490 */ 491 } else 492 if (src->type == P_IMMD || src->type == P_CONST) { 493 set_long(pc, e); 494 set_data(pc, src, 0x7f, 9, e); 495 e->inst[1] |= 0x20000000; /* src0 const? */ 496 } else { 497 if (src->type == P_ATTR) { 498 set_long(pc, e); 499 e->inst[1] |= 0x00200000; 500 } 501 502 alloc_reg(pc, src); 503 e->inst[0] |= (src->hw << 9); 504 } 505 506 if (is_long(e) && !is_immd(e)) { 507 e->inst[1] |= 0x04000000; /* 32-bit */ 508 e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */ 509 if (!(e->inst[1] & 0x20000000)) 510 e->inst[1] |= 0x00030000; /* "subsubop" 0xf */ 511 } else 512 e->inst[0] |= 0x00008000; 513 514 emit(pc, e); 515} 516 517static INLINE void 518emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f) 519{ 520 struct nv50_reg *imm = alloc_immd(pc, f); 521 emit_mov(pc, dst, imm); 522 FREE(imm); 523} 524 525static boolean 526check_swap_src_0_1(struct nv50_pc *pc, 527 struct nv50_reg **s0, struct nv50_reg **s1) 528{ 529 struct nv50_reg *src0 = *s0, *src1 = *s1; 530 531 if (src0->type == P_CONST) { 532 if (src1->type != P_CONST) { 533 *s0 = src1; 534 *s1 = src0; 535 return TRUE; 536 } 537 } else 538 if (src1->type == P_ATTR) { 539 if (src0->type != P_ATTR) { 540 *s0 = src1; 541 *s1 = src0; 542 return TRUE; 543 } 544 } 545 546 return FALSE; 547} 548 549static void 550set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 551{ 552 if (src->type == P_ATTR) { 553 set_long(pc, e); 554 e->inst[1] |= 0x00200000; 555 } else 556 if (src->type == P_CONST || src->type == P_IMMD) { 557 struct nv50_reg *temp = temp_temp(pc); 558 559 emit_mov(pc, temp, src); 560 src = temp; 561 } 562 563 alloc_reg(pc, src); 564 e->inst[0] |= (src->hw << 9); 565} 566 567static void 568set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 569{ 570 if (src->type == P_ATTR) { 571 struct nv50_reg *temp = temp_temp(pc); 572 573 emit_mov(pc, temp, src); 574 src = temp; 575 } else 576 if (src->type == P_CONST || src->type == P_IMMD) { 577 assert(!(e->inst[0] & 0x00800000)); 578 if (e->inst[0] & 0x01000000) { 579 struct nv50_reg *temp = temp_temp(pc); 580 581 emit_mov(pc, temp, src); 582 src = temp; 583 } else { 584 set_data(pc, src, 0x7f, 16, e); 585 e->inst[0] |= 0x00800000; 586 } 587 } 588 589 alloc_reg(pc, src); 590 e->inst[0] |= (src->hw << 16); 591} 592 593static void 594set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) 595{ 596 set_long(pc, e); 597 598 if (src->type == P_ATTR) { 599 struct nv50_reg *temp = temp_temp(pc); 600 601 emit_mov(pc, temp, src); 602 src = temp; 603 } else 604 if (src->type == P_CONST || src->type == P_IMMD) { 605 assert(!(e->inst[0] & 0x01000000)); 606 if (e->inst[0] & 0x00800000) { 607 struct nv50_reg *temp = temp_temp(pc); 608 609 emit_mov(pc, temp, src); 610 src = temp; 611 } else { 612 set_data(pc, src, 0x7f, 32+14, e); 613 e->inst[0] |= 0x01000000; 614 } 615 } 616 617 alloc_reg(pc, src); 618 e->inst[1] |= (src->hw << 14); 619} 620 621static void 622emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 623 struct nv50_reg *src1) 624{ 625 struct nv50_program_exec *e = exec(pc); 626 627 e->inst[0] |= 0xc0000000; 628 629 if (!pc->allow32) 630 set_long(pc, e); 631 632 check_swap_src_0_1(pc, &src0, &src1); 633 set_dst(pc, dst, e); 634 set_src_0(pc, src0, e); 635 if (src1->type == P_IMMD && !is_long(e)) { 636 if (src0->neg) 637 e->inst[0] |= 0x00008000; 638 set_immd(pc, src1, e); 639 } else { 640 set_src_1(pc, src1, e); 641 if (src0->neg ^ src1->neg) { 642 if (is_long(e)) 643 e->inst[1] |= 0x08000000; 644 else 645 e->inst[0] |= 0x00008000; 646 } 647 } 648 649 emit(pc, e); 650} 651 652static void 653emit_add(struct nv50_pc *pc, struct nv50_reg *dst, 654 struct nv50_reg *src0, struct nv50_reg *src1) 655{ 656 struct nv50_program_exec *e = exec(pc); 657 658 e->inst[0] |= 0xb0000000; 659 660 check_swap_src_0_1(pc, &src0, &src1); 661 662 if (!pc->allow32 || src0->neg || src1->neg) { 663 set_long(pc, e); 664 e->inst[1] |= (src0->neg << 26) | (src1->neg << 27); 665 } 666 667 set_dst(pc, dst, e); 668 set_src_0(pc, src0, e); 669 if (src1->type == P_CONST || src1->type == P_ATTR || is_long(e)) 670 set_src_2(pc, src1, e); 671 else 672 if (src1->type == P_IMMD) 673 set_immd(pc, src1, e); 674 else 675 set_src_1(pc, src1, e); 676 677 emit(pc, e); 678} 679 680static void 681emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, 682 struct nv50_reg *src0, struct nv50_reg *src1) 683{ 684 struct nv50_program_exec *e = exec(pc); 685 686 set_long(pc, e); 687 e->inst[0] |= 0xb0000000; 688 e->inst[1] |= (sub << 29); 689 690 check_swap_src_0_1(pc, &src0, &src1); 691 set_dst(pc, dst, e); 692 set_src_0(pc, src0, e); 693 set_src_1(pc, src1, e); 694 695 emit(pc, e); 696} 697 698static INLINE void 699emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 700 struct nv50_reg *src1) 701{ 702 src1->neg ^= 1; 703 emit_add(pc, dst, src0, src1); 704 src1->neg ^= 1; 705} 706 707static void 708emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 709 struct nv50_reg *src1, struct nv50_reg *src2) 710{ 711 struct nv50_program_exec *e = exec(pc); 712 713 e->inst[0] |= 0xe0000000; 714 715 check_swap_src_0_1(pc, &src0, &src1); 716 set_dst(pc, dst, e); 717 set_src_0(pc, src0, e); 718 set_src_1(pc, src1, e); 719 set_src_2(pc, src2, e); 720 721 if (src0->neg ^ src1->neg) 722 e->inst[1] |= 0x04000000; 723 if (src2->neg) 724 e->inst[1] |= 0x08000000; 725 726 emit(pc, e); 727} 728 729static INLINE void 730emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, 731 struct nv50_reg *src1, struct nv50_reg *src2) 732{ 733 src2->neg ^= 1; 734 emit_mad(pc, dst, src0, src1, src2); 735 src2->neg ^= 1; 736} 737 738static void 739emit_flop(struct nv50_pc *pc, unsigned sub, 740 struct nv50_reg *dst, struct nv50_reg *src) 741{ 742 struct nv50_program_exec *e = exec(pc); 743 744 e->inst[0] |= 0x90000000; 745 if (sub) { 746 set_long(pc, e); 747 e->inst[1] |= (sub << 29); 748 } 749 750 set_dst(pc, dst, e); 751 set_src_0(pc, src, e); 752 753 emit(pc, e); 754} 755 756static void 757emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 758{ 759 struct nv50_program_exec *e = exec(pc); 760 761 e->inst[0] |= 0xb0000000; 762 763 set_dst(pc, dst, e); 764 set_src_0(pc, src, e); 765 set_long(pc, e); 766 e->inst[1] |= (6 << 29) | 0x00004000; 767 768 emit(pc, e); 769} 770 771static void 772emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 773{ 774 struct nv50_program_exec *e = exec(pc); 775 776 e->inst[0] |= 0xb0000000; 777 778 set_dst(pc, dst, e); 779 set_src_0(pc, src, e); 780 set_long(pc, e); 781 e->inst[1] |= (6 << 29); 782 783 emit(pc, e); 784} 785 786#define CVTOP_RN 0x01 787#define CVTOP_FLOOR 0x03 788#define CVTOP_CEIL 0x05 789#define CVTOP_TRUNC 0x07 790#define CVTOP_SAT 0x08 791#define CVTOP_ABS 0x10 792 793/* 0x04 == 32 bit */ 794/* 0x40 == dst is float */ 795/* 0x80 == src is float */ 796#define CVT_F32_F32 0xc4 797#define CVT_F32_S32 0x44 798#define CVT_F32_U32 0x64 799#define CVT_S32_F32 0x8c 800#define CVT_S32_S32 0x0c 801#define CVT_F32_F32_ROP 0xcc 802 803static void 804emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, 805 int wp, unsigned cvn, unsigned fmt) 806{ 807 struct nv50_program_exec *e; 808 809 e = exec(pc); 810 set_long(pc, e); 811 812 e->inst[0] |= 0xa0000000; 813 e->inst[1] |= 0x00004000; 814 e->inst[1] |= (cvn << 16); 815 e->inst[1] |= (fmt << 24); 816 set_src_0(pc, src, e); 817 818 if (wp >= 0) 819 set_pred_wr(pc, 1, wp, e); 820 821 if (dst) 822 set_dst(pc, dst, e); 823 else { 824 e->inst[0] |= 0x000001fc; 825 e->inst[1] |= 0x00000008; 826 } 827 828 emit(pc, e); 829} 830 831/* nv50 Condition codes: 832 * 0x1 = LT 833 * 0x2 = EQ 834 * 0x3 = LE 835 * 0x4 = GT 836 * 0x5 = NE 837 * 0x6 = GE 838 * 0x7 = set condition code ? (used before bra.lt/le/gt/ge) 839 * 0x8 = unordered bit (allows NaN) 840 */ 841static void 842emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, 843 struct nv50_reg *src0, struct nv50_reg *src1) 844{ 845 struct nv50_program_exec *e = exec(pc); 846 struct nv50_reg *rdst; 847 848 assert(ccode < 16); 849 if (check_swap_src_0_1(pc, &src0, &src1)) 850 ccode = ccode ^ 0x7; 851 852 rdst = dst; 853 if (dst && dst->type != P_TEMP) 854 dst = alloc_temp(pc, NULL); 855 856 /* set.u32 */ 857 set_long(pc, e); 858 e->inst[0] |= 0xb0000000; 859 e->inst[1] |= 0x60000000 | (ccode << 14); 860 861 /* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but 862 * that doesn't seem to match what the hw actually does 863 e->inst[1] |= 0x04000000; << breaks things, u32 by default ? 864 */ 865 866 if (wp >= 0) 867 set_pred_wr(pc, 1, wp, e); 868 if (dst) 869 set_dst(pc, dst, e); 870 else { 871 e->inst[0] |= 0x000001fc; 872 e->inst[1] |= 0x00000008; 873 } 874 875 set_src_0(pc, src0, e); 876 set_src_1(pc, src1, e); 877 878 emit(pc, e); 879 880 /* cvt.f32.u32/s32 (?) if we didn't only write the predicate */ 881 if (rdst) 882 emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32); 883 if (rdst && rdst != dst) 884 free_temp(pc, dst); 885} 886 887static INLINE unsigned 888map_tgsi_setop_cc(unsigned op) 889{ 890 switch (op) { 891 case TGSI_OPCODE_SLT: return 0x1; 892 case TGSI_OPCODE_SGE: return 0x6; 893 case TGSI_OPCODE_SEQ: return 0x2; 894 case TGSI_OPCODE_SGT: return 0x4; 895 case TGSI_OPCODE_SLE: return 0x3; 896 case TGSI_OPCODE_SNE: return 0xd; 897 default: 898 assert(0); 899 return 0; 900 } 901} 902 903static INLINE void 904emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 905{ 906 emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP); 907} 908 909static void 910emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, 911 struct nv50_reg *v, struct nv50_reg *e) 912{ 913 struct nv50_reg *temp = alloc_temp(pc, NULL); 914 915 emit_flop(pc, 3, temp, v); 916 emit_mul(pc, temp, temp, e); 917 emit_preex2(pc, temp, temp); 918 emit_flop(pc, 6, dst, temp); 919 920 free_temp(pc, temp); 921} 922 923static INLINE void 924emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 925{ 926 emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32); 927} 928 929static INLINE void 930emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 931{ 932 emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32); 933} 934 935static void 936emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 937 struct nv50_reg **src) 938{ 939 struct nv50_reg *one = alloc_immd(pc, 1.0); 940 struct nv50_reg *zero = alloc_immd(pc, 0.0); 941 struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); 942 struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); 943 struct nv50_reg *tmp[4]; 944 boolean allow32 = pc->allow32; 945 946 pc->allow32 = FALSE; 947 948 if (mask & (3 << 1)) { 949 tmp[0] = alloc_temp(pc, NULL); 950 emit_minmax(pc, 4, tmp[0], src[0], zero); 951 } 952 953 if (mask & (1 << 2)) { 954 set_pred_wr(pc, 1, 0, pc->p->exec_tail); 955 956 tmp[1] = temp_temp(pc); 957 emit_minmax(pc, 4, tmp[1], src[1], zero); 958 959 tmp[3] = temp_temp(pc); 960 emit_minmax(pc, 4, tmp[3], src[3], neg128); 961 emit_minmax(pc, 5, tmp[3], tmp[3], pos128); 962 963 emit_pow(pc, dst[2], tmp[1], tmp[3]); 964 emit_mov(pc, dst[2], zero); 965 set_pred(pc, 3, 0, pc->p->exec_tail); 966 } 967 968 if (mask & (1 << 1)) 969 assimilate_temp(pc, dst[1], tmp[0]); 970 else 971 if (mask & (1 << 2)) 972 free_temp(pc, tmp[0]); 973 974 pc->allow32 = allow32; 975 976 /* do this last, in case src[i,j] == dst[0,3] */ 977 if (mask & (1 << 0)) 978 emit_mov(pc, dst[0], one); 979 980 if (mask & (1 << 3)) 981 emit_mov(pc, dst[3], one); 982 983 FREE(pos128); 984 FREE(neg128); 985 FREE(zero); 986 FREE(one); 987} 988 989static void 990emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) 991{ 992 struct nv50_program_exec *e = exec(pc); 993 994 set_long(pc, e); 995 e->inst[0] |= 0xa0000000; /* delta */ 996 e->inst[1] |= (7 << 29); /* delta */ 997 e->inst[1] |= 0x04000000; /* negate arg0? probably not */ 998 e->inst[1] |= (1 << 14); /* src .f32 */ 999 set_dst(pc, dst, e); 1000 set_src_0(pc, src, e); 1001 1002 emit(pc, e); 1003} 1004 1005static void 1006emit_kil(struct nv50_pc *pc, struct nv50_reg *src) 1007{ 1008 struct nv50_program_exec *e; 1009 const int r_pred = 1; 1010 1011 /* Sets predicate reg ? */ 1012 e = exec(pc); 1013 e->inst[0] = 0xa00001fd; 1014 e->inst[1] = 0xc4014788; 1015 set_src_0(pc, src, e); 1016 set_pred_wr(pc, 1, r_pred, e); 1017 if (src->neg) 1018 e->inst[1] |= 0x20000000; 1019 emit(pc, e); 1020 1021 /* This is probably KILP */ 1022 e = exec(pc); 1023 e->inst[0] = 0x000001fe; 1024 set_long(pc, e); 1025 set_pred(pc, 1 /* LT? */, r_pred, e); 1026 emit(pc, e); 1027} 1028 1029static void 1030emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, 1031 struct nv50_reg **src, unsigned unit, unsigned type, boolean proj) 1032{ 1033 struct nv50_reg *temp, *t[4]; 1034 struct nv50_program_exec *e; 1035 1036 unsigned c, mode, dim; 1037 1038 switch (type) { 1039 case TGSI_TEXTURE_1D: 1040 dim = 1; 1041 break; 1042 case TGSI_TEXTURE_UNKNOWN: 1043 case TGSI_TEXTURE_2D: 1044 case TGSI_TEXTURE_SHADOW1D: /* XXX: x, z */ 1045 case TGSI_TEXTURE_RECT: 1046 dim = 2; 1047 break; 1048 case TGSI_TEXTURE_3D: 1049 case TGSI_TEXTURE_CUBE: 1050 case TGSI_TEXTURE_SHADOW2D: 1051 case TGSI_TEXTURE_SHADOWRECT: /* XXX */ 1052 dim = 3; 1053 break; 1054 default: 1055 assert(0); 1056 break; 1057 } 1058 1059 /* some cards need t[0]'s hw index to be a multiple of 4 */ 1060 alloc_temp4(pc, t, 0); 1061 1062 if (proj) { 1063 if (src[0]->type == P_TEMP && src[0]->rhw != -1) { 1064 mode = pc->interp_mode[src[0]->index]; 1065 1066 t[3]->rhw = src[3]->rhw; 1067 emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID)); 1068 emit_flop(pc, 0, t[3], t[3]); 1069 1070 for (c = 0; c < dim; c++) { 1071 t[c]->rhw = src[c]->rhw; 1072 emit_interp(pc, t[c], t[3], 1073 (mode | INTERP_PERSPECTIVE)); 1074 } 1075 } else { 1076 emit_flop(pc, 0, t[3], src[3]); 1077 for (c = 0; c < dim; c++) 1078 emit_mul(pc, t[c], src[c], t[3]); 1079 1080 /* XXX: for some reason the blob sometimes uses MAD: 1081 * emit_mad(pc, t[c], src[0][c], t[3], t[3]) 1082 * pc->p->exec_tail->inst[1] |= 0x080fc000; 1083 */ 1084 } 1085 } else { 1086 if (type == TGSI_TEXTURE_CUBE) { 1087 temp = temp_temp(pc); 1088 emit_minmax(pc, 4, temp, src[0], src[1]); 1089 emit_minmax(pc, 4, temp, temp, src[2]); 1090 emit_flop(pc, 0, temp, temp); 1091 for (c = 0; c < 3; c++) 1092 emit_mul(pc, t[c], src[c], temp); 1093 } else { 1094 for (c = 0; c < dim; c++) 1095 emit_mov(pc, t[c], src[c]); 1096 } 1097 } 1098 1099 e = exec(pc); 1100 set_long(pc, e); 1101 e->inst[0] |= 0xf0000000; 1102 e->inst[1] |= 0x00000004; 1103 set_dst(pc, t[0], e); 1104 e->inst[0] |= (unit << 9); 1105 1106 if (dim == 2) 1107 e->inst[0] |= 0x00400000; 1108 else 1109 if (dim == 3) 1110 e->inst[0] |= 0x00800000; 1111 1112 e->inst[0] |= (mask & 0x3) << 25; 1113 e->inst[1] |= (mask & 0xc) << 12; 1114 1115 emit(pc, e); 1116 1117#if 1 1118 if (mask & 1) emit_mov(pc, dst[0], t[0]); 1119 if (mask & 2) emit_mov(pc, dst[1], t[1]); 1120 if (mask & 4) emit_mov(pc, dst[2], t[2]); 1121 if (mask & 8) emit_mov(pc, dst[3], t[3]); 1122 1123 free_temp4(pc, t); 1124#else 1125 /* XXX: if p.e. MUL is used directly after TEX, it would still use 1126 * the texture coordinates, not the fetched values: latency ? */ 1127 1128 for (c = 0; c < 4; c++) { 1129 if (mask & (1 << c)) 1130 assimilate_temp(pc, dst[c], t[c]); 1131 else 1132 free_temp(pc, t[c]); 1133 } 1134#endif 1135} 1136 1137static void 1138convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) 1139{ 1140 unsigned q = 0, m = ~0; 1141 1142 assert(!is_long(e)); 1143 1144 switch (e->inst[0] >> 28) { 1145 case 0x1: 1146 /* MOV */ 1147 q = 0x0403c000; 1148 m = 0xffff7fff; 1149 break; 1150 case 0x8: 1151 /* INTERP (move centroid, perspective and flat bits) */ 1152 m = ~0x03000100; 1153 q = (e->inst[0] & (3 << 24)) >> (24 - 16); 1154 q |= (e->inst[0] & (1 << 8)) << (18 - 8); 1155 break; 1156 case 0x9: 1157 /* RCP */ 1158 break; 1159 case 0xB: 1160 /* ADD */ 1161 m = ~(127 << 16); 1162 q = ((e->inst[0] & (~m)) >> 2); 1163 break; 1164 case 0xC: 1165 /* MUL */ 1166 m = ~0x00008000; 1167 q = ((e->inst[0] & (~m)) << 12); 1168 break; 1169 case 0xE: 1170 /* MAD (if src2 == dst) */ 1171 q = ((e->inst[0] & 0x1fc) << 12); 1172 break; 1173 default: 1174 assert(0); 1175 break; 1176 } 1177 1178 set_long(pc, e); 1179 pc->p->exec_size++; 1180 1181 e->inst[0] &= m; 1182 e->inst[1] |= q; 1183} 1184 1185static boolean 1186negate_supported(const struct tgsi_full_instruction *insn, int i) 1187{ 1188 switch (insn->Instruction.Opcode) { 1189 case TGSI_OPCODE_DP3: 1190 case TGSI_OPCODE_DP4: 1191 case TGSI_OPCODE_MUL: 1192 case TGSI_OPCODE_KIL: 1193 case TGSI_OPCODE_ADD: 1194 case TGSI_OPCODE_SUB: 1195 case TGSI_OPCODE_MAD: 1196 return TRUE; 1197 case TGSI_OPCODE_POW: 1198 return (i == 1) ? TRUE : FALSE; 1199 default: 1200 return FALSE; 1201 } 1202} 1203 1204/* Return a read mask for source registers deduced from opcode & write mask. */ 1205static unsigned 1206nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) 1207{ 1208 unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask; 1209 1210 switch (insn->Instruction.Opcode) { 1211 case TGSI_OPCODE_COS: 1212 case TGSI_OPCODE_SIN: 1213 return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0); 1214 case TGSI_OPCODE_DP3: 1215 return 0x7; 1216 case TGSI_OPCODE_DP4: 1217 case TGSI_OPCODE_DPH: 1218 case TGSI_OPCODE_KIL: /* WriteMask ignored */ 1219 return 0xf; 1220 case TGSI_OPCODE_DST: 1221 return mask & (c ? 0xa : 0x6); 1222 case TGSI_OPCODE_EX2: 1223 case TGSI_OPCODE_LG2: 1224 case TGSI_OPCODE_POW: 1225 case TGSI_OPCODE_RCP: 1226 case TGSI_OPCODE_RSQ: 1227 case TGSI_OPCODE_SCS: 1228 return 0x1; 1229 case TGSI_OPCODE_LIT: 1230 return 0xb; 1231 case TGSI_OPCODE_TEX: 1232 case TGSI_OPCODE_TXP: 1233 { 1234 const struct tgsi_instruction_ext_texture *tex; 1235 1236 assert(insn->Instruction.Extended); 1237 tex = &insn->InstructionExtTexture; 1238 1239 mask = 0x7; 1240 if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) 1241 mask |= 0x8; 1242 1243 switch (tex->Texture) { 1244 case TGSI_TEXTURE_1D: 1245 mask &= 0x9; 1246 break; 1247 case TGSI_TEXTURE_2D: 1248 mask &= 0xb; 1249 break; 1250 default: 1251 break; 1252 } 1253 } 1254 return mask; 1255 case TGSI_OPCODE_XPD: 1256 x = 0; 1257 if (mask & 1) x |= 0x6; 1258 if (mask & 2) x |= 0x5; 1259 if (mask & 4) x |= 0x3; 1260 return x; 1261 default: 1262 break; 1263 } 1264 1265 return mask; 1266} 1267 1268static struct nv50_reg * 1269tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) 1270{ 1271 switch (dst->DstRegister.File) { 1272 case TGSI_FILE_TEMPORARY: 1273 return &pc->temp[dst->DstRegister.Index * 4 + c]; 1274 case TGSI_FILE_OUTPUT: 1275 return &pc->result[dst->DstRegister.Index * 4 + c]; 1276 case TGSI_FILE_NULL: 1277 return NULL; 1278 default: 1279 break; 1280 } 1281 1282 return NULL; 1283} 1284 1285static struct nv50_reg * 1286tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, 1287 boolean neg) 1288{ 1289 struct nv50_reg *r = NULL; 1290 struct nv50_reg *temp; 1291 unsigned sgn, c; 1292 1293 sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); 1294 1295 c = tgsi_util_get_full_src_register_extswizzle(src, chan); 1296 switch (c) { 1297 case TGSI_EXTSWIZZLE_X: 1298 case TGSI_EXTSWIZZLE_Y: 1299 case TGSI_EXTSWIZZLE_Z: 1300 case TGSI_EXTSWIZZLE_W: 1301 switch (src->SrcRegister.File) { 1302 case TGSI_FILE_INPUT: 1303 r = &pc->attr[src->SrcRegister.Index * 4 + c]; 1304 break; 1305 case TGSI_FILE_TEMPORARY: 1306 r = &pc->temp[src->SrcRegister.Index * 4 + c]; 1307 break; 1308 case TGSI_FILE_CONSTANT: 1309 r = &pc->param[src->SrcRegister.Index * 4 + c]; 1310 break; 1311 case TGSI_FILE_IMMEDIATE: 1312 r = &pc->immd[src->SrcRegister.Index * 4 + c]; 1313 break; 1314 case TGSI_FILE_SAMPLER: 1315 break; 1316 default: 1317 assert(0); 1318 break; 1319 } 1320 break; 1321 case TGSI_EXTSWIZZLE_ZERO: 1322 r = alloc_immd(pc, 0.0); 1323 return r; 1324 case TGSI_EXTSWIZZLE_ONE: 1325 if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET) 1326 return alloc_immd(pc, -1.0); 1327 return alloc_immd(pc, 1.0); 1328 default: 1329 assert(0); 1330 break; 1331 } 1332 1333 switch (sgn) { 1334 case TGSI_UTIL_SIGN_KEEP: 1335 break; 1336 case TGSI_UTIL_SIGN_CLEAR: 1337 temp = temp_temp(pc); 1338 emit_abs(pc, temp, r); 1339 r = temp; 1340 break; 1341 case TGSI_UTIL_SIGN_TOGGLE: 1342 if (neg) 1343 r->neg = 1; 1344 else { 1345 temp = temp_temp(pc); 1346 emit_neg(pc, temp, r); 1347 r = temp; 1348 } 1349 break; 1350 case TGSI_UTIL_SIGN_SET: 1351 temp = temp_temp(pc); 1352 emit_abs(pc, temp, r); 1353 if (neg) 1354 temp->neg = 1; 1355 else 1356 emit_neg(pc, temp, temp); 1357 r = temp; 1358 break; 1359 default: 1360 assert(0); 1361 break; 1362 } 1363 1364 return r; 1365} 1366 1367/* return TRUE for ops that produce only a single result */ 1368static boolean 1369is_scalar_op(unsigned op) 1370{ 1371 switch (op) { 1372 case TGSI_OPCODE_COS: 1373 case TGSI_OPCODE_DP2: 1374 case TGSI_OPCODE_DP3: 1375 case TGSI_OPCODE_DP4: 1376 case TGSI_OPCODE_DPH: 1377 case TGSI_OPCODE_EX2: 1378 case TGSI_OPCODE_LG2: 1379 case TGSI_OPCODE_POW: 1380 case TGSI_OPCODE_RCP: 1381 case TGSI_OPCODE_RSQ: 1382 case TGSI_OPCODE_SIN: 1383 /* 1384 case TGSI_OPCODE_KIL: 1385 case TGSI_OPCODE_LIT: 1386 case TGSI_OPCODE_SCS: 1387 */ 1388 return TRUE; 1389 default: 1390 return FALSE; 1391 } 1392} 1393 1394/* Returns a bitmask indicating which dst components depend 1395 * on source s, component c (reverse of nv50_tgsi_src_mask). 1396 */ 1397static unsigned 1398nv50_tgsi_dst_revdep(unsigned op, int s, int c) 1399{ 1400 if (is_scalar_op(op)) 1401 return 0x1; 1402 1403 switch (op) { 1404 case TGSI_OPCODE_DST: 1405 return (1 << c) & (s ? 0xa : 0x6); 1406 case TGSI_OPCODE_XPD: 1407 switch (c) { 1408 case 0: return 0x6; 1409 case 1: return 0x5; 1410 case 2: return 0x3; 1411 case 3: return 0x0; 1412 default: 1413 assert(0); 1414 return 0x0; 1415 } 1416 case TGSI_OPCODE_LIT: 1417 case TGSI_OPCODE_SCS: 1418 case TGSI_OPCODE_TEX: 1419 case TGSI_OPCODE_TXP: 1420 /* these take care of dangerous swizzles themselves */ 1421 return 0x0; 1422 case TGSI_OPCODE_IF: 1423 case TGSI_OPCODE_KIL: 1424 /* don't call this function for these ops */ 1425 assert(0); 1426 return 0; 1427 default: 1428 /* linear vector instruction */ 1429 return (1 << c); 1430 } 1431} 1432 1433static boolean 1434nv50_program_tx_insn(struct nv50_pc *pc, 1435 const struct tgsi_full_instruction *inst) 1436{ 1437 struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; 1438 unsigned mask, sat, unit; 1439 int i, c; 1440 1441 mask = inst->FullDstRegisters[0].DstRegister.WriteMask; 1442 sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; 1443 1444 memset(src, 0, sizeof(src)); 1445 1446 for (c = 0; c < 4; c++) { 1447 if ((mask & (1 << c)) && !pc->r_dst[c]) 1448 dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); 1449 else 1450 dst[c] = pc->r_dst[c]; 1451 rdst[c] = dst[c]; 1452 } 1453 1454 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1455 const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; 1456 unsigned src_mask; 1457 boolean neg_supp; 1458 1459 src_mask = nv50_tgsi_src_mask(inst, i); 1460 neg_supp = negate_supported(inst, i); 1461 1462 if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) 1463 unit = fs->SrcRegister.Index; 1464 1465 for (c = 0; c < 4; c++) 1466 if (src_mask & (1 << c)) 1467 src[i][c] = tgsi_src(pc, c, fs, neg_supp); 1468 } 1469 1470 brdc = temp = pc->r_brdc; 1471 if (brdc && brdc->type != P_TEMP) { 1472 temp = temp_temp(pc); 1473 if (sat) 1474 brdc = temp; 1475 } else 1476 if (sat) { 1477 for (c = 0; c < 4; c++) { 1478 if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) 1479 continue; 1480 rdst[c] = dst[c]; 1481 dst[c] = temp_temp(pc); 1482 } 1483 } 1484 1485 assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); 1486 1487 switch (inst->Instruction.Opcode) { 1488 case TGSI_OPCODE_ABS: 1489 for (c = 0; c < 4; c++) { 1490 if (!(mask & (1 << c))) 1491 continue; 1492 emit_abs(pc, dst[c], src[0][c]); 1493 } 1494 break; 1495 case TGSI_OPCODE_ADD: 1496 for (c = 0; c < 4; c++) { 1497 if (!(mask & (1 << c))) 1498 continue; 1499 emit_add(pc, dst[c], src[0][c], src[1][c]); 1500 } 1501 break; 1502 case TGSI_OPCODE_COS: 1503 if (mask & 8) { 1504 emit_precossin(pc, temp, src[0][3]); 1505 emit_flop(pc, 5, dst[3], temp); 1506 if (!(mask &= 7)) 1507 break; 1508 if (temp == dst[3]) 1509 temp = brdc = temp_temp(pc); 1510 } 1511 emit_precossin(pc, temp, src[0][0]); 1512 emit_flop(pc, 5, brdc, temp); 1513 break; 1514 case TGSI_OPCODE_DP3: 1515 emit_mul(pc, temp, src[0][0], src[1][0]); 1516 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1517 emit_mad(pc, brdc, src[0][2], src[1][2], temp); 1518 break; 1519 case TGSI_OPCODE_DP4: 1520 emit_mul(pc, temp, src[0][0], src[1][0]); 1521 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1522 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1523 emit_mad(pc, brdc, src[0][3], src[1][3], temp); 1524 break; 1525 case TGSI_OPCODE_DPH: 1526 emit_mul(pc, temp, src[0][0], src[1][0]); 1527 emit_mad(pc, temp, src[0][1], src[1][1], temp); 1528 emit_mad(pc, temp, src[0][2], src[1][2], temp); 1529 emit_add(pc, brdc, src[1][3], temp); 1530 break; 1531 case TGSI_OPCODE_DST: 1532 if (mask & (1 << 1)) 1533 emit_mul(pc, dst[1], src[0][1], src[1][1]); 1534 if (mask & (1 << 2)) 1535 emit_mov(pc, dst[2], src[0][2]); 1536 if (mask & (1 << 3)) 1537 emit_mov(pc, dst[3], src[1][3]); 1538 if (mask & (1 << 0)) 1539 emit_mov_immdval(pc, dst[0], 1.0f); 1540 break; 1541 case TGSI_OPCODE_EX2: 1542 emit_preex2(pc, temp, src[0][0]); 1543 emit_flop(pc, 6, brdc, temp); 1544 break; 1545 case TGSI_OPCODE_FLR: 1546 for (c = 0; c < 4; c++) { 1547 if (!(mask & (1 << c))) 1548 continue; 1549 emit_flr(pc, dst[c], src[0][c]); 1550 } 1551 break; 1552 case TGSI_OPCODE_FRC: 1553 temp = temp_temp(pc); 1554 for (c = 0; c < 4; c++) { 1555 if (!(mask & (1 << c))) 1556 continue; 1557 emit_flr(pc, temp, src[0][c]); 1558 emit_sub(pc, dst[c], src[0][c], temp); 1559 } 1560 break; 1561 case TGSI_OPCODE_KIL: 1562 emit_kil(pc, src[0][0]); 1563 emit_kil(pc, src[0][1]); 1564 emit_kil(pc, src[0][2]); 1565 emit_kil(pc, src[0][3]); 1566 pc->p->cfg.fp.regs[2] |= 0x00100000; 1567 break; 1568 case TGSI_OPCODE_LIT: 1569 emit_lit(pc, &dst[0], mask, &src[0][0]); 1570 break; 1571 case TGSI_OPCODE_LG2: 1572 emit_flop(pc, 3, brdc, src[0][0]); 1573 break; 1574 case TGSI_OPCODE_LRP: 1575 temp = temp_temp(pc); 1576 for (c = 0; c < 4; c++) { 1577 if (!(mask & (1 << c))) 1578 continue; 1579 emit_sub(pc, temp, src[1][c], src[2][c]); 1580 emit_mad(pc, dst[c], temp, src[0][c], src[2][c]); 1581 } 1582 break; 1583 case TGSI_OPCODE_MAD: 1584 for (c = 0; c < 4; c++) { 1585 if (!(mask & (1 << c))) 1586 continue; 1587 emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); 1588 } 1589 break; 1590 case TGSI_OPCODE_MAX: 1591 for (c = 0; c < 4; c++) { 1592 if (!(mask & (1 << c))) 1593 continue; 1594 emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); 1595 } 1596 break; 1597 case TGSI_OPCODE_MIN: 1598 for (c = 0; c < 4; c++) { 1599 if (!(mask & (1 << c))) 1600 continue; 1601 emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); 1602 } 1603 break; 1604 case TGSI_OPCODE_MOV: 1605 case TGSI_OPCODE_SWZ: 1606 for (c = 0; c < 4; c++) { 1607 if (!(mask & (1 << c))) 1608 continue; 1609 emit_mov(pc, dst[c], src[0][c]); 1610 } 1611 break; 1612 case TGSI_OPCODE_MUL: 1613 for (c = 0; c < 4; c++) { 1614 if (!(mask & (1 << c))) 1615 continue; 1616 emit_mul(pc, dst[c], src[0][c], src[1][c]); 1617 } 1618 break; 1619 case TGSI_OPCODE_POW: 1620 emit_pow(pc, brdc, src[0][0], src[1][0]); 1621 break; 1622 case TGSI_OPCODE_RCP: 1623 emit_flop(pc, 0, brdc, src[0][0]); 1624 break; 1625 case TGSI_OPCODE_RSQ: 1626 emit_flop(pc, 2, brdc, src[0][0]); 1627 break; 1628 case TGSI_OPCODE_SCS: 1629 temp = temp_temp(pc); 1630 if (mask & 3) 1631 emit_precossin(pc, temp, src[0][0]); 1632 if (mask & (1 << 0)) 1633 emit_flop(pc, 5, dst[0], temp); 1634 if (mask & (1 << 1)) 1635 emit_flop(pc, 4, dst[1], temp); 1636 if (mask & (1 << 2)) 1637 emit_mov_immdval(pc, dst[2], 0.0); 1638 if (mask & (1 << 3)) 1639 emit_mov_immdval(pc, dst[3], 1.0); 1640 break; 1641 case TGSI_OPCODE_SIN: 1642 if (mask & 8) { 1643 emit_precossin(pc, temp, src[0][3]); 1644 emit_flop(pc, 4, dst[3], temp); 1645 if (!(mask &= 7)) 1646 break; 1647 if (temp == dst[3]) 1648 temp = brdc = temp_temp(pc); 1649 } 1650 emit_precossin(pc, temp, src[0][0]); 1651 emit_flop(pc, 4, brdc, temp); 1652 break; 1653 case TGSI_OPCODE_SLT: 1654 case TGSI_OPCODE_SGE: 1655 case TGSI_OPCODE_SEQ: 1656 case TGSI_OPCODE_SGT: 1657 case TGSI_OPCODE_SLE: 1658 case TGSI_OPCODE_SNE: 1659 i = map_tgsi_setop_cc(inst->Instruction.Opcode); 1660 for (c = 0; c < 4; c++) { 1661 if (!(mask & (1 << c))) 1662 continue; 1663 emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]); 1664 } 1665 break; 1666 case TGSI_OPCODE_SUB: 1667 for (c = 0; c < 4; c++) { 1668 if (!(mask & (1 << c))) 1669 continue; 1670 emit_sub(pc, dst[c], src[0][c], src[1][c]); 1671 } 1672 break; 1673 case TGSI_OPCODE_TEX: 1674 emit_tex(pc, dst, mask, src[0], unit, 1675 inst->InstructionExtTexture.Texture, FALSE); 1676 break; 1677 case TGSI_OPCODE_TXP: 1678 emit_tex(pc, dst, mask, src[0], unit, 1679 inst->InstructionExtTexture.Texture, TRUE); 1680 break; 1681 case TGSI_OPCODE_XPD: 1682 temp = temp_temp(pc); 1683 if (mask & (1 << 0)) { 1684 emit_mul(pc, temp, src[0][2], src[1][1]); 1685 emit_msb(pc, dst[0], src[0][1], src[1][2], temp); 1686 } 1687 if (mask & (1 << 1)) { 1688 emit_mul(pc, temp, src[0][0], src[1][2]); 1689 emit_msb(pc, dst[1], src[0][2], src[1][0], temp); 1690 } 1691 if (mask & (1 << 2)) { 1692 emit_mul(pc, temp, src[0][1], src[1][0]); 1693 emit_msb(pc, dst[2], src[0][0], src[1][1], temp); 1694 } 1695 if (mask & (1 << 3)) 1696 emit_mov_immdval(pc, dst[3], 1.0); 1697 break; 1698 case TGSI_OPCODE_END: 1699 break; 1700 default: 1701 NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); 1702 return FALSE; 1703 } 1704 1705 if (brdc) { 1706 if (sat) 1707 emit_sat(pc, brdc, brdc); 1708 for (c = 0; c < 4; c++) 1709 if ((mask & (1 << c)) && dst[c] != brdc) 1710 emit_mov(pc, dst[c], brdc); 1711 } else 1712 if (sat) { 1713 for (c = 0; c < 4; c++) { 1714 if (!(mask & (1 << c))) 1715 continue; 1716 /* in this case we saturate later */ 1717 if (dst[c]->type == P_TEMP && dst[c]->index < 0) 1718 continue; 1719 emit_sat(pc, rdst[c], dst[c]); 1720 } 1721 } 1722 1723 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { 1724 for (c = 0; c < 4; c++) { 1725 if (!src[i][c]) 1726 continue; 1727 if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD) 1728 FREE(src[i][c]); 1729 } 1730 } 1731 1732 kill_temp_temp(pc); 1733 return TRUE; 1734} 1735 1736static void 1737prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok, 1738 unsigned *r_usage[2]) 1739{ 1740 const struct tgsi_full_instruction *insn; 1741 const struct tgsi_full_src_register *src; 1742 const struct tgsi_dst_register *dst; 1743 1744 unsigned i, c, k, n, mask, *acc_p; 1745 1746 insn = &tok->FullInstruction; 1747 dst = &insn->FullDstRegisters[0].DstRegister; 1748 mask = dst->WriteMask; 1749 1750 if (!r_usage[0]) 1751 r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned)); 1752 if (!r_usage[1]) 1753 r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned)); 1754 1755 if (dst->File == TGSI_FILE_TEMPORARY) { 1756 for (c = 0; c < 4; c++) { 1757 if (!(mask & (1 << c))) 1758 continue; 1759 r_usage[0][dst->Index * 4 + c] = pc->insn_nr; 1760 } 1761 } 1762 1763 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 1764 src = &insn->FullSrcRegisters[i]; 1765 1766 switch (src->SrcRegister.File) { 1767 case TGSI_FILE_TEMPORARY: 1768 acc_p = r_usage[0]; 1769 break; 1770 case TGSI_FILE_INPUT: 1771 acc_p = r_usage[1]; 1772 break; 1773 default: 1774 continue; 1775 } 1776 1777 mask = nv50_tgsi_src_mask(insn, i); 1778 1779 for (c = 0; c < 4; c++) { 1780 if (!(mask & (1 << c))) 1781 continue; 1782 1783 k = tgsi_util_get_full_src_register_extswizzle(src, c); 1784 switch (k) { 1785 case TGSI_EXTSWIZZLE_X: 1786 case TGSI_EXTSWIZZLE_Y: 1787 case TGSI_EXTSWIZZLE_Z: 1788 case TGSI_EXTSWIZZLE_W: 1789 n = src->SrcRegister.Index * 4 + k; 1790 acc_p[n] = pc->insn_nr; 1791 break; 1792 default: 1793 break; 1794 } 1795 } 1796 } 1797} 1798 1799/* Returns a bitmask indicating which dst components need to be 1800 * written to temporaries first to avoid 'corrupting' sources. 1801 * 1802 * m[i] (out) indicate component to write in the i-th position 1803 * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source 1804 */ 1805static unsigned 1806nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) 1807{ 1808 unsigned i, c, x, unsafe; 1809 1810 for (c = 0; c < 4; c++) 1811 m[c] = c; 1812 1813 /* Swap as long as a dst component written earlier is depended on 1814 * by one written later, but the next one isn't depended on by it. 1815 */ 1816 for (c = 0; c < 3; c++) { 1817 if (rdep[m[c + 1]] & (1 << m[c])) 1818 continue; /* if next one is depended on by us */ 1819 for (i = c + 1; i < 4; i++) 1820 /* if we are depended on by a later one */ 1821 if (rdep[m[c]] & (1 << m[i])) 1822 break; 1823 if (i == 4) 1824 continue; 1825 /* now, swap */ 1826 x = m[c]; 1827 m[c] = m[c + 1]; 1828 m[c + 1] = x; 1829 1830 /* restart */ 1831 c = 0; 1832 } 1833 1834 /* mark dependencies that could not be resolved by reordering */ 1835 for (i = 0; i < 3; ++i) 1836 for (c = i + 1; c < 4; ++c) 1837 if (rdep[m[i]] & (1 << m[c])) 1838 unsafe |= (1 << i); 1839 1840 /* NOTE: $unsafe is with respect to order, not component */ 1841 return unsafe; 1842} 1843 1844/* Select a suitable dst register for broadcasting scalar results, 1845 * or return NULL if we have to allocate an extra TEMP. 1846 * 1847 * If e.g. only 1 component is written, we may also emit the final 1848 * result to a write-only register. 1849 */ 1850static struct nv50_reg * 1851tgsi_broadcast_dst(struct nv50_pc *pc, 1852 const struct tgsi_full_dst_register *fd, unsigned mask) 1853{ 1854 if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) { 1855 int c = ffs(~mask & fd->DstRegister.WriteMask); 1856 if (c) 1857 return tgsi_dst(pc, c - 1, fd); 1858 } else { 1859 int c = ffs(fd->DstRegister.WriteMask) - 1; 1860 if ((1 << c) == fd->DstRegister.WriteMask) 1861 return tgsi_dst(pc, c, fd); 1862 } 1863 1864 return NULL; 1865} 1866 1867/* Scan source swizzles and return a bitmask indicating dst regs that 1868 * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 1869 */ 1870static unsigned 1871nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, 1872 unsigned rdep[4]) 1873{ 1874 const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0]; 1875 const struct tgsi_full_src_register *fs; 1876 unsigned i, deqs = 0; 1877 1878 for (i = 0; i < 4; ++i) 1879 rdep[i] = 0; 1880 1881 for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { 1882 unsigned chn, mask = nv50_tgsi_src_mask(insn, i); 1883 boolean neg_supp = negate_supported(insn, i); 1884 1885 fs = &insn->FullSrcRegisters[i]; 1886 if (fs->SrcRegister.File != fd->DstRegister.File || 1887 fs->SrcRegister.Index != fd->DstRegister.Index) 1888 continue; 1889 1890 for (chn = 0; chn < 4; ++chn) { 1891 unsigned s, c; 1892 1893 if (!(mask & (1 << chn))) /* src is not read */ 1894 continue; 1895 c = tgsi_util_get_full_src_register_extswizzle(fs, chn); 1896 s = tgsi_util_get_full_src_register_sign_mode(fs, chn); 1897 1898 if (c > TGSI_EXTSWIZZLE_W || 1899 !(fd->DstRegister.WriteMask & (1 << c))) 1900 continue; 1901 1902 /* no danger if src is copied to TEMP first */ 1903 if ((s != TGSI_UTIL_SIGN_KEEP) && 1904 (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp)) 1905 continue; 1906 1907 rdep[c] |= nv50_tgsi_dst_revdep( 1908 insn->Instruction.Opcode, i, chn); 1909 deqs |= (1 << c); 1910 } 1911 } 1912 1913 return deqs; 1914} 1915 1916static boolean 1917nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) 1918{ 1919 struct tgsi_full_instruction insn = tok->FullInstruction; 1920 const struct tgsi_full_dst_register *fd; 1921 unsigned i, deqs, rdep[4], m[4]; 1922 1923 fd = &tok->FullInstruction.FullDstRegisters[0]; 1924 deqs = nv50_tgsi_scan_swizzle(&insn, rdep); 1925 1926 if (is_scalar_op(insn.Instruction.Opcode)) { 1927 pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); 1928 if (!pc->r_brdc) 1929 pc->r_brdc = temp_temp(pc); 1930 return nv50_program_tx_insn(pc, &insn); 1931 } 1932 pc->r_brdc = NULL; 1933 1934 if (!deqs) 1935 return nv50_program_tx_insn(pc, &insn); 1936 1937 deqs = nv50_revdep_reorder(m, rdep); 1938 1939 for (i = 0; i < 4; ++i) { 1940 assert(pc->r_dst[m[i]] == NULL); 1941 1942 insn.FullDstRegisters[0].DstRegister.WriteMask = 1943 fd->DstRegister.WriteMask & (1 << m[i]); 1944 1945 if (!insn.FullDstRegisters[0].DstRegister.WriteMask) 1946 continue; 1947 1948 if (deqs & (1 << i)) 1949 pc->r_dst[m[i]] = alloc_temp(pc, NULL); 1950 1951 if (!nv50_program_tx_insn(pc, &insn)) 1952 return FALSE; 1953 } 1954 1955 for (i = 0; i < 4; i++) { 1956 struct nv50_reg *reg = pc->r_dst[i]; 1957 if (!reg) 1958 continue; 1959 pc->r_dst[i] = NULL; 1960 1961 if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) 1962 emit_sat(pc, tgsi_dst(pc, i, fd), reg); 1963 else 1964 emit_mov(pc, tgsi_dst(pc, i, fd), reg); 1965 free_temp(pc, reg); 1966 } 1967 1968 return TRUE; 1969} 1970 1971static unsigned 1972load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid, 1973 int *aid, int *p_oid) 1974{ 1975 struct nv50_reg *iv; 1976 int oid, c, n; 1977 unsigned mask = 0; 1978 1979 iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p; 1980 1981 for (c = 0, n = i * 4; c < 4; c++, n++) { 1982 oid = (*p_oid)++; 1983 pc->attr[n].type = P_TEMP; 1984 pc->attr[n].index = i; 1985 1986 if (pc->attr[n].acc == acc[n]) 1987 continue; 1988 mask |= (1 << c); 1989 1990 pc->attr[n].acc = acc[n]; 1991 pc->attr[n].rhw = pc->attr[n].hw = -1; 1992 alloc_reg(pc, &pc->attr[n]); 1993 1994 pc->attr[n].rhw = (*aid)++; 1995 emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]); 1996 1997 pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4)); 1998 (*mid)++; 1999 pc->p->cfg.fp.regs[1] += 0x00010001; 2000 } 2001 2002 return mask; 2003} 2004 2005static boolean 2006nv50_program_tx_prep(struct nv50_pc *pc) 2007{ 2008 struct tgsi_parse_context p; 2009 boolean ret = FALSE; 2010 unsigned i, c; 2011 unsigned fcol, bcol, fcrd, depr; 2012 2013 /* count (centroid) perspective interpolations */ 2014 unsigned centroid_loads = 0; 2015 unsigned perspect_loads = 0; 2016 2017 /* track register access for temps and attrs */ 2018 unsigned *r_usage[2]; 2019 r_usage[0] = NULL; 2020 r_usage[1] = NULL; 2021 2022 depr = fcol = bcol = fcrd = 0xffff; 2023 2024 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 2025 pc->p->cfg.fp.regs[0] = 0x01000404; 2026 pc->p->cfg.fp.regs[1] = 0x00000400; 2027 } 2028 2029 tgsi_parse_init(&p, pc->p->pipe.tokens); 2030 while (!tgsi_parse_end_of_tokens(&p)) { 2031 const union tgsi_full_token *tok = &p.FullToken; 2032 2033 tgsi_parse_token(&p); 2034 switch (tok->Token.Type) { 2035 case TGSI_TOKEN_TYPE_IMMEDIATE: 2036 { 2037 const struct tgsi_full_immediate *imm = 2038 &p.FullToken.FullImmediate; 2039 2040 ctor_immd(pc, imm->u[0].Float, 2041 imm->u[1].Float, 2042 imm->u[2].Float, 2043 imm->u[3].Float); 2044 } 2045 break; 2046 case TGSI_TOKEN_TYPE_DECLARATION: 2047 { 2048 const struct tgsi_full_declaration *d; 2049 unsigned last, first, mode; 2050 2051 d = &p.FullToken.FullDeclaration; 2052 first = d->DeclarationRange.First; 2053 last = d->DeclarationRange.Last; 2054 2055 switch (d->Declaration.File) { 2056 case TGSI_FILE_TEMPORARY: 2057 if (pc->temp_nr < (last + 1)) 2058 pc->temp_nr = last + 1; 2059 break; 2060 case TGSI_FILE_OUTPUT: 2061 if (pc->result_nr < (last + 1)) 2062 pc->result_nr = last + 1; 2063 2064 if (!d->Declaration.Semantic) 2065 break; 2066 2067 switch (d->Semantic.SemanticName) { 2068 case TGSI_SEMANTIC_POSITION: 2069 depr = first; 2070 pc->p->cfg.fp.regs[2] |= 0x00000100; 2071 pc->p->cfg.fp.regs[3] |= 0x00000011; 2072 break; 2073 default: 2074 break; 2075 } 2076 2077 break; 2078 case TGSI_FILE_INPUT: 2079 { 2080 if (pc->attr_nr < (last + 1)) 2081 pc->attr_nr = last + 1; 2082 2083 if (pc->p->type != PIPE_SHADER_FRAGMENT) 2084 break; 2085 2086 switch (d->Declaration.Interpolate) { 2087 case TGSI_INTERPOLATE_CONSTANT: 2088 mode = INTERP_FLAT; 2089 break; 2090 case TGSI_INTERPOLATE_PERSPECTIVE: 2091 mode = INTERP_PERSPECTIVE; 2092 break; 2093 default: 2094 mode = INTERP_LINEAR; 2095 break; 2096 } 2097 2098 if (d->Declaration.Semantic) { 2099 switch (d->Semantic.SemanticName) { 2100 case TGSI_SEMANTIC_POSITION: 2101 fcrd = first; 2102 break; 2103 case TGSI_SEMANTIC_COLOR: 2104 fcol = first; 2105 mode = INTERP_PERSPECTIVE; 2106 break; 2107 case TGSI_SEMANTIC_BCOLOR: 2108 bcol = first; 2109 mode = INTERP_PERSPECTIVE; 2110 break; 2111 } 2112 } 2113 2114 if (d->Declaration.Centroid) { 2115 mode |= INTERP_CENTROID; 2116 if (mode & INTERP_PERSPECTIVE) 2117 centroid_loads++; 2118 } else 2119 if (mode & INTERP_PERSPECTIVE) 2120 perspect_loads++; 2121 2122 assert(last < 32); 2123 for (i = first; i <= last; i++) 2124 pc->interp_mode[i] = mode; 2125 } 2126 break; 2127 case TGSI_FILE_CONSTANT: 2128 if (pc->param_nr < (last + 1)) 2129 pc->param_nr = last + 1; 2130 break; 2131 case TGSI_FILE_SAMPLER: 2132 break; 2133 default: 2134 NOUVEAU_ERR("bad decl file %d\n", 2135 d->Declaration.File); 2136 goto out_err; 2137 } 2138 } 2139 break; 2140 case TGSI_TOKEN_TYPE_INSTRUCTION: 2141 pc->insn_nr++; 2142 prep_inspect_insn(pc, tok, r_usage); 2143 break; 2144 default: 2145 break; 2146 } 2147 } 2148 2149 if (pc->temp_nr) { 2150 pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg)); 2151 if (!pc->temp) 2152 goto out_err; 2153 2154 for (i = 0; i < pc->temp_nr; i++) { 2155 for (c = 0; c < 4; c++) { 2156 pc->temp[i*4+c].type = P_TEMP; 2157 pc->temp[i*4+c].hw = -1; 2158 pc->temp[i*4+c].rhw = -1; 2159 pc->temp[i*4+c].index = i; 2160 pc->temp[i*4+c].acc = r_usage[0][i*4+c]; 2161 } 2162 } 2163 } 2164 2165 if (pc->attr_nr) { 2166 int oid = 4, mid = 4, aid = 0; 2167 /* oid = VP output id 2168 * aid = FP attribute/interpolant id 2169 * mid = VP output mapping field ID 2170 */ 2171 2172 pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg)); 2173 if (!pc->attr) 2174 goto out_err; 2175 2176 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 2177 /* position should be loaded first */ 2178 if (fcrd != 0xffff) { 2179 unsigned mask; 2180 mid = 0; 2181 mask = load_fp_attrib(pc, fcrd, r_usage[1], 2182 &mid, &aid, &oid); 2183 oid = 0; 2184 pc->p->cfg.fp.regs[1] |= (mask << 24); 2185 pc->p->cfg.fp.map[0] = 0x04040404 * fcrd; 2186 } 2187 pc->p->cfg.fp.map[0] += 0x03020100; 2188 2189 /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */ 2190 2191 if (perspect_loads) { 2192 pc->iv_p = alloc_temp(pc, NULL); 2193 2194 if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) { 2195 pc->p->cfg.fp.regs[1] |= 0x08000000; 2196 pc->iv_p->rhw = aid++; 2197 emit_interp(pc, pc->iv_p, NULL, 2198 INTERP_LINEAR); 2199 emit_flop(pc, 0, pc->iv_p, pc->iv_p); 2200 } else { 2201 pc->iv_p->rhw = aid - 1; 2202 emit_flop(pc, 0, pc->iv_p, 2203 &pc->attr[fcrd * 4 + 3]); 2204 } 2205 } 2206 2207 if (centroid_loads) { 2208 pc->iv_c = alloc_temp(pc, NULL); 2209 pc->iv_c->rhw = pc->iv_p ? aid - 1 : aid++; 2210 emit_interp(pc, pc->iv_c, NULL, 2211 INTERP_CENTROID); 2212 emit_flop(pc, 0, pc->iv_c, pc->iv_c); 2213 pc->p->cfg.fp.regs[1] |= 0x08000000; 2214 } 2215 2216 for (c = 0; c < 4; c++) { 2217 /* I don't know what these values do, but 2218 * let's set them like the blob does: 2219 */ 2220 if (fcol != 0xffff && r_usage[1][fcol * 4 + c]) 2221 pc->p->cfg.fp.regs[0] += 0x00010000; 2222 if (bcol != 0xffff && r_usage[1][bcol * 4 + c]) 2223 pc->p->cfg.fp.regs[0] += 0x00010000; 2224 } 2225 2226 for (i = 0; i < pc->attr_nr; i++) 2227 load_fp_attrib(pc, i, r_usage[1], 2228 &mid, &aid, &oid); 2229 2230 if (pc->iv_p) 2231 free_temp(pc, pc->iv_p); 2232 if (pc->iv_c) 2233 free_temp(pc, pc->iv_c); 2234 2235 pc->p->cfg.fp.high_map = (mid / 4); 2236 pc->p->cfg.fp.high_map += ((mid % 4) ? 1 : 0); 2237 } else { 2238 /* vertex program */ 2239 for (i = 0; i < pc->attr_nr * 4; i++) { 2240 pc->p->cfg.vp.attr[aid / 32] |= 2241 (1 << (aid % 32)); 2242 pc->attr[i].type = P_ATTR; 2243 pc->attr[i].hw = aid++; 2244 pc->attr[i].index = i / 4; 2245 } 2246 } 2247 } 2248 2249 if (pc->result_nr) { 2250 int rid = 0; 2251 2252 pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg)); 2253 if (!pc->result) 2254 goto out_err; 2255 2256 for (i = 0; i < pc->result_nr; i++) { 2257 for (c = 0; c < 4; c++) { 2258 if (pc->p->type == PIPE_SHADER_FRAGMENT) { 2259 pc->result[i*4+c].type = P_TEMP; 2260 pc->result[i*4+c].hw = -1; 2261 pc->result[i*4+c].rhw = (i == depr) ? 2262 -1 : rid++; 2263 } else { 2264 pc->result[i*4+c].type = P_RESULT; 2265 pc->result[i*4+c].hw = rid++; 2266 } 2267 pc->result[i*4+c].index = i; 2268 } 2269 2270 if (pc->p->type == PIPE_SHADER_FRAGMENT && 2271 depr != 0xffff) { 2272 pc->result[depr * 4 + 2].rhw = 2273 (pc->result_nr - 1) * 4; 2274 } 2275 } 2276 } 2277 2278 if (pc->param_nr) { 2279 int rid = 0; 2280 2281 pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg)); 2282 if (!pc->param) 2283 goto out_err; 2284 2285 for (i = 0; i < pc->param_nr; i++) { 2286 for (c = 0; c < 4; c++) { 2287 pc->param[i*4+c].type = P_CONST; 2288 pc->param[i*4+c].hw = rid++; 2289 pc->param[i*4+c].index = i; 2290 } 2291 } 2292 } 2293 2294 if (pc->immd_nr) { 2295 int rid = 0; 2296 2297 pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg)); 2298 if (!pc->immd) 2299 goto out_err; 2300 2301 for (i = 0; i < pc->immd_nr; i++) { 2302 for (c = 0; c < 4; c++) { 2303 pc->immd[i*4+c].type = P_IMMD; 2304 pc->immd[i*4+c].hw = rid++; 2305 pc->immd[i*4+c].index = i; 2306 } 2307 } 2308 } 2309 2310 ret = TRUE; 2311out_err: 2312 if (r_usage[0]) 2313 FREE(r_usage[0]); 2314 if (r_usage[1]) 2315 FREE(r_usage[1]); 2316 2317 tgsi_parse_free(&p); 2318 return ret; 2319} 2320 2321static void 2322free_nv50_pc(struct nv50_pc *pc) 2323{ 2324 if (pc->immd) 2325 FREE(pc->immd); 2326 if (pc->param) 2327 FREE(pc->param); 2328 if (pc->result) 2329 FREE(pc->result); 2330 if (pc->attr) 2331 FREE(pc->attr); 2332 if (pc->temp) 2333 FREE(pc->temp); 2334 2335 FREE(pc); 2336} 2337 2338static boolean 2339nv50_program_tx(struct nv50_program *p) 2340{ 2341 struct tgsi_parse_context parse; 2342 struct nv50_pc *pc; 2343 unsigned k; 2344 boolean ret; 2345 2346 pc = CALLOC_STRUCT(nv50_pc); 2347 if (!pc) 2348 return FALSE; 2349 pc->p = p; 2350 pc->p->cfg.high_temp = 4; 2351 2352 ret = nv50_program_tx_prep(pc); 2353 if (ret == FALSE) 2354 goto out_cleanup; 2355 2356 tgsi_parse_init(&parse, pc->p->pipe.tokens); 2357 while (!tgsi_parse_end_of_tokens(&parse)) { 2358 const union tgsi_full_token *tok = &parse.FullToken; 2359 2360 /* don't allow half insn/immd on first and last instruction */ 2361 pc->allow32 = TRUE; 2362 if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr) 2363 pc->allow32 = FALSE; 2364 2365 tgsi_parse_token(&parse); 2366 2367 switch (tok->Token.Type) { 2368 case TGSI_TOKEN_TYPE_INSTRUCTION: 2369 ++pc->insn_cur; 2370 ret = nv50_tgsi_insn(pc, tok); 2371 if (ret == FALSE) 2372 goto out_err; 2373 break; 2374 default: 2375 break; 2376 } 2377 } 2378 2379 if (p->type == PIPE_SHADER_FRAGMENT) { 2380 struct nv50_reg out; 2381 2382 out.type = P_TEMP; 2383 for (k = 0; k < pc->result_nr * 4; k++) { 2384 if (pc->result[k].rhw == -1) 2385 continue; 2386 if (pc->result[k].hw != pc->result[k].rhw) { 2387 out.hw = pc->result[k].rhw; 2388 emit_mov(pc, &out, &pc->result[k]); 2389 } 2390 if (pc->p->cfg.high_result < (pc->result[k].rhw + 1)) 2391 pc->p->cfg.high_result = pc->result[k].rhw + 1; 2392 } 2393 } 2394 2395 /* look for single half instructions and make them long */ 2396 struct nv50_program_exec *e, *e_prev; 2397 2398 for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) { 2399 if (!is_long(e)) 2400 k++; 2401 2402 if (!e->next || is_long(e->next)) { 2403 if (k & 1) 2404 convert_to_long(pc, e); 2405 k = 0; 2406 } 2407 2408 if (e->next) 2409 e_prev = e; 2410 } 2411 2412 if (!is_long(pc->p->exec_tail)) { 2413 /* this may occur if moving FP results */ 2414 assert(e_prev && !is_long(e_prev)); 2415 convert_to_long(pc, e_prev); 2416 convert_to_long(pc, pc->p->exec_tail); 2417 } 2418 2419 assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); 2420 pc->p->exec_tail->inst[1] |= 0x00000001; 2421 2422 p->param_nr = pc->param_nr * 4; 2423 p->immd_nr = pc->immd_nr * 4; 2424 p->immd = pc->immd_buf; 2425 2426out_err: 2427 tgsi_parse_free(&parse); 2428 2429out_cleanup: 2430 free_nv50_pc(pc); 2431 return ret; 2432} 2433 2434static void 2435nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) 2436{ 2437 if (nv50_program_tx(p) == FALSE) 2438 assert(0); 2439 p->translated = TRUE; 2440} 2441 2442static void 2443nv50_program_upload_data(struct nv50_context *nv50, float *map, 2444 unsigned start, unsigned count, unsigned cbuf) 2445{ 2446 struct nouveau_channel *chan = nv50->screen->base.channel; 2447 struct nouveau_grobj *tesla = nv50->screen->tesla; 2448 2449 while (count) { 2450 unsigned nr = count > 2047 ? 2047 : count; 2451 2452 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2453 OUT_RING (chan, (cbuf << 0) | (start << 8)); 2454 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2455 OUT_RINGp (chan, map, nr); 2456 2457 map += nr; 2458 start += nr; 2459 count -= nr; 2460 } 2461} 2462 2463static void 2464nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) 2465{ 2466 struct pipe_screen *pscreen = nv50->pipe.screen; 2467 2468 if (!p->data[0] && p->immd_nr) { 2469 struct nouveau_resource *heap = nv50->screen->immd_heap[0]; 2470 2471 if (nouveau_resource_alloc(heap, p->immd_nr, p, &p->data[0])) { 2472 while (heap->next && heap->size < p->immd_nr) { 2473 struct nv50_program *evict = heap->next->priv; 2474 nouveau_resource_free(&evict->data[0]); 2475 } 2476 2477 if (nouveau_resource_alloc(heap, p->immd_nr, p, 2478 &p->data[0])) 2479 assert(0); 2480 } 2481 2482 /* immediates only need to be uploaded again when freed */ 2483 nv50_program_upload_data(nv50, p->immd, p->data[0]->start, 2484 p->immd_nr, NV50_CB_PMISC); 2485 } 2486 2487 if (!p->data[1] && p->param_nr) { 2488 struct nouveau_resource *heap = 2489 nv50->screen->parm_heap[p->type]; 2490 2491 if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) { 2492 while (heap->next && heap->size < p->param_nr) { 2493 struct nv50_program *evict = heap->next->priv; 2494 nouveau_resource_free(&evict->data[1]); 2495 } 2496 2497 if (nouveau_resource_alloc(heap, p->param_nr, p, 2498 &p->data[1])) 2499 assert(0); 2500 } 2501 } 2502 2503 if (p->param_nr) { 2504 unsigned cbuf = NV50_CB_PVP; 2505 float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type], 2506 PIPE_BUFFER_USAGE_CPU_READ); 2507 if (p->type == PIPE_SHADER_FRAGMENT) 2508 cbuf = NV50_CB_PFP; 2509 nv50_program_upload_data(nv50, map, p->data[1]->start, 2510 p->param_nr, cbuf); 2511 pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]); 2512 } 2513} 2514 2515static void 2516nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) 2517{ 2518 struct nouveau_channel *chan = nv50->screen->base.channel; 2519 struct nouveau_grobj *tesla = nv50->screen->tesla; 2520 struct nv50_program_exec *e; 2521 struct nouveau_stateobj *so; 2522 const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; 2523 unsigned start, count, *up, *ptr; 2524 boolean upload = FALSE; 2525 2526 if (!p->bo) { 2527 nouveau_bo_new(chan->device, NOUVEAU_BO_VRAM, 0x100, 2528 p->exec_size * 4, &p->bo); 2529 upload = TRUE; 2530 } 2531 2532 if ((p->data[0] && p->data[0]->start != p->data_start[0]) || 2533 (p->data[1] && p->data[1]->start != p->data_start[1])) { 2534 for (e = p->exec_head; e; e = e->next) { 2535 unsigned ei, ci, bs; 2536 2537 if (e->param.index < 0) 2538 continue; 2539 bs = (e->inst[1] >> 22) & 0x07; 2540 assert(bs < 2); 2541 ei = e->param.shift >> 5; 2542 ci = e->param.index + p->data[bs]->start; 2543 2544 e->inst[ei] &= ~e->param.mask; 2545 e->inst[ei] |= (ci << e->param.shift); 2546 } 2547 2548 if (p->data[0]) 2549 p->data_start[0] = p->data[0]->start; 2550 if (p->data[1]) 2551 p->data_start[1] = p->data[1]->start; 2552 2553 upload = TRUE; 2554 } 2555 2556 if (!upload) 2557 return; 2558 2559#ifdef NV50_PROGRAM_DUMP 2560 NOUVEAU_ERR("-------\n"); 2561 for (e = p->exec_head; e; e = e->next) { 2562 NOUVEAU_ERR("0x%08x\n", e->inst[0]); 2563 if (is_long(e)) 2564 NOUVEAU_ERR("0x%08x\n", e->inst[1]); 2565 } 2566#endif 2567 2568 up = ptr = MALLOC(p->exec_size * 4); 2569 for (e = p->exec_head; e; e = e->next) { 2570 *(ptr++) = e->inst[0]; 2571 if (is_long(e)) 2572 *(ptr++) = e->inst[1]; 2573 } 2574 2575 so = so_new(4,2); 2576 so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); 2577 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0); 2578 so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0); 2579 so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); 2580 2581 start = 0; count = p->exec_size; 2582 while (count) { 2583 struct nouveau_channel *chan = nv50->screen->base.channel; 2584 unsigned nr; 2585 2586 so_emit(chan, so); 2587 2588 nr = MIN2(count, 2047); 2589 nr = MIN2(chan->pushbuf->remaining, nr); 2590 if (chan->pushbuf->remaining < (nr + 3)) { 2591 FIRE_RING(chan); 2592 continue; 2593 } 2594 2595 BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); 2596 OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); 2597 BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); 2598 OUT_RINGp (chan, up + start, nr); 2599 2600 start += nr; 2601 count -= nr; 2602 } 2603 2604 FREE(up); 2605 so_ref(NULL, &so); 2606} 2607 2608void 2609nv50_vertprog_validate(struct nv50_context *nv50) 2610{ 2611 struct nouveau_grobj *tesla = nv50->screen->tesla; 2612 struct nv50_program *p = nv50->vertprog; 2613 struct nouveau_stateobj *so; 2614 2615 if (!p->translated) { 2616 nv50_program_validate(nv50, p); 2617 if (!p->translated) 2618 assert(0); 2619 } 2620 2621 nv50_program_validate_data(nv50, p); 2622 nv50_program_validate_code(nv50, p); 2623 2624 so = so_new(13, 2); 2625 so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); 2626 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2627 NOUVEAU_BO_HIGH, 0, 0); 2628 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2629 NOUVEAU_BO_LOW, 0, 0); 2630 so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); 2631 so_data (so, p->cfg.vp.attr[0]); 2632 so_data (so, p->cfg.vp.attr[1]); 2633 so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); 2634 so_data (so, p->cfg.high_result); 2635 so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); 2636 so_data (so, p->cfg.high_result); //8); 2637 so_data (so, p->cfg.high_temp); 2638 so_method(so, tesla, NV50TCL_VP_START_ID, 1); 2639 so_data (so, 0); /* program start offset */ 2640 so_ref(so, &nv50->state.vertprog); 2641 so_ref(NULL, &so); 2642} 2643 2644void 2645nv50_fragprog_validate(struct nv50_context *nv50) 2646{ 2647 struct nouveau_grobj *tesla = nv50->screen->tesla; 2648 struct nv50_program *p = nv50->fragprog; 2649 struct nouveau_stateobj *so; 2650 unsigned i; 2651 2652 if (!p->translated) { 2653 nv50_program_validate(nv50, p); 2654 if (!p->translated) 2655 assert(0); 2656 } 2657 2658 nv50_program_validate_data(nv50, p); 2659 nv50_program_validate_code(nv50, p); 2660 2661 so = so_new(64, 2); 2662 so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); 2663 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2664 NOUVEAU_BO_HIGH, 0, 0); 2665 so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | 2666 NOUVEAU_BO_LOW, 0, 0); 2667 so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); 2668 so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */ 2669 so_data (so, 0x00000004); 2670 so_data (so, 0x00000000); 2671 so_data (so, 0x00000000); 2672 so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map); 2673 for (i = 0; i < p->cfg.fp.high_map; i++) 2674 so_data(so, p->cfg.fp.map[i]); 2675 so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2); 2676 so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */ 2677 so_data (so, p->cfg.high_temp); 2678 so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); 2679 so_data (so, p->cfg.high_result); 2680 so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1); 2681 so_data (so, p->cfg.fp.regs[2]); 2682 so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); 2683 so_data (so, p->cfg.fp.regs[3]); 2684 so_method(so, tesla, NV50TCL_FP_START_ID, 1); 2685 so_data (so, 0); /* program start offset */ 2686 so_ref(so, &nv50->state.fragprog); 2687 so_ref(NULL, &so); 2688} 2689 2690void 2691nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 2692{ 2693 while (p->exec_head) { 2694 struct nv50_program_exec *e = p->exec_head; 2695 2696 p->exec_head = e->next; 2697 FREE(e); 2698 } 2699 p->exec_tail = NULL; 2700 p->exec_size = 0; 2701 2702 nouveau_bo_ref(NULL, &p->bo); 2703 2704 nouveau_resource_free(&p->data[0]); 2705 nouveau_resource_free(&p->data[1]); 2706 2707 p->translated = 0; 2708} 2709