1/* 2 * Copyright 2010 Christoph Bumiller 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice shall be included in 12 * all copies or substantial portions of the Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF 19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 * SOFTWARE. 21 */ 22 23#include "nv50_program.h" 24#include "nv50_context.h" 25 26#include "codegen/nv50_ir_driver.h" 27 28static INLINE unsigned 29bitcount4(const uint32_t val) 30{ 31 static const uint8_t cnt[16] 32 = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 33 return cnt[val & 0xf]; 34} 35 36static int 37nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info) 38{ 39 struct nv50_program *prog = (struct nv50_program *)info->driverPriv; 40 unsigned i, n, c; 41 42 n = 0; 43 for (i = 0; i < info->numInputs; ++i) { 44 prog->in[i].id = i; 45 prog->in[i].sn = info->in[i].sn; 46 prog->in[i].si = info->in[i].si; 47 prog->in[i].hw = n; 48 prog->in[i].mask = info->in[i].mask; 49 50 prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32); 51 52 for (c = 0; c < 4; ++c) 53 if (info->in[i].mask & (1 << c)) 54 info->in[i].slot[c] = n++; 55 } 56 prog->in_nr = info->numInputs; 57 58 for (i = 0; i < info->numSysVals; ++i) { 59 switch (info->sv[i].sn) { 60 case TGSI_SEMANTIC_INSTANCEID: 61 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID; 62 continue; 63 case TGSI_SEMANTIC_VERTEXID: 64 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID; 65 prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_UNK12; 66 continue; 67 default: 68 break; 69 } 70 } 71 72 /* 73 * Corner case: VP has no inputs, but we will still need to submit data to 74 * draw it. HW will shout at us and won't draw anything if we don't enable 75 * any input, so let's just pretend it's the first one. 76 */ 77 if (prog->vp.attrs[0] == 0 && 78 prog->vp.attrs[1] == 0 && 79 prog->vp.attrs[2] == 0) 80 prog->vp.attrs[0] |= 0xf; 81 82 /* VertexID before InstanceID */ 83 if (info->io.vertexId < info->numSysVals) 84 info->sv[info->io.vertexId].slot[0] = n++; 85 if (info->io.instanceId < info->numSysVals) 86 info->sv[info->io.instanceId].slot[0] = n++; 87 88 n = 0; 89 for (i = 0; i < info->numOutputs; ++i) { 90 switch (info->out[i].sn) { 91 case TGSI_SEMANTIC_PSIZE: 92 prog->vp.psiz = i; 93 break; 94 case TGSI_SEMANTIC_CLIPDIST: 95 prog->vp.clpd[info->out[i].si] = n; 96 break; 97 case TGSI_SEMANTIC_EDGEFLAG: 98 prog->vp.edgeflag = i; 99 break; 100 case TGSI_SEMANTIC_BCOLOR: 101 prog->vp.bfc[info->out[i].si] = i; 102 break; 103 default: 104 break; 105 } 106 prog->out[i].id = i; 107 prog->out[i].sn = info->out[i].sn; 108 prog->out[i].si = info->out[i].si; 109 prog->out[i].hw = n; 110 prog->out[i].mask = info->out[i].mask; 111 112 for (c = 0; c < 4; ++c) 113 if (info->out[i].mask & (1 << c)) 114 info->out[i].slot[c] = n++; 115 } 116 prog->out_nr = info->numOutputs; 117 prog->max_out = n; 118 119 if (prog->vp.psiz < info->numOutputs) 120 prog->vp.psiz = prog->out[prog->vp.psiz].hw; 121 122 return 0; 123} 124 125static int 126nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info) 127{ 128 struct nv50_program *prog = (struct nv50_program *)info->driverPriv; 129 unsigned i, n, m, c; 130 unsigned nvary; 131 unsigned nflat; 132 unsigned nintp = 0; 133 134 /* count recorded non-flat inputs */ 135 for (m = 0, i = 0; i < info->numInputs; ++i) { 136 switch (info->in[i].sn) { 137 case TGSI_SEMANTIC_POSITION: 138 case TGSI_SEMANTIC_FACE: 139 continue; 140 default: 141 m += info->in[i].flat ? 0 : 1; 142 break; 143 } 144 } 145 /* careful: id may be != i in info->in[prog->in[i].id] */ 146 147 /* Fill prog->in[] so that non-flat inputs are first and 148 * kick out special inputs that don't use the RESULT_MAP. 149 */ 150 for (n = 0, i = 0; i < info->numInputs; ++i) { 151 if (info->in[i].sn == TGSI_SEMANTIC_POSITION) { 152 prog->fp.interp |= info->in[i].mask << 24; 153 for (c = 0; c < 4; ++c) 154 if (info->in[i].mask & (1 << c)) 155 info->in[i].slot[c] = nintp++; 156 } else 157 if (info->in[i].sn == TGSI_SEMANTIC_FACE) { 158 info->in[i].slot[0] = 255; 159 } else { 160 unsigned j = info->in[i].flat ? m++ : n++; 161 162 if (info->in[i].sn == TGSI_SEMANTIC_COLOR) 163 prog->vp.bfc[info->in[i].si] = j; 164 165 prog->in[j].id = i; 166 prog->in[j].mask = info->in[i].mask; 167 prog->in[j].sn = info->in[i].sn; 168 prog->in[j].si = info->in[i].si; 169 prog->in[j].linear = info->in[i].linear; 170 171 prog->in_nr++; 172 } 173 } 174 if (!(prog->fp.interp & (8 << 24))) { 175 ++nintp; 176 prog->fp.interp |= 8 << 24; 177 } 178 179 for (i = 0; i < prog->in_nr; ++i) { 180 int j = prog->in[i].id; 181 182 prog->in[i].hw = nintp; 183 for (c = 0; c < 4; ++c) 184 if (prog->in[i].mask & (1 << c)) 185 info->in[j].slot[c] = nintp++; 186 } 187 /* (n == m) if m never increased, i.e. no flat inputs */ 188 nflat = (n < m) ? (nintp - prog->in[n].hw) : 0; 189 nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */ 190 nvary = nintp - nflat; 191 192 prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT; 193 prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT; 194 195 /* put front/back colors right after HPOS */ 196 prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT; 197 for (i = 0; i < 2; ++i) 198 if (prog->vp.bfc[i] < 0xff) 199 prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16; 200 201 /* FP outputs */ 202 203 if (info->prop.fp.numColourResults > 1) 204 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS; 205 206 for (i = 0; i < info->numOutputs; ++i) { 207 prog->out[i].id = i; 208 prog->out[i].sn = info->out[i].sn; 209 prog->out[i].si = info->out[i].si; 210 prog->out[i].mask = info->out[i].mask; 211 212 if (i == info->io.fragDepth || i == info->io.sampleMask) 213 continue; 214 prog->out[i].hw = info->out[i].si * 4; 215 216 for (c = 0; c < 4; ++c) 217 info->out[i].slot[c] = prog->out[i].hw + c; 218 219 prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4); 220 } 221 222 if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) 223 info->out[info->io.sampleMask].slot[0] = prog->max_out++; 224 225 if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS) 226 info->out[info->io.fragDepth].slot[2] = prog->max_out++; 227 228 if (!prog->max_out) 229 prog->max_out = 4; 230 231 return 0; 232} 233 234static int 235nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info) 236{ 237 switch (info->type) { 238 case PIPE_SHADER_VERTEX: 239 return nv50_vertprog_assign_slots(info); 240 case PIPE_SHADER_GEOMETRY: 241 return nv50_vertprog_assign_slots(info); 242 case PIPE_SHADER_FRAGMENT: 243 return nv50_fragprog_assign_slots(info); 244 default: 245 return -1; 246 } 247} 248 249static struct nv50_stream_output_state * 250nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info, 251 const struct pipe_stream_output_info *pso) 252{ 253 struct nv50_stream_output_state *so; 254 unsigned b, i, c; 255 unsigned base[4]; 256 257 so = MALLOC_STRUCT(nv50_stream_output_state); 258 if (!so) 259 return NULL; 260 memset(so->map, 0xff, sizeof(so->map)); 261 262 for (b = 0; b < 4; ++b) 263 so->num_attribs[b] = 0; 264 for (i = 0; i < pso->num_outputs; ++i) { 265 unsigned end = pso->output[i].dst_offset + pso->output[i].num_components; 266 b = pso->output[i].output_buffer; 267 assert(b < 4); 268 so->num_attribs[b] = MAX2(so->num_attribs[b], end); 269 } 270 271 so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED; 272 273 so->stride[0] = pso->stride[0] * 4; 274 base[0] = 0; 275 for (b = 1; b < 4; ++b) { 276 assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]); 277 so->stride[b] = so->num_attribs[b] * 4; 278 if (so->num_attribs[b]) 279 so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT; 280 base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4); 281 } 282 if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) { 283 assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX); 284 so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT; 285 } 286 287 so->map_size = base[3] + so->num_attribs[3]; 288 289 for (i = 0; i < pso->num_outputs; ++i) { 290 const unsigned s = pso->output[i].start_component; 291 const unsigned p = pso->output[i].dst_offset; 292 const unsigned r = pso->output[i].register_index; 293 b = pso->output[i].output_buffer; 294 295 for (c = 0; c < pso->output[i].num_components; ++c) 296 so->map[base[b] + p + c] = info->out[r].slot[s + c]; 297 } 298 299 return so; 300} 301 302boolean 303nv50_program_translate(struct nv50_program *prog, uint16_t chipset) 304{ 305 struct nv50_ir_prog_info *info; 306 int ret; 307 const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80; 308 309 info = CALLOC_STRUCT(nv50_ir_prog_info); 310 if (!info) 311 return FALSE; 312 313 info->type = prog->type; 314 info->target = chipset; 315 info->bin.sourceRep = NV50_PROGRAM_IR_TGSI; 316 info->bin.source = (void *)prog->pipe.tokens; 317 318 info->io.ucpBinding = 15; 319 info->io.ucpBase = 0; 320 info->io.genUserClip = prog->vp.clpd_nr; 321 322 info->assignSlots = nv50_program_assign_varying_slots; 323 324 prog->vp.bfc[0] = 0xff; 325 prog->vp.bfc[1] = 0xff; 326 prog->vp.edgeflag = 0xff; 327 prog->vp.clpd[0] = map_undef; 328 prog->vp.clpd[1] = map_undef; 329 prog->vp.psiz = map_undef; 330 prog->gp.primid = 0x80; 331 332 info->driverPriv = prog; 333 334#ifdef DEBUG 335 info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3); 336 info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0); 337#else 338 info->optLevel = 3; 339#endif 340 341 ret = nv50_ir_generate_code(info); 342 if (ret) { 343 NOUVEAU_ERR("shader translation failed: %i\n", ret); 344 goto out; 345 } 346 if (info->bin.syms) /* we don't need them yet */ 347 FREE(info->bin.syms); 348 349 prog->code = info->bin.code; 350 prog->code_size = info->bin.codeSize; 351 prog->fixups = info->bin.relocData; 352 prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1); 353 prog->tls_space = info->bin.tlsSpace; 354 355 if (prog->type == PIPE_SHADER_FRAGMENT) { 356 if (info->prop.fp.writesDepth) { 357 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z; 358 prog->fp.flags[1] = 0x11; 359 } 360 if (info->prop.fp.usesDiscard) 361 prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL; 362 } 363 364 if (prog->pipe.stream_output.num_outputs) 365 prog->so = nv50_program_create_strmout_state(info, 366 &prog->pipe.stream_output); 367 368out: 369 FREE(info); 370 return !ret; 371} 372 373boolean 374nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog) 375{ 376 struct nouveau_heap *heap; 377 int ret; 378 uint32_t size = align(prog->code_size, 0x40); 379 380 switch (prog->type) { 381 case PIPE_SHADER_VERTEX: heap = nv50->screen->vp_code_heap; break; 382 case PIPE_SHADER_GEOMETRY: heap = nv50->screen->fp_code_heap; break; 383 case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break; 384 default: 385 assert(!"invalid program type"); 386 return FALSE; 387 } 388 389 ret = nouveau_heap_alloc(heap, size, prog, &prog->mem); 390 if (ret) { 391 /* Out of space: evict everything to compactify the code segment, hoping 392 * the working set is much smaller and drifts slowly. Improve me ! 393 */ 394 while (heap->next) { 395 struct nv50_program *evict = heap->next->priv; 396 if (evict) 397 nouveau_heap_free(&evict->mem); 398 } 399 debug_printf("WARNING: out of code space, evicting all shaders.\n"); 400 } 401 prog->code_base = prog->mem->start; 402 403 ret = nv50_tls_realloc(nv50->screen, prog->tls_space); 404 if (ret < 0) 405 return FALSE; 406 if (ret > 0) 407 nv50->state.new_tls_space = TRUE; 408 409 if (prog->fixups) 410 nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0); 411 412 nv50_sifc_linear_u8(&nv50->base, nv50->screen->code, 413 (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base, 414 NOUVEAU_BO_VRAM, prog->code_size, prog->code); 415 416 BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1); 417 PUSH_DATA (nv50->base.pushbuf, 0); 418 419 return TRUE; 420} 421 422void 423nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) 424{ 425 const struct pipe_shader_state pipe = p->pipe; 426 const ubyte type = p->type; 427 428 if (p->mem) 429 nouveau_heap_free(&p->mem); 430 431 if (p->code) 432 FREE(p->code); 433 434 if (p->fixups) 435 FREE(p->fixups); 436 437 if (p->so) 438 FREE(p->so); 439 440 memset(p, 0, sizeof(*p)); 441 442 p->pipe = pipe; 443 p->type = type; 444} 445