1/*
2 * Copyright 2010 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include "nv50/nv50_program.h"
24#include "nv50/nv50_context.h"
25
26#include "codegen/nv50_ir_driver.h"
27
/**
 * Count the bits set in the lowest nibble of @val.
 *
 * Only bits 0..3 are looked at; all higher bits are ignored, so the result
 * is always in the range 0..4.
 */
static inline unsigned
bitcount4(const uint32_t val)
{
   const uint32_t nibble = val & 0xf;
   unsigned bits = 0;
   unsigned k;

   for (k = 0; k < 4; ++k)
      bits += (nibble >> k) & 1;

   return bits;
}
35
36static int
37nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
38{
39   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
40   unsigned i, n, c;
41
42   n = 0;
43   for (i = 0; i < info->numInputs; ++i) {
44      prog->in[i].id = i;
45      prog->in[i].sn = info->in[i].sn;
46      prog->in[i].si = info->in[i].si;
47      prog->in[i].hw = n;
48      prog->in[i].mask = info->in[i].mask;
49
50      prog->vp.attrs[(4 * i) / 32] |= info->in[i].mask << ((4 * i) % 32);
51
52      for (c = 0; c < 4; ++c)
53         if (info->in[i].mask & (1 << c))
54            info->in[i].slot[c] = n++;
55
56      if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
57         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
58   }
59   prog->in_nr = info->numInputs;
60
61   for (i = 0; i < info->numSysVals; ++i) {
62      switch (info->sv[i].sn) {
63      case TGSI_SEMANTIC_INSTANCEID:
64         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_INSTANCE_ID;
65         continue;
66      case TGSI_SEMANTIC_VERTEXID:
67         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
68         prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
69         continue;
70      default:
71         break;
72      }
73   }
74
75   /*
76    * Corner case: VP has no inputs, but we will still need to submit data to
77    * draw it. HW will shout at us and won't draw anything if we don't enable
78    * any input, so let's just pretend it's the first one.
79    */
80   if (prog->vp.attrs[0] == 0 &&
81       prog->vp.attrs[1] == 0 &&
82       prog->vp.attrs[2] == 0)
83      prog->vp.attrs[0] |= 0xf;
84
85   /* VertexID before InstanceID */
86   if (info->io.vertexId < info->numSysVals)
87      info->sv[info->io.vertexId].slot[0] = n++;
88   if (info->io.instanceId < info->numSysVals)
89      info->sv[info->io.instanceId].slot[0] = n++;
90
91   n = 0;
92   for (i = 0; i < info->numOutputs; ++i) {
93      switch (info->out[i].sn) {
94      case TGSI_SEMANTIC_PSIZE:
95         prog->vp.psiz = i;
96         break;
97      case TGSI_SEMANTIC_CLIPDIST:
98         prog->vp.clpd[info->out[i].si] = n;
99         break;
100      case TGSI_SEMANTIC_EDGEFLAG:
101         prog->vp.edgeflag = i;
102         break;
103      case TGSI_SEMANTIC_BCOLOR:
104         prog->vp.bfc[info->out[i].si] = i;
105         break;
106      case TGSI_SEMANTIC_LAYER:
107         prog->gp.has_layer = true;
108         prog->gp.layerid = n;
109         break;
110      case TGSI_SEMANTIC_VIEWPORT_INDEX:
111         prog->gp.has_viewport = true;
112         prog->gp.viewportid = n;
113         break;
114      default:
115         break;
116      }
117      prog->out[i].id = i;
118      prog->out[i].sn = info->out[i].sn;
119      prog->out[i].si = info->out[i].si;
120      prog->out[i].hw = n;
121      prog->out[i].mask = info->out[i].mask;
122
123      for (c = 0; c < 4; ++c)
124         if (info->out[i].mask & (1 << c))
125            info->out[i].slot[c] = n++;
126   }
127   prog->out_nr = info->numOutputs;
128   prog->max_out = n;
129   if (!prog->max_out)
130      prog->max_out = 1;
131
132   if (prog->vp.psiz < info->numOutputs)
133      prog->vp.psiz = prog->out[prog->vp.psiz].hw;
134
135   return 0;
136}
137
138static int
139nv50_fragprog_assign_slots(struct nv50_ir_prog_info *info)
140{
141   struct nv50_program *prog = (struct nv50_program *)info->driverPriv;
142   unsigned i, n, m, c;
143   unsigned nvary;
144   unsigned nflat;
145   unsigned nintp = 0;
146
147   /* count recorded non-flat inputs */
148   for (m = 0, i = 0; i < info->numInputs; ++i) {
149      switch (info->in[i].sn) {
150      case TGSI_SEMANTIC_POSITION:
151         continue;
152      default:
153         m += info->in[i].flat ? 0 : 1;
154         break;
155      }
156   }
157   /* careful: id may be != i in info->in[prog->in[i].id] */
158
159   /* Fill prog->in[] so that non-flat inputs are first and
160    * kick out special inputs that don't use the RESULT_MAP.
161    */
162   for (n = 0, i = 0; i < info->numInputs; ++i) {
163      if (info->in[i].sn == TGSI_SEMANTIC_POSITION) {
164         prog->fp.interp |= info->in[i].mask << 24;
165         for (c = 0; c < 4; ++c)
166            if (info->in[i].mask & (1 << c))
167               info->in[i].slot[c] = nintp++;
168      } else {
169         unsigned j = info->in[i].flat ? m++ : n++;
170
171         if (info->in[i].sn == TGSI_SEMANTIC_COLOR)
172            prog->vp.bfc[info->in[i].si] = j;
173         else if (info->in[i].sn == TGSI_SEMANTIC_PRIMID)
174            prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_PRIMITIVE_ID;
175
176         prog->in[j].id = i;
177         prog->in[j].mask = info->in[i].mask;
178         prog->in[j].sn = info->in[i].sn;
179         prog->in[j].si = info->in[i].si;
180         prog->in[j].linear = info->in[i].linear;
181
182         prog->in_nr++;
183      }
184   }
185   if (!(prog->fp.interp & (8 << 24))) {
186      ++nintp;
187      prog->fp.interp |= 8 << 24;
188   }
189
190   for (i = 0; i < prog->in_nr; ++i) {
191      int j = prog->in[i].id;
192
193      prog->in[i].hw = nintp;
194      for (c = 0; c < 4; ++c)
195         if (prog->in[i].mask & (1 << c))
196            info->in[j].slot[c] = nintp++;
197   }
198   /* (n == m) if m never increased, i.e. no flat inputs */
199   nflat = (n < m) ? (nintp - prog->in[n].hw) : 0;
200   nintp -= bitcount4(prog->fp.interp >> 24); /* subtract position inputs */
201   nvary = nintp - nflat;
202
203   prog->fp.interp |= nvary << NV50_3D_FP_INTERPOLANT_CTRL_COUNT_NONFLAT__SHIFT;
204   prog->fp.interp |= nintp << NV50_3D_FP_INTERPOLANT_CTRL_COUNT__SHIFT;
205
206   /* put front/back colors right after HPOS */
207   prog->fp.colors = 4 << NV50_3D_SEMANTIC_COLOR_FFC0_ID__SHIFT;
208   for (i = 0; i < 2; ++i)
209      if (prog->vp.bfc[i] < 0xff)
210         prog->fp.colors += bitcount4(prog->in[prog->vp.bfc[i]].mask) << 16;
211
212   /* FP outputs */
213
214   if (info->prop.fp.numColourResults > 1)
215      prog->fp.flags[0] |= NV50_3D_FP_CONTROL_MULTIPLE_RESULTS;
216
217   for (i = 0; i < info->numOutputs; ++i) {
218      prog->out[i].id = i;
219      prog->out[i].sn = info->out[i].sn;
220      prog->out[i].si = info->out[i].si;
221      prog->out[i].mask = info->out[i].mask;
222
223      if (i == info->io.fragDepth || i == info->io.sampleMask)
224         continue;
225      prog->out[i].hw = info->out[i].si * 4;
226
227      for (c = 0; c < 4; ++c)
228         info->out[i].slot[c] = prog->out[i].hw + c;
229
230      prog->max_out = MAX2(prog->max_out, prog->out[i].hw + 4);
231   }
232
233   if (info->io.sampleMask < PIPE_MAX_SHADER_OUTPUTS) {
234      info->out[info->io.sampleMask].slot[0] = prog->max_out++;
235      prog->fp.has_samplemask = 1;
236   }
237
238   if (info->io.fragDepth < PIPE_MAX_SHADER_OUTPUTS)
239      info->out[info->io.fragDepth].slot[2] = prog->max_out++;
240
241   if (!prog->max_out)
242      prog->max_out = 4;
243
244   return 0;
245}
246
247static int
248nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
249{
250   switch (info->type) {
251   case PIPE_SHADER_VERTEX:
252      return nv50_vertprog_assign_slots(info);
253   case PIPE_SHADER_GEOMETRY:
254      return nv50_vertprog_assign_slots(info);
255   case PIPE_SHADER_FRAGMENT:
256      return nv50_fragprog_assign_slots(info);
257   case PIPE_SHADER_COMPUTE:
258      return 0;
259   default:
260      return -1;
261   }
262}
263
264static struct nv50_stream_output_state *
265nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
266                                  const struct pipe_stream_output_info *pso)
267{
268   struct nv50_stream_output_state *so;
269   unsigned b, i, c;
270   unsigned base[4];
271
272   so = MALLOC_STRUCT(nv50_stream_output_state);
273   if (!so)
274      return NULL;
275   memset(so->map, 0xff, sizeof(so->map));
276
277   for (b = 0; b < 4; ++b)
278      so->num_attribs[b] = 0;
279   for (i = 0; i < pso->num_outputs; ++i) {
280      unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
281      b = pso->output[i].output_buffer;
282      assert(b < 4);
283      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
284   }
285
286   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
287
288   so->stride[0] = pso->stride[0] * 4;
289   base[0] = 0;
290   for (b = 1; b < 4; ++b) {
291      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
292      so->stride[b] = so->num_attribs[b] * 4;
293      if (so->num_attribs[b])
294         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
295      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
296   }
297   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
298      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
299      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
300   }
301
302   so->map_size = base[3] + so->num_attribs[3];
303
304   for (i = 0; i < pso->num_outputs; ++i) {
305      const unsigned s = pso->output[i].start_component;
306      const unsigned p = pso->output[i].dst_offset;
307      const unsigned r = pso->output[i].register_index;
308      b = pso->output[i].output_buffer;
309
310      if (r >= info->numOutputs)
311         continue;
312
313      for (c = 0; c < pso->output[i].num_components; ++c)
314         so->map[base[b] + p + c] = info->out[r].slot[s + c];
315   }
316
317   return so;
318}
319
320bool
321nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
322                       struct pipe_debug_callback *debug)
323{
324   struct nv50_ir_prog_info *info;
325   int i, ret;
326   const uint8_t map_undef = (prog->type == PIPE_SHADER_VERTEX) ? 0x40 : 0x80;
327
328   info = CALLOC_STRUCT(nv50_ir_prog_info);
329   if (!info)
330      return false;
331
332   info->type = prog->type;
333   info->target = chipset;
334   info->bin.sourceRep = NV50_PROGRAM_IR_TGSI;
335   info->bin.source = (void *)prog->pipe.tokens;
336
337   info->io.auxCBSlot = 15;
338   info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
339   info->io.genUserClip = prog->vp.clpd_nr;
340   if (prog->fp.alphatest)
341      info->io.alphaRefBase = NV50_CB_AUX_ALPHATEST_OFFSET;
342
343   info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
344   info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
345   info->io.msInfoCBSlot = 15;
346   info->io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
347
348   info->assignSlots = nv50_program_assign_varying_slots;
349
350   prog->vp.bfc[0] = 0xff;
351   prog->vp.bfc[1] = 0xff;
352   prog->vp.edgeflag = 0xff;
353   prog->vp.clpd[0] = map_undef;
354   prog->vp.clpd[1] = map_undef;
355   prog->vp.psiz = map_undef;
356   prog->gp.has_layer = 0;
357   prog->gp.has_viewport = 0;
358
359   if (prog->type == PIPE_SHADER_COMPUTE)
360      info->prop.cp.inputOffset = 0x10;
361
362   info->driverPriv = prog;
363
364#ifdef DEBUG
365   info->optLevel = debug_get_num_option("NV50_PROG_OPTIMIZE", 3);
366   info->dbgFlags = debug_get_num_option("NV50_PROG_DEBUG", 0);
367#else
368   info->optLevel = 3;
369#endif
370
371   ret = nv50_ir_generate_code(info);
372   if (ret) {
373      NOUVEAU_ERR("shader translation failed: %i\n", ret);
374      goto out;
375   }
376
377   prog->code = info->bin.code;
378   prog->code_size = info->bin.codeSize;
379   prog->fixups = info->bin.relocData;
380   prog->interps = info->bin.fixupData;
381   prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
382   prog->tls_space = info->bin.tlsSpace;
383
384   prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
385
386   prog->vp.clip_enable = (1 << info->io.clipDistances) - 1;
387   prog->vp.cull_enable =
388      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
389   prog->vp.clip_mode = 0;
390   for (i = 0; i < info->io.cullDistances; ++i)
391      prog->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
392
393   if (prog->type == PIPE_SHADER_FRAGMENT) {
394      if (info->prop.fp.writesDepth) {
395         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
396         prog->fp.flags[1] = 0x11;
397      }
398      if (info->prop.fp.usesDiscard)
399         prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
400   } else
401   if (prog->type == PIPE_SHADER_GEOMETRY) {
402      switch (info->prop.gp.outputPrim) {
403      case PIPE_PRIM_LINE_STRIP:
404         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_LINE_STRIP;
405         break;
406      case PIPE_PRIM_TRIANGLE_STRIP:
407         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_TRIANGLE_STRIP;
408         break;
409      case PIPE_PRIM_POINTS:
410      default:
411         assert(info->prop.gp.outputPrim == PIPE_PRIM_POINTS);
412         prog->gp.prim_type = NV50_3D_GP_OUTPUT_PRIMITIVE_TYPE_POINTS;
413         break;
414      }
415      prog->gp.vert_count = CLAMP(info->prop.gp.maxVertices, 1, 1024);
416   }
417
418   if (prog->type == PIPE_SHADER_COMPUTE) {
419      prog->cp.syms = info->bin.syms;
420      prog->cp.num_syms = info->bin.numSyms;
421   } else {
422      FREE(info->bin.syms);
423   }
424
425   if (prog->pipe.stream_output.num_outputs)
426      prog->so = nv50_program_create_strmout_state(info,
427                                                   &prog->pipe.stream_output);
428
429   pipe_debug_message(debug, SHADER_INFO,
430                      "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
431                      prog->type, info->bin.tlsSpace, prog->max_gpr,
432                      info->bin.instructions, info->bin.codeSize);
433
434out:
435   FREE(info);
436   return !ret;
437}
438
439bool
440nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
441{
442   struct nouveau_heap *heap;
443   int ret;
444   uint32_t size = align(prog->code_size, 0x40);
445   uint8_t prog_type;
446
447   switch (prog->type) {
448   case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
449   case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
450   case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
451   case PIPE_SHADER_COMPUTE:  heap = nv50->screen->fp_code_heap; break;
452   default:
453      assert(!"invalid program type");
454      return false;
455   }
456
457   ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
458   if (ret) {
459      /* Out of space: evict everything to compactify the code segment, hoping
460       * the working set is much smaller and drifts slowly. Improve me !
461       */
462      while (heap->next) {
463         struct nv50_program *evict = heap->next->priv;
464         if (evict)
465            nouveau_heap_free(&evict->mem);
466      }
467      debug_printf("WARNING: out of code space, evicting all shaders.\n");
468      ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
469      if (ret) {
470         NOUVEAU_ERR("shader too large (0x%x) to fit in code space ?\n", size);
471         return false;
472      }
473   }
474
475   if (prog->type == PIPE_SHADER_COMPUTE) {
476      /* CP code must be uploaded in FP code segment. */
477      prog_type = 1;
478   } else {
479      prog->code_base = prog->mem->start;
480      prog_type = prog->type;
481   }
482
483   ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
484   if (ret < 0) {
485      nouveau_heap_free(&prog->mem);
486      return false;
487   }
488   if (ret > 0)
489      nv50->state.new_tls_space = true;
490
491   if (prog->fixups)
492      nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
493   if (prog->interps)
494      nv50_ir_apply_fixups(prog->interps, prog->code,
495                           prog->fp.force_persample_interp,
496                           false /* flatshade */,
497                           prog->fp.alphatest - 1);
498
499   nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
500                       (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
501                       NOUVEAU_BO_VRAM, prog->code_size, prog->code);
502
503   BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
504   PUSH_DATA (nv50->base.pushbuf, 0);
505
506   return true;
507}
508
509void
510nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
511{
512   const struct pipe_shader_state pipe = p->pipe;
513   const ubyte type = p->type;
514
515   if (p->mem)
516      nouveau_heap_free(&p->mem);
517
518   FREE(p->code);
519
520   FREE(p->fixups);
521   FREE(p->interps);
522   FREE(p->so);
523
524   if (type == PIPE_SHADER_COMPUTE)
525      FREE(p->cp.syms);
526
527   memset(p, 0, sizeof(*p));
528
529   p->pipe = pipe;
530   p->type = type;
531}
532