brw_wm_emit.c revision 9b4053cabd8bda180b352d2d2047209f6ca5f6e8
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "main/macros.h"
34#include "brw_context.h"
35#include "brw_wm.h"
36
37static bool
38can_do_pln(struct intel_context *intel, const struct brw_reg *deltas)
39{
40   struct brw_context *brw = brw_context(&intel->ctx);
41
42   if (!brw->has_pln)
43      return false;
44
45   if (deltas[1].nr != deltas[0].nr + 1)
46      return false;
47
48   if (intel->gen < 6 && ((deltas[0].nr & 1) != 0))
49      return false;
50
51   return true;
52}
53
54/* Return the SrcReg index of the channels that can be immediate float operands
55 * instead of usage of PROGRAM_CONSTANT values through push/pull.
56 */
57bool
58brw_wm_arg_can_be_immediate(enum prog_opcode opcode, int arg)
59{
60   int opcode_array[] = {
61      [OPCODE_ADD] = 2,
62      [OPCODE_CMP] = 3,
63      [OPCODE_DP3] = 2,
64      [OPCODE_DP4] = 2,
65      [OPCODE_DPH] = 2,
66      [OPCODE_MAX] = 2,
67      [OPCODE_MIN] = 2,
68      [OPCODE_MOV] = 1,
69      [OPCODE_MUL] = 2,
70      [OPCODE_SEQ] = 2,
71      [OPCODE_SGE] = 2,
72      [OPCODE_SGT] = 2,
73      [OPCODE_SLE] = 2,
74      [OPCODE_SLT] = 2,
75      [OPCODE_SNE] = 2,
76      [OPCODE_SWZ] = 1,
77      [OPCODE_XPD] = 2,
78   };
79
80   /* These opcodes get broken down in a way that allow two
81    * args to be immediates.
82    */
83   if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
84      if (arg == 1 || arg == 2)
85	 return true;
86   }
87
88   if (opcode > ARRAY_SIZE(opcode_array))
89      return false;
90
91   return arg == opcode_array[opcode] - 1;
92}
93
94/**
95 * Computes the screen-space x,y position of the pixels.
96 *
97 * This will be used by emit_delta_xy() or emit_wpos_xy() for
98 * interpolation of attributes..
99 *
100 * Payload R0:
101 *
102 * R0.0 -- pixel mask, one bit for each of 4 pixels in 4 tiles,
103 *         corresponding to each of the 16 execution channels.
104 * R0.1..8 -- ?
105 * R1.0 -- triangle vertex 0.X
106 * R1.1 -- triangle vertex 0.Y
107 * R1.2 -- tile 0 x,y coords (2 packed uwords)
108 * R1.3 -- tile 1 x,y coords (2 packed uwords)
109 * R1.4 -- tile 2 x,y coords (2 packed uwords)
110 * R1.5 -- tile 3 x,y coords (2 packed uwords)
111 * R1.6 -- ?
112 * R1.7 -- ?
113 * R1.8 -- ?
114 */
115void emit_pixel_xy(struct brw_wm_compile *c,
116		   const struct brw_reg *dst,
117		   GLuint mask)
118{
119   struct brw_compile *p = &c->func;
120   struct brw_reg r1 = brw_vec1_grf(1, 0);
121   struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
122   struct brw_reg dst0_uw, dst1_uw;
123
124   brw_push_insn_state(p);
125   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
126
127   if (c->dispatch_width == 16) {
128      dst0_uw = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
129      dst1_uw = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
130   } else {
131      dst0_uw = vec8(retype(dst[0], BRW_REGISTER_TYPE_UW));
132      dst1_uw = vec8(retype(dst[1], BRW_REGISTER_TYPE_UW));
133   }
134
135   /* Calculate pixel centers by adding 1 or 0 to each of the
136    * micro-tile coordinates passed in r1.
137    */
138   if (mask & WRITEMASK_X) {
139      brw_ADD(p,
140	      dst0_uw,
141	      stride(suboffset(r1_uw, 4), 2, 4, 0),
142	      brw_imm_v(0x10101010));
143   }
144
145   if (mask & WRITEMASK_Y) {
146      brw_ADD(p,
147	      dst1_uw,
148	      stride(suboffset(r1_uw,5), 2, 4, 0),
149	      brw_imm_v(0x11001100));
150   }
151   brw_pop_insn_state(p);
152}
153
154/**
155 * Computes the screen-space x,y distance of the pixels from the start
156 * vertex.
157 *
158 * This will be used in linterp or pinterp with the start vertex value
159 * and the Cx, Cy, and C0 coefficients passed in from the setup engine
160 * to produce interpolated attribute values.
161 */
162void emit_delta_xy(struct brw_compile *p,
163		   const struct brw_reg *dst,
164		   GLuint mask,
165		   const struct brw_reg *arg0)
166{
167   struct intel_context *intel = &p->brw->intel;
168   struct brw_reg r1 = brw_vec1_grf(1, 0);
169
170   if (mask == 0)
171      return;
172
173   assert(mask == WRITEMASK_XY);
174
175   if (intel->gen >= 6) {
176       /* XXX Gen6 WM doesn't have Xstart/Ystart in payload r1.0/r1.1.
177	  Just add them with 0.0 for dst reg.. */
178       r1 = brw_imm_v(0x00000000);
179       brw_ADD(p,
180	       dst[0],
181	       retype(arg0[0], BRW_REGISTER_TYPE_UW),
182	       r1);
183       brw_ADD(p,
184	       dst[1],
185	       retype(arg0[1], BRW_REGISTER_TYPE_UW),
186	       r1);
187       return;
188   }
189
190   /* Calc delta X,Y by subtracting origin in r1 from the pixel
191    * centers produced by emit_pixel_xy().
192    */
193   brw_ADD(p,
194	   dst[0],
195	   retype(arg0[0], BRW_REGISTER_TYPE_UW),
196	   negate(r1));
197   brw_ADD(p,
198	   dst[1],
199	   retype(arg0[1], BRW_REGISTER_TYPE_UW),
200	   negate(suboffset(r1,1)));
201}
202
203/**
204 * Computes the pixel offset from the window origin for gl_FragCoord().
205 */
206void emit_wpos_xy(struct brw_wm_compile *c,
207		  const struct brw_reg *dst,
208		  GLuint mask,
209		  const struct brw_reg *arg0)
210{
211   struct brw_compile *p = &c->func;
212   struct intel_context *intel = &p->brw->intel;
213   struct brw_reg delta_x = retype(arg0[0], BRW_REGISTER_TYPE_W);
214   struct brw_reg delta_y = retype(arg0[1], BRW_REGISTER_TYPE_W);
215
216   if (mask & WRITEMASK_X) {
217      if (intel->gen >= 6) {
218	 struct brw_reg delta_x_f = retype(delta_x, BRW_REGISTER_TYPE_F);
219	 brw_MOV(p, delta_x_f, delta_x);
220	 delta_x = delta_x_f;
221      }
222
223      if (c->fp->program.PixelCenterInteger) {
224	 /* X' = X */
225	 brw_MOV(p, dst[0], delta_x);
226      } else {
227	 /* X' = X + 0.5 */
228	 brw_ADD(p, dst[0], delta_x, brw_imm_f(0.5));
229      }
230   }
231
232   if (mask & WRITEMASK_Y) {
233      if (intel->gen >= 6) {
234	 struct brw_reg delta_y_f = retype(delta_y, BRW_REGISTER_TYPE_F);
235	 brw_MOV(p, delta_y_f, delta_y);
236	 delta_y = delta_y_f;
237      }
238
239      if (c->fp->program.OriginUpperLeft) {
240	 if (c->fp->program.PixelCenterInteger) {
241	    /* Y' = Y */
242	    brw_MOV(p, dst[1], delta_y);
243	 } else {
244	    brw_ADD(p, dst[1], delta_y, brw_imm_f(0.5));
245	 }
246      } else {
247	 float center_offset = c->fp->program.PixelCenterInteger ? 0.0 : 0.5;
248
249	 /* Y' = (height - 1) - Y + center */
250	 brw_ADD(p, dst[1], negate(delta_y),
251		 brw_imm_f(c->key.drawable_height - 1 + center_offset));
252      }
253   }
254}
255
256
257void emit_pixel_w(struct brw_wm_compile *c,
258		  const struct brw_reg *dst,
259		  GLuint mask,
260		  const struct brw_reg *arg0,
261		  const struct brw_reg *deltas)
262{
263   struct brw_compile *p = &c->func;
264   struct intel_context *intel = &p->brw->intel;
265   struct brw_reg src;
266   struct brw_reg temp_dst;
267
268   if (intel->gen >= 6)
269	temp_dst = dst[3];
270   else
271	temp_dst = brw_message_reg(2);
272
273   assert(intel->gen < 6);
274
275   /* Don't need this if all you are doing is interpolating color, for
276    * instance.
277    */
278   if (mask & WRITEMASK_W) {
279      struct brw_reg interp3 = brw_vec1_grf(arg0[0].nr+1, 4);
280
281      /* Calc 1/w - just linterp wpos[3] optimized by putting the
282       * result straight into a message reg.
283       */
284      if (can_do_pln(intel, deltas)) {
285	 brw_PLN(p, temp_dst, interp3, deltas[0]);
286      } else {
287	 brw_LINE(p, brw_null_reg(), interp3, deltas[0]);
288	 brw_MAC(p, temp_dst, suboffset(interp3, 1), deltas[1]);
289      }
290
291      /* Calc w */
292      if (intel->gen >= 6)
293	 src = temp_dst;
294      else
295	 src = brw_null_reg();
296
297      if (c->dispatch_width == 16) {
298	 brw_math_16(p, dst[3],
299		     BRW_MATH_FUNCTION_INV,
300		     2, src,
301		     BRW_MATH_PRECISION_FULL);
302      } else {
303	 brw_math(p, dst[3],
304		  BRW_MATH_FUNCTION_INV,
305		  2, src,
306		  BRW_MATH_DATA_VECTOR,
307		  BRW_MATH_PRECISION_FULL);
308      }
309   }
310}
311
312void emit_linterp(struct brw_compile *p,
313		  const struct brw_reg *dst,
314		  GLuint mask,
315		  const struct brw_reg *arg0,
316		  const struct brw_reg *deltas)
317{
318   struct intel_context *intel = &p->brw->intel;
319   struct brw_reg interp[4];
320   GLuint nr = arg0[0].nr;
321   GLuint i;
322
323   interp[0] = brw_vec1_grf(nr, 0);
324   interp[1] = brw_vec1_grf(nr, 4);
325   interp[2] = brw_vec1_grf(nr+1, 0);
326   interp[3] = brw_vec1_grf(nr+1, 4);
327
328   for (i = 0; i < 4; i++) {
329      if (mask & (1<<i)) {
330	 if (intel->gen >= 6) {
331	    brw_PLN(p, dst[i], interp[i], brw_vec8_grf(2, 0));
332	 } else if (can_do_pln(intel, deltas)) {
333	    brw_PLN(p, dst[i], interp[i], deltas[0]);
334	 } else {
335	    brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
336	    brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
337	 }
338      }
339   }
340}
341
342
343void emit_pinterp(struct brw_compile *p,
344		  const struct brw_reg *dst,
345		  GLuint mask,
346		  const struct brw_reg *arg0,
347		  const struct brw_reg *deltas,
348		  const struct brw_reg *w)
349{
350   struct intel_context *intel = &p->brw->intel;
351   struct brw_reg interp[4];
352   GLuint nr = arg0[0].nr;
353   GLuint i;
354
355   if (intel->gen >= 6) {
356      emit_linterp(p, dst, mask, arg0, interp);
357      return;
358   }
359
360   interp[0] = brw_vec1_grf(nr, 0);
361   interp[1] = brw_vec1_grf(nr, 4);
362   interp[2] = brw_vec1_grf(nr+1, 0);
363   interp[3] = brw_vec1_grf(nr+1, 4);
364
365   for (i = 0; i < 4; i++) {
366      if (mask & (1<<i)) {
367	 if (can_do_pln(intel, deltas)) {
368	    brw_PLN(p, dst[i], interp[i], deltas[0]);
369	 } else {
370	    brw_LINE(p, brw_null_reg(), interp[i], deltas[0]);
371	    brw_MAC(p, dst[i], suboffset(interp[i],1), deltas[1]);
372	 }
373      }
374   }
375   for (i = 0; i < 4; i++) {
376      if (mask & (1<<i)) {
377	 brw_MUL(p, dst[i], dst[i], w[3]);
378      }
379   }
380}
381
382
383void emit_cinterp(struct brw_compile *p,
384		  const struct brw_reg *dst,
385		  GLuint mask,
386		  const struct brw_reg *arg0)
387{
388   struct brw_reg interp[4];
389   GLuint nr = arg0[0].nr;
390   GLuint i;
391
392   interp[0] = brw_vec1_grf(nr, 0);
393   interp[1] = brw_vec1_grf(nr, 4);
394   interp[2] = brw_vec1_grf(nr+1, 0);
395   interp[3] = brw_vec1_grf(nr+1, 4);
396
397   for (i = 0; i < 4; i++) {
398      if (mask & (1<<i)) {
399         brw_MOV(p, dst[i], suboffset(interp[i],3));	/* TODO: optimize away like other moves */
400      }
401   }
402}
403
404/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
405void emit_frontfacing(struct brw_compile *p,
406		      const struct brw_reg *dst,
407		      GLuint mask)
408{
409   struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
410   GLuint i;
411
412   if (!(mask & WRITEMASK_XYZW))
413      return;
414
415   for (i = 0; i < 4; i++) {
416      if (mask & (1<<i)) {
417	 brw_MOV(p, dst[i], brw_imm_f(0.0));
418      }
419   }
420
421   /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
422    * us front face
423    */
424   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
425   for (i = 0; i < 4; i++) {
426      if (mask & (1<<i)) {
427	 brw_MOV(p, dst[i], brw_imm_f(1.0));
428      }
429   }
430   brw_set_predicate_control_flag_value(p, 0xff);
431}
432
433/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
434 * looking like:
435 *
436 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
437 *
438 * and we're trying to produce:
439 *
440 *           DDX                     DDY
441 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
442 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
443 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
444 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
445 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
446 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
447 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
448 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
449 *
450 * and add another set of two more subspans if in 16-pixel dispatch mode.
451 *
452 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
453 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
454 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
455 * between each other.  We could probably do it like ddx and swizzle the right
456 * order later, but bail for now and just produce
457 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
458 *
459 * The negate_value boolean is used to negate the d/dy computation for FBOs,
460 * since they place the origin at the upper left instead of the lower left.
461 */
462void emit_ddxy(struct brw_compile *p,
463	       const struct brw_reg *dst,
464	       GLuint mask,
465	       bool is_ddx,
466	       const struct brw_reg *arg0,
467               bool negate_value)
468{
469   int i;
470   struct brw_reg src0, src1;
471
472   if (mask & SATURATE)
473      brw_set_saturate(p, 1);
474   for (i = 0; i < 4; i++ ) {
475      if (mask & (1<<i)) {
476	 if (is_ddx) {
477	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
478			   BRW_REGISTER_TYPE_F,
479			   BRW_VERTICAL_STRIDE_2,
480			   BRW_WIDTH_2,
481			   BRW_HORIZONTAL_STRIDE_0,
482			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
483	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
484			   BRW_REGISTER_TYPE_F,
485			   BRW_VERTICAL_STRIDE_2,
486			   BRW_WIDTH_2,
487			   BRW_HORIZONTAL_STRIDE_0,
488			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
489	 } else {
490	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
491			   BRW_REGISTER_TYPE_F,
492			   BRW_VERTICAL_STRIDE_4,
493			   BRW_WIDTH_4,
494			   BRW_HORIZONTAL_STRIDE_0,
495			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
496	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
497			   BRW_REGISTER_TYPE_F,
498			   BRW_VERTICAL_STRIDE_4,
499			   BRW_WIDTH_4,
500			   BRW_HORIZONTAL_STRIDE_0,
501			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
502	 }
503         if (negate_value)
504            brw_ADD(p, dst[i], src1, negate(src0));
505         else
506            brw_ADD(p, dst[i], src0, negate(src1));
507      }
508   }
509   if (mask & SATURATE)
510      brw_set_saturate(p, 0);
511}
512
513void emit_alu1(struct brw_compile *p,
514	       struct brw_instruction *(*func)(struct brw_compile *,
515					       struct brw_reg,
516					       struct brw_reg),
517	       const struct brw_reg *dst,
518	       GLuint mask,
519	       const struct brw_reg *arg0)
520{
521   GLuint i;
522
523   if (mask & SATURATE)
524      brw_set_saturate(p, 1);
525
526   for (i = 0; i < 4; i++) {
527      if (mask & (1<<i)) {
528	 func(p, dst[i], arg0[i]);
529      }
530   }
531
532   if (mask & SATURATE)
533      brw_set_saturate(p, 0);
534}
535
536
537void emit_alu2(struct brw_compile *p,
538	       struct brw_instruction *(*func)(struct brw_compile *,
539					       struct brw_reg,
540					       struct brw_reg,
541					       struct brw_reg),
542	       const struct brw_reg *dst,
543	       GLuint mask,
544	       const struct brw_reg *arg0,
545	       const struct brw_reg *arg1)
546{
547   GLuint i;
548
549   if (mask & SATURATE)
550      brw_set_saturate(p, 1);
551
552   for (i = 0; i < 4; i++) {
553      if (mask & (1<<i)) {
554	 func(p, dst[i], arg0[i], arg1[i]);
555      }
556   }
557
558   if (mask & SATURATE)
559      brw_set_saturate(p, 0);
560}
561
562
563void emit_mad(struct brw_compile *p,
564	      const struct brw_reg *dst,
565	      GLuint mask,
566	      const struct brw_reg *arg0,
567	      const struct brw_reg *arg1,
568	      const struct brw_reg *arg2)
569{
570   GLuint i;
571
572   for (i = 0; i < 4; i++) {
573      if (mask & (1<<i)) {
574	 brw_MUL(p, dst[i], arg0[i], arg1[i]);
575
576	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
577	 brw_ADD(p, dst[i], dst[i], arg2[i]);
578	 brw_set_saturate(p, 0);
579      }
580   }
581}
582
583void emit_lrp(struct brw_compile *p,
584	      const struct brw_reg *dst,
585	      GLuint mask,
586	      const struct brw_reg *arg0,
587	      const struct brw_reg *arg1,
588	      const struct brw_reg *arg2)
589{
590   GLuint i;
591
592   /* Uses dst as a temporary:
593    */
594   for (i = 0; i < 4; i++) {
595      if (mask & (1<<i)) {
596	 /* Can I use the LINE instruction for this?
597	  */
598	 brw_ADD(p, dst[i], negate(arg0[i]), brw_imm_f(1.0));
599	 brw_MUL(p, brw_null_reg(), dst[i], arg2[i]);
600
601	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
602	 brw_MAC(p, dst[i], arg0[i], arg1[i]);
603	 brw_set_saturate(p, 0);
604      }
605   }
606}
607
608void emit_sop(struct brw_compile *p,
609	      const struct brw_reg *dst,
610	      GLuint mask,
611	      GLuint cond,
612	      const struct brw_reg *arg0,
613	      const struct brw_reg *arg1)
614{
615   GLuint i;
616
617   for (i = 0; i < 4; i++) {
618      if (mask & (1<<i)) {
619	 brw_push_insn_state(p);
620	 brw_CMP(p, brw_null_reg(), cond, arg0[i], arg1[i]);
621	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
622	 brw_MOV(p, dst[i], brw_imm_f(0));
623	 brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
624	 brw_MOV(p, dst[i], brw_imm_f(1.0));
625	 brw_pop_insn_state(p);
626      }
627   }
628}
629
630static void emit_slt( struct brw_compile *p,
631		      const struct brw_reg *dst,
632		      GLuint mask,
633		      const struct brw_reg *arg0,
634		      const struct brw_reg *arg1 )
635{
636   emit_sop(p, dst, mask, BRW_CONDITIONAL_L, arg0, arg1);
637}
638
639static void emit_sle( struct brw_compile *p,
640		      const struct brw_reg *dst,
641		      GLuint mask,
642		      const struct brw_reg *arg0,
643		      const struct brw_reg *arg1 )
644{
645   emit_sop(p, dst, mask, BRW_CONDITIONAL_LE, arg0, arg1);
646}
647
648static void emit_sgt( struct brw_compile *p,
649		      const struct brw_reg *dst,
650		      GLuint mask,
651		      const struct brw_reg *arg0,
652		      const struct brw_reg *arg1 )
653{
654   emit_sop(p, dst, mask, BRW_CONDITIONAL_G, arg0, arg1);
655}
656
657static void emit_sge( struct brw_compile *p,
658		      const struct brw_reg *dst,
659		      GLuint mask,
660		      const struct brw_reg *arg0,
661		      const struct brw_reg *arg1 )
662{
663   emit_sop(p, dst, mask, BRW_CONDITIONAL_GE, arg0, arg1);
664}
665
666static void emit_seq( struct brw_compile *p,
667		      const struct brw_reg *dst,
668		      GLuint mask,
669		      const struct brw_reg *arg0,
670		      const struct brw_reg *arg1 )
671{
672   emit_sop(p, dst, mask, BRW_CONDITIONAL_EQ, arg0, arg1);
673}
674
675static void emit_sne( struct brw_compile *p,
676		      const struct brw_reg *dst,
677		      GLuint mask,
678		      const struct brw_reg *arg0,
679		      const struct brw_reg *arg1 )
680{
681   emit_sop(p, dst, mask, BRW_CONDITIONAL_NEQ, arg0, arg1);
682}
683
684void emit_cmp(struct brw_compile *p,
685	      const struct brw_reg *dst,
686	      GLuint mask,
687	      const struct brw_reg *arg0,
688	      const struct brw_reg *arg1,
689	      const struct brw_reg *arg2)
690{
691   GLuint i;
692
693   for (i = 0; i < 4; i++) {
694      if (mask & (1<<i)) {
695	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
696
697	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
698	 brw_SEL(p, dst[i], arg1[i], arg2[i]);
699	 brw_set_saturate(p, 0);
700	 brw_set_predicate_control_flag_value(p, 0xff);
701      }
702   }
703}
704
705void emit_sign(struct brw_compile *p,
706	       const struct brw_reg *dst,
707	       GLuint mask,
708	       const struct brw_reg *arg0)
709{
710   GLuint i;
711
712   for (i = 0; i < 4; i++) {
713      if (mask & (1<<i)) {
714	 brw_MOV(p, dst[i], brw_imm_f(0.0));
715
716	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], brw_imm_f(0));
717	 brw_MOV(p, dst[i], brw_imm_f(-1.0));
718	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
719
720	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0[i], brw_imm_f(0));
721	 brw_MOV(p, dst[i], brw_imm_f(1.0));
722	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
723      }
724   }
725}
726
727void emit_max(struct brw_compile *p,
728	      const struct brw_reg *dst,
729	      GLuint mask,
730	      const struct brw_reg *arg0,
731	      const struct brw_reg *arg1)
732{
733   GLuint i;
734
735   for (i = 0; i < 4; i++) {
736      if (mask & (1<<i)) {
737	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], arg1[i]);
738
739	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
740	 brw_SEL(p, dst[i], arg0[i], arg1[i]);
741	 brw_set_saturate(p, 0);
742	 brw_set_predicate_control_flag_value(p, 0xff);
743      }
744   }
745}
746
747void emit_min(struct brw_compile *p,
748	      const struct brw_reg *dst,
749	      GLuint mask,
750	      const struct brw_reg *arg0,
751	      const struct brw_reg *arg1)
752{
753   GLuint i;
754
755   for (i = 0; i < 4; i++) {
756      if (mask & (1<<i)) {
757	 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0[i], arg1[i]);
758
759	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
760	 brw_SEL(p, dst[i], arg0[i], arg1[i]);
761	 brw_set_saturate(p, 0);
762	 brw_set_predicate_control_flag_value(p, 0xff);
763      }
764   }
765}
766
767
768void emit_dp2(struct brw_compile *p,
769	      const struct brw_reg *dst,
770	      GLuint mask,
771	      const struct brw_reg *arg0,
772	      const struct brw_reg *arg1)
773{
774   int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
775
776   if (!(mask & WRITEMASK_XYZW))
777      return; /* Do not emit dead code */
778
779   assert(is_power_of_two(mask & WRITEMASK_XYZW));
780
781   brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
782
783   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
784   brw_MAC(p, dst[dst_chan], arg0[1], arg1[1]);
785   brw_set_saturate(p, 0);
786}
787
788
789void emit_dp3(struct brw_compile *p,
790	      const struct brw_reg *dst,
791	      GLuint mask,
792	      const struct brw_reg *arg0,
793	      const struct brw_reg *arg1)
794{
795   int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
796
797   if (!(mask & WRITEMASK_XYZW))
798      return; /* Do not emit dead code */
799
800   assert(is_power_of_two(mask & WRITEMASK_XYZW));
801
802   brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
803   brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
804
805   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
806   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
807   brw_set_saturate(p, 0);
808}
809
810
811void emit_dp4(struct brw_compile *p,
812	      const struct brw_reg *dst,
813	      GLuint mask,
814	      const struct brw_reg *arg0,
815	      const struct brw_reg *arg1)
816{
817   int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
818
819   if (!(mask & WRITEMASK_XYZW))
820      return; /* Do not emit dead code */
821
822   assert(is_power_of_two(mask & WRITEMASK_XYZW));
823
824   brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
825   brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
826   brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
827
828   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
829   brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
830   brw_set_saturate(p, 0);
831}
832
833
834void emit_dph(struct brw_compile *p,
835	      const struct brw_reg *dst,
836	      GLuint mask,
837	      const struct brw_reg *arg0,
838	      const struct brw_reg *arg1)
839{
840   const int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
841
842   if (!(mask & WRITEMASK_XYZW))
843      return; /* Do not emit dead code */
844
845   assert(is_power_of_two(mask & WRITEMASK_XYZW));
846
847   brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
848   brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
849   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
850
851   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
852   brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
853   brw_set_saturate(p, 0);
854}
855
856
857void emit_xpd(struct brw_compile *p,
858	      const struct brw_reg *dst,
859	      GLuint mask,
860	      const struct brw_reg *arg0,
861	      const struct brw_reg *arg1)
862{
863   GLuint i;
864
865   assert((mask & WRITEMASK_W) != WRITEMASK_W);
866
867   for (i = 0 ; i < 3; i++) {
868      if (mask & (1<<i)) {
869	 GLuint i2 = (i+2)%3;
870	 GLuint i1 = (i+1)%3;
871
872	 brw_MUL(p, brw_null_reg(), negate(arg0[i2]), arg1[i1]);
873
874	 brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
875	 brw_MAC(p, dst[i], arg0[i1], arg1[i2]);
876	 brw_set_saturate(p, 0);
877      }
878   }
879}
880
881
882void emit_math1(struct brw_wm_compile *c,
883		GLuint function,
884		const struct brw_reg *dst,
885		GLuint mask,
886		const struct brw_reg *arg0)
887{
888   struct brw_compile *p = &c->func;
889   struct intel_context *intel = &p->brw->intel;
890   int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
891   struct brw_reg src;
892
893   if (!(mask & WRITEMASK_XYZW))
894      return; /* Do not emit dead code */
895
896   assert(is_power_of_two(mask & WRITEMASK_XYZW));
897
898   if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
899			    arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
900			   arg0[0].negate || arg0[0].abs)) {
901      /* Gen6 math requires that source and dst horizontal stride be 1,
902       * and that the argument be in the GRF.
903       *
904       * The hardware ignores source modifiers (negate and abs) on math
905       * instructions, so we also move to a temp to set those up.
906       */
907      src = dst[dst_chan];
908      brw_MOV(p, src, arg0[0]);
909   } else {
910      src = arg0[0];
911   }
912
913   /* Send two messages to perform all 16 operations:
914    */
915   brw_push_insn_state(p);
916   brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
917   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
918   brw_math(p,
919	    dst[dst_chan],
920	    function,
921	    2,
922	    src,
923	    BRW_MATH_DATA_VECTOR,
924	    BRW_MATH_PRECISION_FULL);
925
926   if (c->dispatch_width == 16) {
927      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
928      brw_math(p,
929	       offset(dst[dst_chan],1),
930	       function,
931	       3,
932	       sechalf(src),
933	       BRW_MATH_DATA_VECTOR,
934	       BRW_MATH_PRECISION_FULL);
935   }
936   brw_pop_insn_state(p);
937}
938
939
940void emit_math2(struct brw_wm_compile *c,
941		GLuint function,
942		const struct brw_reg *dst,
943		GLuint mask,
944		const struct brw_reg *arg0,
945		const struct brw_reg *arg1)
946{
947   struct brw_compile *p = &c->func;
948   struct intel_context *intel = &p->brw->intel;
949   int dst_chan = ffs(mask & WRITEMASK_XYZW) - 1;
950
951   if (!(mask & WRITEMASK_XYZW))
952      return; /* Do not emit dead code */
953
954   assert(is_power_of_two(mask & WRITEMASK_XYZW));
955
956   brw_push_insn_state(p);
957
958   /* math can only operate on up to a vec8 at a time, so in
959    * dispatch_width==16 we have to do the second half manually.
960    */
961   if (intel->gen >= 6) {
962      struct brw_reg src0 = arg0[0];
963      struct brw_reg src1 = arg1[0];
964      struct brw_reg temp_dst = dst[dst_chan];
965
966      if (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
967	 brw_MOV(p, temp_dst, src0);
968	 src0 = temp_dst;
969      }
970
971      if (arg1[0].hstride == BRW_HORIZONTAL_STRIDE_0) {
972	 /* This is a heinous hack to get a temporary register for use
973	  * in case both arg0 and arg1 are constants.  Why you're
974	  * doing exponentiation on constant values in the shader, we
975	  * don't know.
976	  *
977	  * max_wm_grf is almost surely less than the maximum GRF, and
978	  * gen6 doesn't care about the number of GRFs used in a
979	  * shader like pre-gen6 did.
980	  */
981	 struct brw_reg temp = brw_vec8_grf(c->max_wm_grf, 0);
982	 brw_MOV(p, temp, src1);
983	 src1 = temp;
984      }
985
986      brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
987      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
988      brw_math2(p,
989		temp_dst,
990		function,
991		src0,
992		src1);
993      if (c->dispatch_width == 16) {
994	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
995	 brw_math2(p,
996		   sechalf(temp_dst),
997		   function,
998		   sechalf(src0),
999		   sechalf(src1));
1000      }
1001   } else {
1002      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1003      brw_MOV(p, brw_message_reg(3), arg1[0]);
1004      if (c->dispatch_width == 16) {
1005	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1006	 brw_MOV(p, brw_message_reg(5), sechalf(arg1[0]));
1007      }
1008
1009      brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1010      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1011      brw_math(p,
1012	       dst[dst_chan],
1013	       function,
1014	       2,
1015	       arg0[0],
1016	       BRW_MATH_DATA_VECTOR,
1017	       BRW_MATH_PRECISION_FULL);
1018
1019      /* Send two messages to perform all 16 operations:
1020       */
1021      if (c->dispatch_width == 16) {
1022	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1023	 brw_math(p,
1024		  offset(dst[dst_chan],1),
1025		  function,
1026		  4,
1027		  sechalf(arg0[0]),
1028		  BRW_MATH_DATA_VECTOR,
1029		  BRW_MATH_PRECISION_FULL);
1030      }
1031   }
1032   brw_pop_insn_state(p);
1033}
1034
1035
1036void emit_tex(struct brw_wm_compile *c,
1037	      struct brw_reg *dst,
1038	      GLuint dst_flags,
1039	      struct brw_reg *arg,
1040	      struct brw_reg depth_payload,
1041	      GLuint tex_idx,
1042	      GLuint sampler,
1043	      bool shadow)
1044{
1045   struct brw_compile *p = &c->func;
1046   struct intel_context *intel = &p->brw->intel;
1047   struct brw_reg dst_retyped;
1048   GLuint cur_mrf = 2, response_length;
1049   GLuint i, nr_texcoords;
1050   GLuint emit;
1051   GLuint msg_type;
1052   GLuint mrf_per_channel;
1053   GLuint simd_mode;
1054
1055   if (c->dispatch_width == 16) {
1056      mrf_per_channel = 2;
1057      response_length = 8;
1058      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1059      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1060   } else {
1061      mrf_per_channel = 1;
1062      response_length = 4;
1063      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1064      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1065   }
1066
1067   /* How many input regs are there?
1068    */
1069   switch (tex_idx) {
1070   case TEXTURE_1D_INDEX:
1071      emit = WRITEMASK_X;
1072      nr_texcoords = 1;
1073      break;
1074   case TEXTURE_2D_INDEX:
1075   case TEXTURE_1D_ARRAY_INDEX:
1076   case TEXTURE_RECT_INDEX:
1077   case TEXTURE_EXTERNAL_INDEX:
1078      emit = WRITEMASK_XY;
1079      nr_texcoords = 2;
1080      break;
1081   case TEXTURE_3D_INDEX:
1082   case TEXTURE_2D_ARRAY_INDEX:
1083   case TEXTURE_CUBE_INDEX:
1084      emit = WRITEMASK_XYZ;
1085      nr_texcoords = 3;
1086      break;
1087   default:
1088      /* unexpected target */
1089      abort();
1090   }
1091
1092   /* Pre-Ironlake, the 8-wide sampler always took u,v,r. */
1093   if (intel->gen < 5 && c->dispatch_width == 8)
1094      nr_texcoords = 3;
1095
1096   if (shadow) {
1097      if (intel->gen < 7) {
1098	 /* For shadow comparisons, we have to supply u,v,r. */
1099	 nr_texcoords = 3;
1100      } else {
1101	 /* On Ivybridge, the shadow comparitor comes first. Just load it. */
1102	 brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1103	 cur_mrf += mrf_per_channel;
1104      }
1105   }
1106
1107   /* Emit the texcoords. */
1108   for (i = 0; i < nr_texcoords; i++) {
1109      if (c->key.tex.gl_clamp_mask[i] & (1 << sampler))
1110	 brw_set_saturate(p, true);
1111
1112      if (emit & (1<<i))
1113	 brw_MOV(p, brw_message_reg(cur_mrf), arg[i]);
1114      else
1115	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1116      cur_mrf += mrf_per_channel;
1117
1118      brw_set_saturate(p, false);
1119   }
1120
1121   /* Fill in the shadow comparison reference value. */
1122   if (shadow && intel->gen < 7) {
1123      if (intel->gen >= 5) {
1124	 /* Fill in the cube map array index value. */
1125	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1126	 cur_mrf += mrf_per_channel;
1127      } else if (c->dispatch_width == 8) {
1128	 /* Fill in the LOD bias value. */
1129	 brw_MOV(p, brw_message_reg(cur_mrf), brw_imm_f(0));
1130	 cur_mrf += mrf_per_channel;
1131      }
1132      brw_MOV(p, brw_message_reg(cur_mrf), arg[2]);
1133      cur_mrf += mrf_per_channel;
1134   }
1135
1136   if (intel->gen >= 5) {
1137      if (shadow)
1138	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1139      else
1140	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
1141   } else {
1142      /* Note that G45 and older determines shadow compare and dispatch width
1143       * from message length for most messages.
1144       */
1145      if (c->dispatch_width == 16 && shadow)
1146	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1147      else
1148	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1149   }
1150
1151   brw_SAMPLE(p,
1152	      dst_retyped,
1153	      1,
1154	      retype(depth_payload, BRW_REGISTER_TYPE_UW),
1155              SURF_INDEX_TEXTURE(sampler),
1156	      sampler,
1157	      dst_flags & WRITEMASK_XYZW,
1158	      msg_type,
1159	      response_length,
1160	      cur_mrf - 1,
1161	      1,
1162	      simd_mode,
1163	      BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
1164}
1165
1166
1167void emit_txb(struct brw_wm_compile *c,
1168	      struct brw_reg *dst,
1169	      GLuint dst_flags,
1170	      struct brw_reg *arg,
1171	      struct brw_reg depth_payload,
1172	      GLuint tex_idx,
1173	      GLuint sampler)
1174{
1175   struct brw_compile *p = &c->func;
1176   struct intel_context *intel = &p->brw->intel;
1177   GLuint msgLength;
1178   GLuint msg_type;
1179   GLuint mrf_per_channel;
1180   GLuint response_length;
1181   struct brw_reg dst_retyped;
1182
1183   /* The G45 and older chipsets don't support 8-wide dispatch for LOD biased
1184    * samples, so we'll use the 16-wide instruction, leave the second halves
1185    * undefined, and trust the execution mask to keep the undefined pixels
1186    * from mattering.
1187    */
1188   if (c->dispatch_width == 16 || intel->gen < 5) {
1189      if (intel->gen >= 5)
1190	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1191      else
1192	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1193      mrf_per_channel = 2;
1194      dst_retyped = retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW);
1195      response_length = 8;
1196   } else {
1197      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1198      mrf_per_channel = 1;
1199      dst_retyped = retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW);
1200      response_length = 4;
1201   }
1202
1203   /* Shadow ignored for txb. */
1204   switch (tex_idx) {
1205   case TEXTURE_1D_INDEX:
1206      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1207      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), brw_imm_f(0));
1208      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1209      break;
1210   case TEXTURE_2D_INDEX:
1211   case TEXTURE_RECT_INDEX:
1212   case TEXTURE_EXTERNAL_INDEX:
1213      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1214      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1215      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), brw_imm_f(0));
1216      break;
1217   case TEXTURE_3D_INDEX:
1218   case TEXTURE_CUBE_INDEX:
1219      brw_MOV(p, brw_message_reg(2 + 0 * mrf_per_channel), arg[0]);
1220      brw_MOV(p, brw_message_reg(2 + 1 * mrf_per_channel), arg[1]);
1221      brw_MOV(p, brw_message_reg(2 + 2 * mrf_per_channel), arg[2]);
1222      break;
1223   default:
1224      /* unexpected target */
1225      abort();
1226   }
1227
1228   brw_MOV(p, brw_message_reg(2 + 3 * mrf_per_channel), arg[3]);
1229   msgLength = 2 + 4 * mrf_per_channel - 1;
1230
1231   brw_SAMPLE(p,
1232	      dst_retyped,
1233	      1,
1234	      retype(depth_payload, BRW_REGISTER_TYPE_UW),
1235              SURF_INDEX_TEXTURE(sampler),
1236	      sampler,
1237	      dst_flags & WRITEMASK_XYZW,
1238	      msg_type,
1239	      response_length,
1240	      msgLength,
1241	      1,
1242	      BRW_SAMPLER_SIMD_MODE_SIMD16,
1243	      BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
1244}
1245
1246
1247static void emit_lit(struct brw_wm_compile *c,
1248		     const struct brw_reg *dst,
1249		     GLuint mask,
1250		     const struct brw_reg *arg0)
1251{
1252   struct brw_compile *p = &c->func;
1253
1254   assert((mask & WRITEMASK_XW) == 0);
1255
1256   if (mask & WRITEMASK_Y) {
1257      brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
1258      brw_MOV(p, dst[1], arg0[0]);
1259      brw_set_saturate(p, 0);
1260   }
1261
1262   if (mask & WRITEMASK_Z) {
1263      emit_math2(c, BRW_MATH_FUNCTION_POW,
1264		 &dst[2],
1265		 WRITEMASK_X | (mask & SATURATE),
1266		 &arg0[1],
1267		 &arg0[3]);
1268   }
1269
1270   /* Ordinarily you'd use an iff statement to skip or shortcircuit
1271    * some of the POW calculations above, but 16-wide iff statements
1272    * seem to lock c1 hardware, so this is a nasty workaround:
1273    */
1274   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_LE, arg0[0], brw_imm_f(0));
1275   {
1276      if (mask & WRITEMASK_Y)
1277	 brw_MOV(p, dst[1], brw_imm_f(0));
1278
1279      if (mask & WRITEMASK_Z)
1280	 brw_MOV(p, dst[2], brw_imm_f(0));
1281   }
1282   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1283}
1284
1285
1286/* Kill pixel - set execution mask to zero for those pixels which
1287 * fail.
1288 */
1289static void emit_kil( struct brw_wm_compile *c,
1290		      struct brw_reg *arg0)
1291{
1292   struct brw_compile *p = &c->func;
1293   struct intel_context *intel = &p->brw->intel;
1294   struct brw_reg pixelmask;
1295   GLuint i, j;
1296
1297   if (intel->gen >= 6)
1298      pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1299   else
1300      pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1301
1302   for (i = 0; i < 4; i++) {
1303      /* Check if we've already done the comparison for this reg
1304       * -- common when someone does KIL TEMP.wwww.
1305       */
1306      for (j = 0; j < i; j++) {
1307	 if (memcmp(&arg0[j], &arg0[i], sizeof(arg0[0])) == 0)
1308	    break;
1309      }
1310      if (j != i)
1311	 continue;
1312
1313      brw_push_insn_state(p);
1314      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));
1315      brw_set_predicate_control_flag_value(p, 0xff);
1316      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1317      brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
1318      brw_pop_insn_state(p);
1319   }
1320}
1321
1322static void fire_fb_write( struct brw_wm_compile *c,
1323			   GLuint base_reg,
1324			   GLuint nr,
1325			   GLuint target,
1326			   GLuint eot )
1327{
1328   struct brw_compile *p = &c->func;
1329   struct intel_context *intel = &p->brw->intel;
1330   uint32_t msg_control;
1331
1332   /* Pass through control information:
1333    *
1334    * Gen6 has done m1 mov in emit_fb_write() for current SIMD16 case.
1335    */
1336/*  mov (8) m1.0<1>:ud   r1.0<8;8,1>:ud   { Align1 NoMask } */
1337   if (intel->gen < 6)
1338   {
1339      brw_push_insn_state(p);
1340      brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */
1341      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1342      brw_MOV(p,
1343	       brw_message_reg(base_reg + 1),
1344	       brw_vec8_grf(1, 0));
1345      brw_pop_insn_state(p);
1346   }
1347
1348   if (c->dispatch_width == 16)
1349      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1350   else
1351      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1352
1353   /* Send framebuffer write message: */
1354/*  send (16) null.0<1>:uw m0               r0.0<8;8,1>:uw   0x85a04000:ud    { Align1 EOT } */
1355   brw_fb_WRITE(p,
1356		c->dispatch_width,
1357		base_reg,
1358		retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
1359		msg_control,
1360		target,
1361		nr,
1362		0,
1363		eot,
1364		true);
1365}
1366
1367
1368static void emit_aa( struct brw_wm_compile *c,
1369		     struct brw_reg *arg1,
1370		     GLuint reg )
1371{
1372   struct brw_compile *p = &c->func;
1373   GLuint comp = c->aa_dest_stencil_reg / 2;
1374   GLuint off = c->aa_dest_stencil_reg % 2;
1375   struct brw_reg aa = offset(arg1[comp], off);
1376
1377   brw_push_insn_state(p);
1378   brw_set_compression_control(p, BRW_COMPRESSION_NONE); /* ?? */
1379   brw_MOV(p, brw_message_reg(reg), aa);
1380   brw_pop_insn_state(p);
1381}
1382
1383
1384/* Post-fragment-program processing.  Send the results to the
1385 * framebuffer.
1386 * \param arg0  the fragment color
1387 * \param arg1  the pass-through depth value
1388 * \param arg2  the shader-computed depth value
1389 */
1390void emit_fb_write(struct brw_wm_compile *c,
1391		   struct brw_reg *arg0,
1392		   struct brw_reg *arg1,
1393		   struct brw_reg *arg2,
1394		   GLuint target,
1395		   GLuint eot)
1396{
1397   struct brw_compile *p = &c->func;
1398   struct brw_context *brw = p->brw;
1399   struct intel_context *intel = &brw->intel;
1400   GLuint nr = 2;
1401   GLuint channel;
1402
1403   /* Reserve a space for AA - may not be needed:
1404    */
1405   if (c->aa_dest_stencil_reg)
1406      nr += 1;
1407
1408   /* I don't really understand how this achieves the color interleave
1409    * (ie RGBARGBA) in the result:  [Do the saturation here]
1410    */
1411   brw_push_insn_state(p);
1412
1413   if (c->key.clamp_fragment_color)
1414      brw_set_saturate(p, 1);
1415
1416   for (channel = 0; channel < 4; channel++) {
1417      if (intel->gen >= 6) {
1418	 /* gen6 SIMD16 single source DP write looks like:
1419	  * m + 0: r0
1420	  * m + 1: r1
1421	  * m + 2: g0
1422	  * m + 3: g1
1423	  * m + 4: b0
1424	  * m + 5: b1
1425	  * m + 6: a0
1426	  * m + 7: a1
1427	  */
1428	 if (c->dispatch_width == 16) {
1429	    brw_MOV(p, brw_message_reg(nr + channel * 2), arg0[channel]);
1430	 } else {
1431	    brw_MOV(p, brw_message_reg(nr + channel), arg0[channel]);
1432	 }
1433      } else if (c->dispatch_width == 16 && brw->has_compr4) {
1434	 /* pre-gen6 SIMD16 single source DP write looks like:
1435	  * m + 0: r0
1436	  * m + 1: g0
1437	  * m + 2: b0
1438	  * m + 3: a0
1439	  * m + 4: r1
1440	  * m + 5: g1
1441	  * m + 6: b1
1442	  * m + 7: a1
1443	  *
1444	  * By setting the high bit of the MRF register number, we indicate
1445	  * that we want COMPR4 mode - instead of doing the usual destination
1446	  * + 1 for the second half we get destination + 4.
1447	  */
1448	 brw_MOV(p,
1449		 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
1450		 arg0[channel]);
1451      } else {
1452	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
1453	 /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
1454	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1455	 brw_MOV(p,
1456		 brw_message_reg(nr + channel),
1457		 arg0[channel]);
1458
1459	 if (c->dispatch_width == 16) {
1460	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1461	    brw_MOV(p,
1462		    brw_message_reg(nr + channel + 4),
1463		    sechalf(arg0[channel]));
1464	 }
1465      }
1466   }
1467
1468   brw_set_saturate(p, 0);
1469
1470   /* skip over the regs populated above:
1471    */
1472   if (c->dispatch_width == 16)
1473      nr += 8;
1474   else
1475      nr += 4;
1476
1477   brw_pop_insn_state(p);
1478
1479   if (c->source_depth_to_render_target)
1480   {
1481      if (c->computes_depth)
1482	 brw_MOV(p, brw_message_reg(nr), arg2[2]);
1483      else
1484	 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
1485
1486      nr += 2;
1487   }
1488
1489   if (c->dest_depth_reg)
1490   {
1491      GLuint comp = c->dest_depth_reg / 2;
1492      GLuint off = c->dest_depth_reg % 2;
1493
1494      if (off != 0) {
1495         brw_push_insn_state(p);
1496         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1497
1498         brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
1499         /* 2nd half? */
1500         brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
1501         brw_pop_insn_state(p);
1502      }
1503      else {
1504         brw_MOV(p, brw_message_reg(nr), arg1[comp]);
1505      }
1506      nr += 2;
1507   }
1508
1509   if (intel->gen >= 6) {
1510      /* Load the message header.  There's no implied move from src0
1511       * to the base mrf on gen6.
1512       */
1513      brw_push_insn_state(p);
1514      brw_set_mask_control(p, BRW_MASK_DISABLE);
1515      brw_MOV(p, retype(brw_message_reg(0), BRW_REGISTER_TYPE_UD),
1516	      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1517      brw_pop_insn_state(p);
1518
1519      if (target != 0) {
1520	 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1521					0,
1522					2), BRW_REGISTER_TYPE_UD),
1523		 brw_imm_ud(target));
1524      }
1525   }
1526
1527   if (!c->runtime_check_aads_emit) {
1528      if (c->aa_dest_stencil_reg)
1529	 emit_aa(c, arg1, 2);
1530
1531      fire_fb_write(c, 0, nr, target, eot);
1532   }
1533   else {
1534      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
1535      struct brw_reg ip = brw_ip_reg();
1536      int jmp;
1537
1538      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1539      brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
1540      brw_AND(p,
1541	      v1_null_ud,
1542	      get_element_ud(brw_vec8_grf(1,0), 6),
1543	      brw_imm_ud(1<<26));
1544
1545      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0)) - p->store;
1546      {
1547	 emit_aa(c, arg1, 2);
1548	 fire_fb_write(c, 0, nr, target, eot);
1549	 /* note - thread killed in subroutine */
1550      }
1551      brw_land_fwd_jump(p, jmp);
1552
1553      /* ELSE: Shuffle up one register to fill in the hole left for AA:
1554       */
1555      fire_fb_write(c, 1, nr-1, target, eot);
1556   }
1557}
1558
1559/**
1560 * Move a GPR to scratch memory.
1561 */
1562static void emit_spill( struct brw_wm_compile *c,
1563			struct brw_reg reg,
1564			GLuint slot )
1565{
1566   struct brw_compile *p = &c->func;
1567
1568   /*
1569     mov (16) m2.0<1>:ud   r2.0<8;8,1>:ud   { Align1 Compr }
1570   */
1571   brw_MOV(p, brw_message_reg(2), reg);
1572
1573   /*
1574     mov (1) r0.2<1>:d    0x00000080:d     { Align1 NoMask }
1575     send (16) null.0<1>:uw m1               r0.0<8;8,1>:uw   0x053003ff:ud    { Align1 }
1576   */
1577   brw_oword_block_write_scratch(p, brw_message_reg(1), 2, slot);
1578}
1579
1580
1581/**
1582 * Load a GPR from scratch memory.
1583 */
1584static void emit_unspill( struct brw_wm_compile *c,
1585			  struct brw_reg reg,
1586			  GLuint slot )
1587{
1588   struct brw_compile *p = &c->func;
1589
1590   /* Slot 0 is the undef value.
1591    */
1592   if (slot == 0) {
1593      brw_MOV(p, reg, brw_imm_f(0));
1594      return;
1595   }
1596
1597   /*
1598     mov (1) r0.2<1>:d    0x000000c0:d     { Align1 NoMask }
1599     send (16) r110.0<1>:uw m1               r0.0<8;8,1>:uw   0x041243ff:ud    { Align1 }
1600   */
1601
1602   brw_oword_block_read(p, vec16(reg), brw_message_reg(1), 2, slot);
1603}
1604
1605
1606/**
1607 * Retrieve up to 4 GEN4 register pairs for the given wm reg:
1608 * Args with unspill_reg != 0 will be loaded from scratch memory.
1609 */
1610static void get_argument_regs( struct brw_wm_compile *c,
1611			       struct brw_wm_ref *arg[],
1612			       struct brw_reg *regs )
1613{
1614   GLuint i;
1615
1616   for (i = 0; i < 4; i++) {
1617      if (arg[i]) {
1618	 if (arg[i]->unspill_reg)
1619	    emit_unspill(c,
1620			 brw_vec8_grf(arg[i]->unspill_reg, 0),
1621			 arg[i]->value->spill_slot);
1622
1623	 regs[i] = arg[i]->hw_reg;
1624      }
1625      else {
1626	 regs[i] = brw_null_reg();
1627      }
1628   }
1629}
1630
1631
1632/**
1633 * For values that have a spill_slot!=0, write those regs to scratch memory.
1634 */
1635static void spill_values( struct brw_wm_compile *c,
1636			  struct brw_wm_value *values,
1637			  GLuint nr )
1638{
1639   GLuint i;
1640
1641   for (i = 0; i < nr; i++)
1642      if (values[i].spill_slot)
1643	 emit_spill(c, values[i].hw_reg, values[i].spill_slot);
1644}
1645
1646
1647/* Emit the fragment program instructions here.
1648 */
1649void brw_wm_emit( struct brw_wm_compile *c )
1650{
1651   struct brw_compile *p = &c->func;
1652   struct intel_context *intel = &p->brw->intel;
1653   GLuint insn;
1654
1655   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1656   if (intel->gen >= 6)
1657	brw_set_acc_write_control(p, 1);
1658
1659   /* Check if any of the payload regs need to be spilled:
1660    */
1661   spill_values(c, c->payload.depth, 4);
1662   spill_values(c, c->creg, c->nr_creg);
1663   spill_values(c, c->payload.input_interp, FRAG_ATTRIB_MAX);
1664
1665
1666   for (insn = 0; insn < c->nr_insns; insn++) {
1667
1668      struct brw_wm_instruction *inst = &c->instruction[insn];
1669      struct brw_reg args[3][4], dst[4];
1670      GLuint i, dst_flags;
1671
1672      /* Get argument regs:
1673       */
1674      for (i = 0; i < 3; i++)
1675	 get_argument_regs(c, inst->src[i], args[i]);
1676
1677      /* Get dest regs:
1678       */
1679      for (i = 0; i < 4; i++)
1680	 if (inst->dst[i])
1681	    dst[i] = inst->dst[i]->hw_reg;
1682	 else
1683	    dst[i] = brw_null_reg();
1684
1685      /* Flags
1686       */
1687      dst_flags = inst->writemask;
1688      if (inst->saturate)
1689	 dst_flags |= SATURATE;
1690
1691      switch (inst->opcode) {
1692	 /* Generated instructions for calculating triangle interpolants:
1693	  */
1694      case WM_PIXELXY:
1695	 emit_pixel_xy(c, dst, dst_flags);
1696	 break;
1697
1698      case WM_DELTAXY:
1699	 emit_delta_xy(p, dst, dst_flags, args[0]);
1700	 break;
1701
1702      case WM_WPOSXY:
1703	 emit_wpos_xy(c, dst, dst_flags, args[0]);
1704	 break;
1705
1706      case WM_PIXELW:
1707	 emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
1708	 break;
1709
1710      case WM_LINTERP:
1711	 emit_linterp(p, dst, dst_flags, args[0], args[1]);
1712	 break;
1713
1714      case WM_PINTERP:
1715	 emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
1716	 break;
1717
1718      case WM_CINTERP:
1719	 emit_cinterp(p, dst, dst_flags, args[0]);
1720	 break;
1721
1722      case WM_FB_WRITE:
1723	 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
1724	 break;
1725
1726      case WM_FRONTFACING:
1727	 emit_frontfacing(p, dst, dst_flags);
1728	 break;
1729
1730	 /* Straightforward arithmetic:
1731	  */
1732      case OPCODE_ADD:
1733	 emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
1734	 break;
1735
1736      case OPCODE_FRC:
1737	 emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
1738	 break;
1739
1740      case OPCODE_FLR:
1741	 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
1742	 break;
1743
1744      case OPCODE_DDX:
1745	 emit_ddxy(p, dst, dst_flags, true, args[0], false);
1746	 break;
1747
1748      case OPCODE_DDY:
1749         /* Make sure fp->program.UsesDFdy flag got set (otherwise there's no
1750          * guarantee that c->key.render_to_fbo is set).
1751          */
1752         assert(c->fp->program.UsesDFdy);
1753	 emit_ddxy(p, dst, dst_flags, false, args[0], c->key.render_to_fbo);
1754	 break;
1755
1756      case OPCODE_DP2:
1757	 emit_dp2(p, dst, dst_flags, args[0], args[1]);
1758	 break;
1759
1760      case OPCODE_DP3:
1761	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
1762	 break;
1763
1764      case OPCODE_DP4:
1765	 emit_dp4(p, dst, dst_flags, args[0], args[1]);
1766	 break;
1767
1768      case OPCODE_DPH:
1769	 emit_dph(p, dst, dst_flags, args[0], args[1]);
1770	 break;
1771
1772      case OPCODE_TRUNC:
1773	 for (i = 0; i < 4; i++) {
1774	    if (dst_flags & (1<<i)) {
1775	       brw_RNDZ(p, dst[i], args[0][i]);
1776	    }
1777	 }
1778	 break;
1779
1780      case OPCODE_LRP:
1781	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
1782	 break;
1783
1784      case OPCODE_MAD:
1785	 emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
1786	 break;
1787
1788      case OPCODE_MOV:
1789      case OPCODE_SWZ:
1790	 emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
1791	 break;
1792
1793      case OPCODE_MUL:
1794	 emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
1795	 break;
1796
1797      case OPCODE_XPD:
1798	 emit_xpd(p, dst, dst_flags, args[0], args[1]);
1799	 break;
1800
1801	 /* Higher math functions:
1802	  */
1803      case OPCODE_RCP:
1804	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
1805	 break;
1806
1807      case OPCODE_RSQ:
1808	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
1809	 break;
1810
1811      case OPCODE_SIN:
1812	 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
1813	 break;
1814
1815      case OPCODE_COS:
1816	 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
1817	 break;
1818
1819      case OPCODE_EX2:
1820	 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
1821	 break;
1822
1823      case OPCODE_LG2:
1824	 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
1825	 break;
1826
1827      case OPCODE_SCS:
1828	 /* There is an scs math function, but it would need some
1829	  * fixup for 16-element execution.
1830	  */
1831	 if (dst_flags & WRITEMASK_X)
1832	    emit_math1(c, BRW_MATH_FUNCTION_COS, dst, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1833	 if (dst_flags & WRITEMASK_Y)
1834	    emit_math1(c, BRW_MATH_FUNCTION_SIN, dst+1, (dst_flags&SATURATE)|WRITEMASK_X, args[0]);
1835	 break;
1836
1837      case OPCODE_POW:
1838	 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, dst_flags, args[0], args[1]);
1839	 break;
1840
1841	 /* Comparisons:
1842	  */
1843      case OPCODE_CMP:
1844	 emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
1845	 break;
1846
1847      case OPCODE_MAX:
1848	 emit_max(p, dst, dst_flags, args[0], args[1]);
1849	 break;
1850
1851      case OPCODE_MIN:
1852	 emit_min(p, dst, dst_flags, args[0], args[1]);
1853	 break;
1854
1855      case OPCODE_SLT:
1856	 emit_slt(p, dst, dst_flags, args[0], args[1]);
1857	 break;
1858
1859      case OPCODE_SLE:
1860	 emit_sle(p, dst, dst_flags, args[0], args[1]);
1861	break;
1862      case OPCODE_SGT:
1863	 emit_sgt(p, dst, dst_flags, args[0], args[1]);
1864	break;
1865      case OPCODE_SGE:
1866	 emit_sge(p, dst, dst_flags, args[0], args[1]);
1867	 break;
1868      case OPCODE_SEQ:
1869	 emit_seq(p, dst, dst_flags, args[0], args[1]);
1870	break;
1871      case OPCODE_SNE:
1872	 emit_sne(p, dst, dst_flags, args[0], args[1]);
1873	break;
1874
1875      case OPCODE_SSG:
1876	 emit_sign(p, dst, dst_flags, args[0]);
1877	 break;
1878
1879      case OPCODE_LIT:
1880	 emit_lit(c, dst, dst_flags, args[0]);
1881	 break;
1882
1883	 /* Texturing operations:
1884	  */
1885      case OPCODE_TEX:
1886	 emit_tex(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1887		  inst->tex_idx, inst->tex_unit,
1888		  inst->tex_shadow);
1889	 break;
1890
1891      case OPCODE_TXB:
1892	 emit_txb(c, dst, dst_flags, args[0], c->payload.depth[0].hw_reg,
1893		  inst->tex_idx, inst->tex_unit);
1894	 break;
1895
1896      case OPCODE_KIL:
1897	 emit_kil(c, args[0]);
1898	 break;
1899
1900      default:
1901	 printf("Unsupported opcode %i (%s) in fragment shader\n",
1902		inst->opcode, inst->opcode < MAX_OPCODE ?
1903		_mesa_opcode_string(inst->opcode) :
1904		"unknown");
1905      }
1906
1907      for (i = 0; i < 4; i++)
1908	if (inst->dst[i] && inst->dst[i]->spill_slot)
1909	   emit_spill(c,
1910		      inst->dst[i]->hw_reg,
1911		      inst->dst[i]->spill_slot);
1912   }
1913
1914   /* Only properly tested on ILK */
1915   if (p->brw->intel.gen == 5) {
1916     brw_remove_duplicate_mrf_moves(p);
1917     if (c->dispatch_width == 16)
1918	brw_remove_grf_to_mrf_moves(p);
1919   }
1920
1921   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1922      int i;
1923
1924     printf("wm-native:\n");
1925     for (i = 0; i < p->nr_insn; i++)
1926	 brw_disasm(stdout, &p->store[i], p->brw->intel.gen);
1927      printf("\n");
1928   }
1929}
1930
1931