brw_eu_emit.c revision 34b11334d417fae65ebe9cf96980aea581e24893
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  GLuint msg_reg_nr)
65{
66   struct brw_context *brw = p->brw;
67   if (brw->gen < 6)
68      return;
69
70   if (src->file == BRW_MESSAGE_REGISTER_FILE)
71      return;
72
73   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74      brw_push_insn_state(p);
75      brw_set_mask_control(p, BRW_MASK_DISABLE);
76      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78	      retype(*src, BRW_REGISTER_TYPE_UD));
79      brw_pop_insn_state(p);
80   }
81   *src = brw_message_reg(msg_reg_nr);
82}
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88    * "The send with EOT should use register space R112-R127 for <src>. This is
89    *  to enable loading of a new thread into the same slot while the message
90    *  with EOT for current thread is pending dispatch."
91    *
92    * Since we're pretending to have 16 MRFs anyway, we may as well use the
93    * registers required for messages with EOT.
94    */
95   struct brw_context *brw = p->brw;
96   if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97      reg->file = BRW_GENERAL_REGISTER_FILE;
98      reg->nr += GEN7_MRF_HACK_START;
99   }
100}
101
102
/**
 * Encode the destination operand of \p insn.
 *
 * Handles all four addressing combinations (direct/indirect crossed with
 * align1/align16), then derives the execution size from the destination
 * width via guess_execution_size().
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* The dest_reg_nr field is 8 bits; GRF numbers must fit in 0..127.
    * ARF/MRF encodings are exempt from this check.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A horizontal stride of 0 is not valid for a destination;
	  * promote it to 1.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16: subreg is counted in 16-byte units and a writemask
	  * replaces the stride description.
	  */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.dw1.bits.writemask != 0);
         }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same stride-0 promotion as the direct-addressed case above. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
163
164extern int reg_type_size[];
165
166static void
167validate_reg(struct brw_instruction *insn, struct brw_reg reg)
168{
169   int hstride_for_reg[] = {0, 1, 2, 4};
170   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
171   int width_for_reg[] = {1, 2, 4, 8, 16};
172   int execsize_for_reg[] = {1, 2, 4, 8, 16};
173   int width, hstride, vstride, execsize;
174
175   if (reg.file == BRW_IMMEDIATE_VALUE) {
176      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
177       * mean the destination has to be 128-bit aligned and the
178       * destination horiz stride has to be a word.
179       */
180      if (reg.type == BRW_REGISTER_TYPE_V) {
181	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
182		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
183      }
184
185      return;
186   }
187
188   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
189       reg.file == BRW_ARF_NULL)
190      return;
191
192   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
193   hstride = hstride_for_reg[reg.hstride];
194
195   if (reg.vstride == 0xf) {
196      vstride = -1;
197   } else {
198      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
199      vstride = vstride_for_reg[reg.vstride];
200   }
201
202   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
203   width = width_for_reg[reg.width];
204
205   assert(insn->header.execution_size >= 0 &&
206	  insn->header.execution_size < Elements(execsize_for_reg));
207   execsize = execsize_for_reg[insn->header.execution_size];
208
209   /* Restrictions from 3.3.10: Register Region Restrictions. */
210   /* 3. */
211   assert(execsize >= width);
212
213   /* 4. */
214   if (execsize == width && hstride != 0) {
215      assert(vstride == -1 || vstride == width * hstride);
216   }
217
218   /* 5. */
219   if (execsize == width && hstride == 0) {
220      /* no restriction on vstride. */
221   }
222
223   /* 6. */
224   if (width == 1) {
225      assert(hstride == 0);
226   }
227
228   /* 7. */
229   if (execsize == 1 && width == 1) {
230      assert(hstride == 0);
231      assert(vstride == 0);
232   }
233
234   /* 8. */
235   if (vstride == 0 && hstride == 0) {
236      assert(width == 1);
237   }
238
239   /* 10. Check destination issues. */
240}
241
242void
243brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
244	     struct brw_reg reg)
245{
246   struct brw_context *brw = p->brw;
247
248   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
249      assert(reg.nr < 128);
250
251   gen7_convert_mrf_to_grf(p, &reg);
252
253   if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
254                           insn->header.opcode == BRW_OPCODE_SENDC)) {
255      /* Any source modifiers or regions will be ignored, since this just
256       * identifies the MRF/GRF to start reading the message contents from.
257       * Check for some likely failures.
258       */
259      assert(!reg.negate);
260      assert(!reg.abs);
261      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
262   }
263
264   validate_reg(insn, reg);
265
266   insn->bits1.da1.src0_reg_file = reg.file;
267   insn->bits1.da1.src0_reg_type = reg.type;
268   insn->bits2.da1.src0_abs = reg.abs;
269   insn->bits2.da1.src0_negate = reg.negate;
270   insn->bits2.da1.src0_address_mode = reg.address_mode;
271
272   if (reg.file == BRW_IMMEDIATE_VALUE) {
273      insn->bits3.ud = reg.dw1.ud;
274
275      /* Required to set some fields in src1 as well:
276       */
277      insn->bits1.da1.src1_reg_file = 0; /* arf */
278      insn->bits1.da1.src1_reg_type = reg.type;
279   }
280   else
281   {
282      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
283	 if (insn->header.access_mode == BRW_ALIGN_1) {
284	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
285	    insn->bits2.da1.src0_reg_nr = reg.nr;
286	 }
287	 else {
288	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
289	    insn->bits2.da16.src0_reg_nr = reg.nr;
290	 }
291      }
292      else {
293	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
294
295	 if (insn->header.access_mode == BRW_ALIGN_1) {
296	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
297	 }
298	 else {
299	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
300	 }
301      }
302
303      if (insn->header.access_mode == BRW_ALIGN_1) {
304	 if (reg.width == BRW_WIDTH_1 &&
305	     insn->header.execution_size == BRW_EXECUTE_1) {
306	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
307	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
308	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
309	 }
310	 else {
311	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
312	    insn->bits2.da1.src0_width = reg.width;
313	    insn->bits2.da1.src0_vert_stride = reg.vstride;
314	 }
315      }
316      else {
317	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
318	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
319	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
320	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
321
322	 /* This is an oddity of the fact we're using the same
323	  * descriptions for registers in align_16 as align_1:
324	  */
325	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
326	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
327	 else
328	    insn->bits2.da16.src0_vert_stride = reg.vstride;
329      }
330   }
331}
332
333
334void brw_set_src1(struct brw_compile *p,
335		  struct brw_instruction *insn,
336		  struct brw_reg reg)
337{
338   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
339
340   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
341      assert(reg.nr < 128);
342
343   gen7_convert_mrf_to_grf(p, &reg);
344
345   validate_reg(insn, reg);
346
347   insn->bits1.da1.src1_reg_file = reg.file;
348   insn->bits1.da1.src1_reg_type = reg.type;
349   insn->bits3.da1.src1_abs = reg.abs;
350   insn->bits3.da1.src1_negate = reg.negate;
351
352   /* Only src1 can be immediate in two-argument instructions.
353    */
354   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
355
356   if (reg.file == BRW_IMMEDIATE_VALUE) {
357      insn->bits3.ud = reg.dw1.ud;
358   }
359   else {
360      /* This is a hardware restriction, which may or may not be lifted
361       * in the future:
362       */
363      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
364      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
365
366      if (insn->header.access_mode == BRW_ALIGN_1) {
367	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
368	 insn->bits3.da1.src1_reg_nr = reg.nr;
369      }
370      else {
371	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
372	 insn->bits3.da16.src1_reg_nr = reg.nr;
373      }
374
375      if (insn->header.access_mode == BRW_ALIGN_1) {
376	 if (reg.width == BRW_WIDTH_1 &&
377	     insn->header.execution_size == BRW_EXECUTE_1) {
378	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
379	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
380	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
381	 }
382	 else {
383	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
384	    insn->bits3.da1.src1_width = reg.width;
385	    insn->bits3.da1.src1_vert_stride = reg.vstride;
386	 }
387      }
388      else {
389	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
390	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
391	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
392	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
393
394	 /* This is an oddity of the fact we're using the same
395	  * descriptions for registers in align_16 as align_1:
396	  */
397	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
398	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
399	 else
400	    insn->bits3.da16.src1_vert_stride = reg.vstride;
401      }
402   }
403}
404
405/**
406 * Set the Message Descriptor and Extended Message Descriptor fields
407 * for SEND messages.
408 *
409 * \note This zeroes out the Function Control bits, so it must be called
410 *       \b before filling out any message-specific data.  Callers can
411 *       choose not to fill in irrelevant bits; they will be zero.
412 */
413static void
414brw_set_message_descriptor(struct brw_compile *p,
415			   struct brw_instruction *inst,
416			   enum brw_message_target sfid,
417			   unsigned msg_length,
418			   unsigned response_length,
419			   bool header_present,
420			   bool end_of_thread)
421{
422   struct brw_context *brw = p->brw;
423
424   brw_set_src1(p, inst, brw_imm_d(0));
425
426   if (brw->gen >= 5) {
427      inst->bits3.generic_gen5.header_present = header_present;
428      inst->bits3.generic_gen5.response_length = response_length;
429      inst->bits3.generic_gen5.msg_length = msg_length;
430      inst->bits3.generic_gen5.end_of_thread = end_of_thread;
431
432      if (brw->gen >= 6) {
433	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
434	 inst->header.destreg__conditionalmod = sfid;
435      } else {
436	 /* Set Extended Message Descriptor (ex_desc) */
437	 inst->bits2.send_gen5.sfid = sfid;
438	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
439      }
440   } else {
441      inst->bits3.generic.response_length = response_length;
442      inst->bits3.generic.msg_length = msg_length;
443      inst->bits3.generic.msg_target = sfid;
444      inst->bits3.generic.end_of_thread = end_of_thread;
445   }
446}
447
448static void brw_set_math_message( struct brw_compile *p,
449				  struct brw_instruction *insn,
450				  GLuint function,
451				  GLuint integer_type,
452				  bool low_precision,
453				  GLuint dataType )
454{
455   struct brw_context *brw = p->brw;
456   unsigned msg_length;
457   unsigned response_length;
458
459   /* Infer message length from the function */
460   switch (function) {
461   case BRW_MATH_FUNCTION_POW:
462   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
463   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
464   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
465      msg_length = 2;
466      break;
467   default:
468      msg_length = 1;
469      break;
470   }
471
472   /* Infer response length from the function */
473   switch (function) {
474   case BRW_MATH_FUNCTION_SINCOS:
475   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
476      response_length = 2;
477      break;
478   default:
479      response_length = 1;
480      break;
481   }
482
483
484   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
485			      msg_length, response_length, false, false);
486   if (brw->gen == 5) {
487      insn->bits3.math_gen5.function = function;
488      insn->bits3.math_gen5.int_type = integer_type;
489      insn->bits3.math_gen5.precision = low_precision;
490      insn->bits3.math_gen5.saturate = insn->header.saturate;
491      insn->bits3.math_gen5.data_type = dataType;
492      insn->bits3.math_gen5.snapshot = 0;
493   } else {
494      insn->bits3.math.function = function;
495      insn->bits3.math.int_type = integer_type;
496      insn->bits3.math.precision = low_precision;
497      insn->bits3.math.saturate = insn->header.saturate;
498      insn->bits3.math.data_type = dataType;
499   }
500   insn->header.saturate = 0;
501}
502
503
/**
 * Fill in the message descriptor for an FF_SYNC URB message.
 *
 * Message length is always 1 (header only); most URB fields are unused
 * by FF_SYNC and are cleared explicitly.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    GLuint response_length,
				    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
519
/**
 * Fill in the message descriptor for a URB write, with the descriptor
 * layout chosen per hardware generation (Gen7, Gen5/6, pre-Gen5).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
                                 enum brw_urb_write_flags flags,
				 GLuint msg_length,
				 GLuint response_length,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      if (flags & BRW_URB_WRITE_OWORD) {
         assert(msg_length == 2); /* header + one OWORD of data */
         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
      } else {
         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
      }
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes have no transpose mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
         flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
562
/**
 * Fill in the message descriptor for a data-port write.
 *
 * The shared-function ID (SFID) is picked per generation: Gen7 routes
 * render-target writes to the render cache and everything else to the
 * data cache; Gen6 sends all writes to the render cache.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* Descriptor bit layout differs per generation. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
620
/**
 * Fill in the message descriptor for a data-port read.
 *
 * The SFID is picked per generation; on Gen6 the caller's target_cache
 * selects between the render and sampler caches.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
                        bool header_present,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   /* Descriptor bit layout differs per generation (and G4x). */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
677
/**
 * Fill in the message descriptor for a sampler message, using the
 * descriptor layout appropriate to the hardware generation.
 * return_format is only encoded on the original Gen4 layout.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
716
717
718#define next_insn brw_next_insn
719struct brw_instruction *
720brw_next_insn(struct brw_compile *p, GLuint opcode)
721{
722   struct brw_instruction *insn;
723
724   if (p->nr_insn + 1 > p->store_size) {
725      if (0)
726         printf("incresing the store size to %d\n", p->store_size << 1);
727      p->store_size <<= 1;
728      p->store = reralloc(p->mem_ctx, p->store,
729                          struct brw_instruction, p->store_size);
730      if (!p->store)
731         assert(!"realloc eu store memeory failed");
732   }
733
734   p->next_insn_offset += 16;
735   insn = &p->store[p->nr_insn++];
736   memcpy(insn, p->current, sizeof(*insn));
737
738   /* Reset this one-shot flag:
739    */
740
741   if (p->current->header.destreg__conditionalmod) {
742      p->current->header.destreg__conditionalmod = 0;
743      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
744   }
745
746   insn->header.opcode = opcode;
747   return insn;
748}
749
750static struct brw_instruction *brw_alu1( struct brw_compile *p,
751					 GLuint opcode,
752					 struct brw_reg dest,
753					 struct brw_reg src )
754{
755   struct brw_instruction *insn = next_insn(p, opcode);
756   brw_set_dest(p, insn, dest);
757   brw_set_src0(p, insn, src);
758   return insn;
759}
760
761static struct brw_instruction *brw_alu2(struct brw_compile *p,
762					GLuint opcode,
763					struct brw_reg dest,
764					struct brw_reg src0,
765					struct brw_reg src1 )
766{
767   struct brw_instruction *insn = next_insn(p, opcode);
768   brw_set_dest(p, insn, dest);
769   brw_set_src0(p, insn, src0);
770   brw_set_src1(p, insn, src1);
771   return insn;
772}
773
774static int
775get_3src_subreg_nr(struct brw_reg reg)
776{
777   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
778      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
779      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
780   } else {
781      return reg.subnr / 4;
782   }
783}
784
/* Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...).
 * 3-source instructions use a distinct, align16-only encoding: sources
 * must be GRFs, subregisters are in 4-byte units, and (on Gen7+) a single
 * shared type field covers all three sources.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					GLuint opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   /* dest_reg_file is a single bit here: 0 = GRF, 1 = MRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar source across all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   /* src1's subreg number straddles the bits2/bits3 dword boundary. */
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
868
869
870/***********************************************************************
871 * Convenience routines.
872 */
/* Emit a one-source ALU instruction: defines brw_<OP>() as a thin wrapper
 * around brw_alu1() with the matching BRW_OPCODE_<OP>.
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}
880
/* Emit a two-source ALU instruction: defines brw_<OP>() as a thin wrapper
 * around brw_alu2() with the matching BRW_OPCODE_<OP>.
 */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
889
/* Emit a three-source ALU instruction: defines brw_<OP>() as a thin wrapper
 * around brw_alu3() with the matching BRW_OPCODE_<OP>.
 */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
899
/* Like ALU3, but for three-source instructions that only operate on floats
 * (e.g. MAD, LRP): asserts that the destination and all three sources are
 * of type F before delegating to brw_alu3().
 */
#define ALU3F(OP)                                               \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
913
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
/* BRW_CONDITIONAL_R requests the round-increment bits; the fix-up ADD is
 * only emitted for gen < 6.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
938
939
/* Instantiate the public brw_<OP>() emitters declared in brw_eu.h. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)

/* Two-instruction rounding sequences (see ROUND above). */
ROUND(RNDZ)
ROUND(RNDE)
974
975
976struct brw_instruction *brw_ADD(struct brw_compile *p,
977				struct brw_reg dest,
978				struct brw_reg src0,
979				struct brw_reg src1)
980{
981   /* 6.2.2: add */
982   if (src0.type == BRW_REGISTER_TYPE_F ||
983       (src0.file == BRW_IMMEDIATE_VALUE &&
984	src0.type == BRW_REGISTER_TYPE_VF)) {
985      assert(src1.type != BRW_REGISTER_TYPE_UD);
986      assert(src1.type != BRW_REGISTER_TYPE_D);
987   }
988
989   if (src1.type == BRW_REGISTER_TYPE_F ||
990       (src1.file == BRW_IMMEDIATE_VALUE &&
991	src1.type == BRW_REGISTER_TYPE_VF)) {
992      assert(src0.type != BRW_REGISTER_TYPE_UD);
993      assert(src0.type != BRW_REGISTER_TYPE_D);
994   }
995
996   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
997}
998
999struct brw_instruction *brw_AVG(struct brw_compile *p,
1000                                struct brw_reg dest,
1001                                struct brw_reg src0,
1002                                struct brw_reg src1)
1003{
1004   assert(dest.type == src0.type);
1005   assert(src0.type == src1.type);
1006   switch (src0.type) {
1007   case BRW_REGISTER_TYPE_B:
1008   case BRW_REGISTER_TYPE_UB:
1009   case BRW_REGISTER_TYPE_W:
1010   case BRW_REGISTER_TYPE_UW:
1011   case BRW_REGISTER_TYPE_D:
1012   case BRW_REGISTER_TYPE_UD:
1013      break;
1014   default:
1015      assert(!"Bad type for brw_AVG");
1016   }
1017
1018   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1019}
1020
1021struct brw_instruction *brw_MUL(struct brw_compile *p,
1022				struct brw_reg dest,
1023				struct brw_reg src0,
1024				struct brw_reg src1)
1025{
1026   /* 6.32.38: mul */
1027   if (src0.type == BRW_REGISTER_TYPE_D ||
1028       src0.type == BRW_REGISTER_TYPE_UD ||
1029       src1.type == BRW_REGISTER_TYPE_D ||
1030       src1.type == BRW_REGISTER_TYPE_UD) {
1031      assert(dest.type != BRW_REGISTER_TYPE_F);
1032   }
1033
1034   if (src0.type == BRW_REGISTER_TYPE_F ||
1035       (src0.file == BRW_IMMEDIATE_VALUE &&
1036	src0.type == BRW_REGISTER_TYPE_VF)) {
1037      assert(src1.type != BRW_REGISTER_TYPE_UD);
1038      assert(src1.type != BRW_REGISTER_TYPE_D);
1039   }
1040
1041   if (src1.type == BRW_REGISTER_TYPE_F ||
1042       (src1.file == BRW_IMMEDIATE_VALUE &&
1043	src1.type == BRW_REGISTER_TYPE_VF)) {
1044      assert(src0.type != BRW_REGISTER_TYPE_UD);
1045      assert(src0.type != BRW_REGISTER_TYPE_D);
1046   }
1047
1048   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1049	  src0.nr != BRW_ARF_ACCUMULATOR);
1050   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1051	  src1.nr != BRW_ARF_ACCUMULATOR);
1052
1053   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1054}
1055
1056
1057void brw_NOP(struct brw_compile *p)
1058{
1059   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1060   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1061   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1062   brw_set_src1(p, insn, brw_imm_ud(0x0));
1063}
1064
1065
1066
1067
1068
1069/***********************************************************************
1070 * Comparisons, if/else/endif
1071 */
1072
1073struct brw_instruction *brw_JMPI(struct brw_compile *p,
1074                                 struct brw_reg dest,
1075                                 struct brw_reg src0,
1076                                 struct brw_reg src1)
1077{
1078   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1079
1080   insn->header.execution_size = 1;
1081   insn->header.compression_control = BRW_COMPRESSION_NONE;
1082   insn->header.mask_control = BRW_MASK_DISABLE;
1083
1084   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1085
1086   return insn;
1087}
1088
/**
 * Record an IF/ELSE instruction on the if-stack so brw_ENDIF can patch it
 * later.  The instruction is stored as an index into p->store because the
 * store may be reallocated (and thus move) by subsequent next_insn() calls.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow *after* the write: the invariant is that on entry there is always
    * room for at least one more push, re-established here for the next call.
    */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1101
1102static struct brw_instruction *
1103pop_if_stack(struct brw_compile *p)
1104{
1105   p->if_stack_depth--;
1106   return &p->store[p->if_stack[p->if_stack_depth]];
1107}
1108
1109static void
1110push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1111{
1112   if (p->loop_stack_array_size < p->loop_stack_depth) {
1113      p->loop_stack_array_size *= 2;
1114      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1115			       p->loop_stack_array_size);
1116      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1117				     p->loop_stack_array_size);
1118   }
1119
1120   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1121   p->loop_stack_depth++;
1122   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1123}
1124
1125static struct brw_instruction *
1126get_inner_do_insn(struct brw_compile *p)
1127{
1128   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1129}
1130
1131/* EU takes the value from the flag register and pushes it onto some
1132 * sort of a stack (presumably merging with any flag value already on
1133 * the stack).  Within an if block, the flags at the top of the stack
1134 * control execution on each channel of the unit, eg. on each of the
1135 * 16 pixel values in our wm programs.
1136 *
1137 * When the matching 'else' instruction is reached (presumably by
1138 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
1140 *
1141 * When the matching 'endif' instruction is reached, the flags are
1142 * popped off.  If the stack is now empty, normal execution resumes.
1143 */
/**
 * Emit an IF instruction and push it onto the if-stack; its branch targets
 * are filled in later by brw_ENDIF (via patch_IF_ELSE).
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Pre-gen6 IF operates on the IP register; the jump count is patched
       * in later.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6 encodes the branch offset in bits1.branch_gen6.jump_count;
       * zero it here, patch it in brw_ENDIF.
       */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+ uses JIP/UIP offsets in bits3; both patched in brw_ENDIF. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let the IF's predication leak onto following instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1184
1185/* This function is only used for gen6-style IF instructions with an
1186 * embedded comparison (conditional modifier).  It is not used on gen7.
1187 */
/**
 * Emit a gen6 IF with an embedded comparison: the IF itself compares src0
 * and src1 using the given conditional modifier, so it must be neither
 * predicated nor compressed.  The jump target is patched later by brw_ENDIF
 * (via patch_IF_ELSE).
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   /* Branch offset is filled in by patch_IF_ELSE. */
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1216
1217/**
1218 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1219 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* The ADD operates on the byte-based IP, so the instruction distance
       * is scaled by 16 (one native 128-bit instruction = 16 bytes).
       */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1256
1257/**
1258 * Patch IF and ELSE instructions with appropriate jump targets.
1259 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   /* 'br' scales instruction distances into the hardware's jump units. */
   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 /* Gen7+: with no ELSE, both JIP and UIP point to the ENDIF. */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (brw->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1346
/**
 * Emit an ELSE instruction and push it onto the if-stack; branch targets
 * are filled in later by brw_ENDIF (via patch_IF_ELSE).
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      /* Pre-gen6 ELSE operates on IP; jump count patched later. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7+: JIP/UIP offsets, patched later. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1379
/**
 * Close the innermost IF (and optional ELSE): pops them from the if-stack,
 * emits an ENDIF where required, and patches the branch targets of the
 * whole construct.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1458
/**
 * Emit a BREAK.  On pre-gen6, the jump count is patched later by
 * brw_patch_break_cont(), and pop_count pops every mask-stack entry pushed
 * by IFs opened inside the current loop.  On gen6+, the JIP/UIP offsets are
 * filled in later (see brw_set_uip_jip, referenced above
 * brw_patch_break_cont).
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1481
1482struct brw_instruction *gen6_CONT(struct brw_compile *p)
1483{
1484   struct brw_instruction *insn;
1485
1486   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1487   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1488   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1489   brw_set_dest(p, insn, brw_ip_reg());
1490   brw_set_src0(p, insn, brw_ip_reg());
1491   brw_set_src1(p, insn, brw_imm_d(0x0));
1492
1493   insn->header.compression_control = BRW_COMPRESSION_NONE;
1494   insn->header.execution_size = BRW_EXECUTE_8;
1495   return insn;
1496}
1497
1498struct brw_instruction *brw_CONT(struct brw_compile *p)
1499{
1500   struct brw_instruction *insn;
1501   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1502   brw_set_dest(p, insn, brw_ip_reg());
1503   brw_set_src0(p, insn, brw_ip_reg());
1504   brw_set_src1(p, insn, brw_imm_d(0x0));
1505   insn->header.compression_control = BRW_COMPRESSION_NONE;
1506   insn->header.execution_size = BRW_EXECUTE_8;
1507   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1508   insn->bits3.if_else.pad0 = 0;
1509   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1510   return insn;
1511}
1512
1513struct brw_instruction *gen6_HALT(struct brw_compile *p)
1514{
1515   struct brw_instruction *insn;
1516
1517   insn = next_insn(p, BRW_OPCODE_HALT);
1518   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1519   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1520   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1521
1522   if (p->compressed) {
1523      insn->header.execution_size = BRW_EXECUTE_16;
1524   } else {
1525      insn->header.compression_control = BRW_COMPRESSION_NONE;
1526      insn->header.execution_size = BRW_EXECUTE_8;
1527   }
1528   return insn;
1529}
1530
1531/* DO/WHILE loop:
1532 *
1533 * The DO/WHILE is just an unterminated loop -- break or continue are
1534 * used for control within the loop.  We have a few ways they can be
1535 * done.
1536 *
1537 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1538 * jip and no DO instruction.
1539 *
1540 * For non-uniform control flow pre-gen6, there's a DO instruction to
1541 * push the mask, and a WHILE to jump back, and BREAK to get out and
1542 * pop the mask.
1543 *
1544 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1545 * just points back to the first instruction of the loop.
1546 */
/**
 * Open a loop.  On gen6+ (and in single-program-flow mode) no DO
 * instruction exists; the position of the next instruction is pushed on the
 * loop stack instead.  Pre-gen6 emits an actual DO instruction.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1574
1575/**
1576 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1577 * instruction here.
1578 *
1579 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1580 * nesting, since it can always just point to the end of the block/current loop.
1581 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Gen5 counts jumps in 64-bit chunks (2 per instruction); earlier gens
    * count whole instructions.
    */
   int br = (brw->gen == 5) ? 2 : 1;

   /* Walk backwards over the loop body, from just before the WHILE to just
    * after the DO, filling in the forward jumps of unpatched BREAKs and
    * CONTINUEs.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1604
/**
 * Close the innermost loop with a WHILE (or, in pre-gen6 single-program-flow
 * mode, an IP-modifying ADD), pointing it back at the loop start and
 * patching any BREAK/CONT inside (pre-gen6 only).
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   /* Jump scale: gen5+ counts 64-bit chunks (2 per instruction). */
   GLuint br = 1;

   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      /* Gen7+: backward JIP offset to the top of the loop body. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      /* Gen6: backward jump count in bits1.branch_gen6. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF: a scalar ADD on IP jumps back to the loop start
	  * (16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 /* Pre-gen6: WHILE jumps just past the DO and pops nothing;
	  * BREAK/CONT inside the loop are patched to point here.
	  */
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   /* Don't let predication leak onto following instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1668
1669
1670/* FORWARD JUMPS:
1671 */
1672void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1673{
1674   struct brw_context *brw = p->brw;
1675   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1676   GLuint jmpi = 1;
1677
1678   if (brw->gen >= 5)
1679      jmpi = 2;
1680
1681   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1682   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1683
1684   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1685}
1686
1687
1688
1689/* To integrate with the above, it makes sense that the comparison
1690 * instruction should populate the flag register.  It might be simpler
1691 * just to use the flag reg for most WM tasks?
1692 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   /* NOTE(review): dest.nr == 0 here presumably matches the null ARF
    * register — compare the explicit BRW_ARF_NULL check below.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (brw->gen == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         insn->header.thread_control = BRW_THREAD_SWITCH;
      }
   }
}
1734
1735/* Issue 'wait' instruction for n1, host could program MMIO
1736   to wake up thread. */
1737void brw_WAIT (struct brw_compile *p)
1738{
1739   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1740   struct brw_reg src = brw_notification_1_reg();
1741
1742   brw_set_dest(p, insn, src);
1743   brw_set_src0(p, insn, src);
1744   brw_set_src1(p, insn, brw_null_reg());
1745   insn->header.execution_size = 0; /* must */
1746   insn->header.predicate_control = 0;
1747   insn->header.compression_control = 0;
1748}
1749
1750
1751/***********************************************************************
1752 * Helpers for the various SEND message types:
1753 */
1754
/** Extended math function, float[8].
 *
 * On gen6+ this emits a native MATH instruction; on earlier gens it
 * emits a SEND to the extended math shared function.  \p function
 * selects the operation; \p msg_reg_nr, \p data_type and \p precision
 * are only used on the pre-gen6 message path.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      /* Math operands must be GRF (or MRF on gen7+); the destination
       * needs unit horizontal stride, and on gen6 the source does too.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer divide variants take integer sources; all other math
       * functions take float sources.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1819
/** Extended math function, float[8].
 *
 * Two-source variant of the extended math instruction.  Unlike
 * brw_math(), there is no pre-gen6 SEND path in this function: it
 * always emits the native MATH opcode.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   /* Operands must be GRF (or MRF on gen7+ for the destination);
    * gen6 additionally requires unit horizontal stride on the sources.
    */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (brw->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* Integer divide variants take integer sources; all other math
    * functions take float sources.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (brw->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1869
1870
1871/**
1872 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1873 * using a constant offset per channel.
1874 *
1875 * The offset must be aligned to oword size (16 bytes).  Used for
1876 * register spilling.
1877 */
1878void brw_oword_block_write_scratch(struct brw_compile *p,
1879				   struct brw_reg mrf,
1880				   int num_regs,
1881				   GLuint offset)
1882{
1883   struct brw_context *brw = p->brw;
1884   uint32_t msg_control, msg_type;
1885   int mlen;
1886
1887   if (brw->gen >= 6)
1888      offset /= 16;
1889
1890   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1891
1892   if (num_regs == 1) {
1893      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1894      mlen = 2;
1895   } else {
1896      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1897      mlen = 3;
1898   }
1899
1900   /* Set up the message header.  This is g0, with g0.2 filled with
1901    * the offset.  We don't want to leave our offset around in g0 or
1902    * it'll screw up texture samples, so set it up inside the message
1903    * reg.
1904    */
1905   {
1906      brw_push_insn_state(p);
1907      brw_set_mask_control(p, BRW_MASK_DISABLE);
1908      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1909
1910      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1911
1912      /* set message header global offset field (reg 0, element 2) */
1913      brw_MOV(p,
1914	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1915				  mrf.nr,
1916				  2), BRW_REGISTER_TYPE_UD),
1917	      brw_imm_ud(offset));
1918
1919      brw_pop_insn_state(p);
1920   }
1921
1922   {
1923      struct brw_reg dest;
1924      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1925      int send_commit_msg;
1926      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1927					 BRW_REGISTER_TYPE_UW);
1928
1929      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1930	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1931	 src_header = vec16(src_header);
1932      }
1933      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1934      insn->header.destreg__conditionalmod = mrf.nr;
1935
1936      /* Until gen6, writes followed by reads from the same location
1937       * are not guaranteed to be ordered unless write_commit is set.
1938       * If set, then a no-op write is issued to the destination
1939       * register to set a dependency, and a read from the destination
1940       * can be used to ensure the ordering.
1941       *
1942       * For gen6, only writes between different threads need ordering
1943       * protection.  Our use of DP writes is all about register
1944       * spilling within a thread.
1945       */
1946      if (brw->gen >= 6) {
1947	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1948	 send_commit_msg = 0;
1949      } else {
1950	 dest = src_header;
1951	 send_commit_msg = 1;
1952      }
1953
1954      brw_set_dest(p, insn, dest);
1955      if (brw->gen >= 6) {
1956	 brw_set_src0(p, insn, mrf);
1957      } else {
1958	 brw_set_src0(p, insn, brw_null_reg());
1959      }
1960
1961      if (brw->gen >= 6)
1962	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1963      else
1964	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1965
1966      brw_set_dp_write_message(p,
1967			       insn,
1968			       255, /* binding table index (255=stateless) */
1969			       msg_control,
1970			       msg_type,
1971			       mlen,
1972			       true, /* header_present */
1973			       0, /* not a render target */
1974			       send_commit_msg, /* response_length */
1975			       0, /* eot */
1976			       send_commit_msg);
1977   }
1978}
1979
1980
1981/**
1982 * Read a block of owords (half a GRF each) from the scratch buffer
1983 * using a constant index per channel.
1984 *
1985 * Offset must be aligned to oword size (16 bytes).  Used for register
1986 * spilling.
1987 */
1988void
1989brw_oword_block_read_scratch(struct brw_compile *p,
1990			     struct brw_reg dest,
1991			     struct brw_reg mrf,
1992			     int num_regs,
1993			     GLuint offset)
1994{
1995   struct brw_context *brw = p->brw;
1996   uint32_t msg_control;
1997   int rlen;
1998
1999   if (brw->gen >= 6)
2000      offset /= 16;
2001
2002   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2003   dest = retype(dest, BRW_REGISTER_TYPE_UW);
2004
2005   if (num_regs == 1) {
2006      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2007      rlen = 1;
2008   } else {
2009      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2010      rlen = 2;
2011   }
2012
2013   {
2014      brw_push_insn_state(p);
2015      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2016      brw_set_mask_control(p, BRW_MASK_DISABLE);
2017
2018      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2019
2020      /* set message header global offset field (reg 0, element 2) */
2021      brw_MOV(p,
2022	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2023				  mrf.nr,
2024				  2), BRW_REGISTER_TYPE_UD),
2025	      brw_imm_ud(offset));
2026
2027      brw_pop_insn_state(p);
2028   }
2029
2030   {
2031      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2032
2033      assert(insn->header.predicate_control == 0);
2034      insn->header.compression_control = BRW_COMPRESSION_NONE;
2035      insn->header.destreg__conditionalmod = mrf.nr;
2036
2037      brw_set_dest(p, insn, dest);	/* UW? */
2038      if (brw->gen >= 6) {
2039	 brw_set_src0(p, insn, mrf);
2040      } else {
2041	 brw_set_src0(p, insn, brw_null_reg());
2042      }
2043
2044      brw_set_dp_read_message(p,
2045			      insn,
2046			      255, /* binding table index (255=stateless) */
2047			      msg_control,
2048			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2049			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2050			      1, /* msg_length */
2051                              true, /* header_present */
2052			      rlen);
2053   }
2054}
2055
2056/**
2057 * Read a float[4] vector from the data port Data Cache (const buffer).
2058 * Location (in buffer) should be a multiple of 16.
2059 * Used for fetching shader constants.
2060 */
2061void brw_oword_block_read(struct brw_compile *p,
2062			  struct brw_reg dest,
2063			  struct brw_reg mrf,
2064			  uint32_t offset,
2065			  uint32_t bind_table_index)
2066{
2067   struct brw_context *brw = p->brw;
2068
2069   /* On newer hardware, offset is in units of owords. */
2070   if (brw->gen >= 6)
2071      offset /= 16;
2072
2073   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2074
2075   brw_push_insn_state(p);
2076   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2077   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2078   brw_set_mask_control(p, BRW_MASK_DISABLE);
2079
2080   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2081
2082   /* set message header global offset field (reg 0, element 2) */
2083   brw_MOV(p,
2084	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2085			       mrf.nr,
2086			       2), BRW_REGISTER_TYPE_UD),
2087	   brw_imm_ud(offset));
2088
2089   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2090   insn->header.destreg__conditionalmod = mrf.nr;
2091
2092   /* cast dest to a uword[8] vector */
2093   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2094
2095   brw_set_dest(p, insn, dest);
2096   if (brw->gen >= 6) {
2097      brw_set_src0(p, insn, mrf);
2098   } else {
2099      brw_set_src0(p, insn, brw_null_reg());
2100   }
2101
2102   brw_set_dp_read_message(p,
2103			   insn,
2104			   bind_table_index,
2105			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2106			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2107			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2108			   1, /* msg_length */
2109                           true, /* header_present */
2110			   1); /* response_length (1 reg, 2 owords!) */
2111
2112   brw_pop_insn_state(p);
2113}
2114
2115
/**
 * Render target write message.
 *
 * Gen6+ uses the SENDC opcode with a headerless payload taken directly
 * from \p msg_reg_nr; earlier gens use SEND and carry the message
 * register number in the destreg field.
 *
 * \param dispatch_width 8 or 16; selects the width of the (null) dest
 * \param eot            true when this is the thread's last RT write
 */
void brw_fb_WRITE(struct brw_compile *p,
		  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint msg_control,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  bool eot,
                  bool header_present)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;
   GLuint msg_type;
   struct brw_reg dest;

   /* The write has no real destination; use a null reg of the matching
    * execution width.
    */
   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (brw->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (brw->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    msg_length,
			    header_present,
			    eot, /* last render target write */
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2171
2172
2173/**
2174 * Texture sample instruction.
2175 * Note: the msg_type plus msg_length values determine exactly what kind
2176 * of sampling operation is performed.  See volume 4, page 161 of docs.
2177 */
2178void brw_SAMPLE(struct brw_compile *p,
2179		struct brw_reg dest,
2180		GLuint msg_reg_nr,
2181		struct brw_reg src0,
2182		GLuint binding_table_index,
2183		GLuint sampler,
2184		GLuint msg_type,
2185		GLuint response_length,
2186		GLuint msg_length,
2187		GLuint header_present,
2188		GLuint simd_mode,
2189		GLuint return_format)
2190{
2191   struct brw_context *brw = p->brw;
2192   struct brw_instruction *insn;
2193
2194   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2195
2196   insn = next_insn(p, BRW_OPCODE_SEND);
2197   insn->header.predicate_control = 0; /* XXX */
2198   insn->header.compression_control = BRW_COMPRESSION_NONE;
2199   if (brw->gen < 6)
2200      insn->header.destreg__conditionalmod = msg_reg_nr;
2201
2202   brw_set_dest(p, insn, dest);
2203   brw_set_src0(p, insn, src0);
2204   brw_set_sampler_message(p, insn,
2205                           binding_table_index,
2206                           sampler,
2207                           msg_type,
2208                           response_length,
2209                           msg_length,
2210                           header_present,
2211                           simd_mode,
2212                           return_format);
2213}
2214
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 *
 * Emits a URB write SEND.  \p src0 may be rewritten by the implied-move
 * fixup; \p flags, \p offset and \p swizzle are packed into the message
 * descriptor by brw_set_urb_message().
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   GLuint msg_length,
		   GLuint response_length,
		   GLuint offset,
		   GLuint swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      /* OR 0xff00 into m<msg_reg_nr>.5, starting from g0.5. */
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register number lives in the destreg field. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2265
2266static int
2267next_ip(struct brw_compile *p, int ip)
2268{
2269   struct brw_instruction *insn = (void *)p->store + ip;
2270
2271   if (insn->header.cmpt_control)
2272      return ip + 8;
2273   else
2274      return ip + 16;
2275}
2276
2277static int
2278brw_find_next_block_end(struct brw_compile *p, int start)
2279{
2280   int ip;
2281   void *store = p->store;
2282
2283   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2284      struct brw_instruction *insn = store + ip;
2285
2286      switch (insn->header.opcode) {
2287      case BRW_OPCODE_ENDIF:
2288      case BRW_OPCODE_ELSE:
2289      case BRW_OPCODE_WHILE:
2290      case BRW_OPCODE_HALT:
2291	 return ip;
2292      }
2293   }
2294
2295   return 0;
2296}
2297
/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 *
 * Returns the byte offset of the WHILE instruction that closes the loop
 * containing \p start.  Jump offsets are treated here as counts of
 * 8-byte units (see scale).
 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct brw_context *brw = p->brw;
   int ip;
   int scale = 8;
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
	 /* Gen6 keeps the backward jump in jump_count; gen7+ uses JIP. */
	 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
				   : insn->bits3.break_cont.jip;
	 /* A WHILE that jumps back to (or before) start closes our loop. */
	 if (ip + jip * scale <= start)
	    return ip;
      }
   }
   assert(!"not reached");
   return start;
}
2326
2327/* After program generation, go back and update the UIP and JIP of
2328 * BREAK, CONT, and HALT instructions to their correct locations.
2329 */
2330void
2331brw_set_uip_jip(struct brw_compile *p)
2332{
2333   struct brw_context *brw = p->brw;
2334   int ip;
2335   int scale = 8;
2336   void *store = p->store;
2337
2338   if (brw->gen < 6)
2339      return;
2340
2341   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2342      struct brw_instruction *insn = store + ip;
2343
2344      if (insn->header.cmpt_control) {
2345	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2346	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2347		insn->header.opcode != BRW_OPCODE_CONTINUE &&
2348		insn->header.opcode != BRW_OPCODE_HALT);
2349	 continue;
2350      }
2351
2352      int block_end_ip = brw_find_next_block_end(p, ip);
2353      switch (insn->header.opcode) {
2354      case BRW_OPCODE_BREAK:
2355         assert(block_end_ip != 0);
2356	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2357	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2358	 insn->bits3.break_cont.uip =
2359	    (brw_find_loop_end(p, ip) - ip +
2360             (brw->gen == 6 ? 16 : 0)) / scale;
2361	 break;
2362      case BRW_OPCODE_CONTINUE:
2363         assert(block_end_ip != 0);
2364	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2365	 insn->bits3.break_cont.uip =
2366            (brw_find_loop_end(p, ip) - ip) / scale;
2367
2368	 assert(insn->bits3.break_cont.uip != 0);
2369	 assert(insn->bits3.break_cont.jip != 0);
2370	 break;
2371
2372      case BRW_OPCODE_ENDIF:
2373         if (block_end_ip == 0)
2374            insn->bits3.break_cont.jip = 2;
2375         else
2376            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2377	 break;
2378
2379      case BRW_OPCODE_HALT:
2380	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2381	  *
2382	  *    "In case of the halt instruction not inside any conditional
2383	  *     code block, the value of <JIP> and <UIP> should be the
2384	  *     same. In case of the halt instruction inside conditional code
2385	  *     block, the <UIP> should be the end of the program, and the
2386	  *     <JIP> should be end of the most inner conditional code block."
2387	  *
2388	  * The uip will have already been set by whoever set up the
2389	  * instruction.
2390	  */
2391	 if (block_end_ip == 0) {
2392	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2393	 } else {
2394	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2395	 }
2396	 assert(insn->bits3.break_cont.uip != 0);
2397	 assert(insn->bits3.break_cont.jip != 0);
2398	 break;
2399      }
2400   }
2401}
2402
2403void brw_ff_sync(struct brw_compile *p,
2404		   struct brw_reg dest,
2405		   GLuint msg_reg_nr,
2406		   struct brw_reg src0,
2407		   bool allocate,
2408		   GLuint response_length,
2409		   bool eot)
2410{
2411   struct brw_context *brw = p->brw;
2412   struct brw_instruction *insn;
2413
2414   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2415
2416   insn = next_insn(p, BRW_OPCODE_SEND);
2417   brw_set_dest(p, insn, dest);
2418   brw_set_src0(p, insn, src0);
2419   brw_set_src1(p, insn, brw_imm_d(0));
2420
2421   if (brw->gen < 6)
2422      insn->header.destreg__conditionalmod = msg_reg_nr;
2423
2424   brw_set_ff_sync_message(p,
2425			   insn,
2426			   allocate,
2427			   response_length,
2428			   eot);
2429}
2430
2431/**
2432 * Emit the SEND instruction necessary to generate stream output data on Gen6
2433 * (for transform feedback).
2434 *
2435 * If send_commit_msg is true, this is the last piece of stream output data
2436 * from this thread, so send the data as a committed write.  According to the
2437 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2438 *
2439 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2440 *   writes are complete by sending the final write as a committed write."
2441 */
2442void
2443brw_svb_write(struct brw_compile *p,
2444              struct brw_reg dest,
2445              GLuint msg_reg_nr,
2446              struct brw_reg src0,
2447              GLuint binding_table_index,
2448              bool   send_commit_msg)
2449{
2450   struct brw_instruction *insn;
2451
2452   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2453
2454   insn = next_insn(p, BRW_OPCODE_SEND);
2455   brw_set_dest(p, insn, dest);
2456   brw_set_src0(p, insn, src0);
2457   brw_set_src1(p, insn, brw_imm_d(0));
2458   brw_set_dp_write_message(p, insn,
2459                            binding_table_index,
2460                            0, /* msg_control: ignored */
2461                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2462                            1, /* msg_length */
2463                            true, /* header_present */
2464                            0, /* last_render_target: ignored */
2465                            send_commit_msg, /* response_length */
2466                            0, /* end_of_thread */
2467                            send_commit_msg); /* send_commit_msg */
2468}
2469
2470/**
2471 * This instruction is generated as a single-channel align1 instruction by
2472 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2473 *
2474 * We can't use the typed atomic op in the FS because that has the execution
2475 * mask ANDed with the pixel mask, but we just want to write the one dword for
2476 * all the pixels.
2477 *
2478 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2479 * one u32.  So we use the same untyped atomic write message as the pixel
2480 * shader.
2481 *
2482 * The untyped atomic operation requires a BUFFER surface type with RAW
2483 * format, and is only accessible through the legacy DATA_CACHE dataport
2484 * messages.
2485 */
2486void brw_shader_time_add(struct brw_compile *p,
2487                         struct brw_reg payload,
2488                         uint32_t surf_index)
2489{
2490   struct brw_context *brw = p->brw;
2491   assert(brw->gen >= 7);
2492
2493   brw_push_insn_state(p);
2494   brw_set_access_mode(p, BRW_ALIGN_1);
2495   brw_set_mask_control(p, BRW_MASK_DISABLE);
2496   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2497   brw_pop_insn_state(p);
2498
2499   /* We use brw_vec1_reg and unmasked because we want to increment the given
2500    * offset only once.
2501    */
2502   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2503                                      BRW_ARF_NULL, 0));
2504   brw_set_src0(p, send, brw_vec1_reg(payload.file,
2505                                      payload.nr, 0));
2506
2507   uint32_t sfid, msg_type;
2508   if (brw->is_haswell) {
2509      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2510      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2511   } else {
2512      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
2513      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2514   }
2515
2516   bool header_present = false;
2517   bool eot = false;
2518   uint32_t mlen = 2; /* offset, value */
2519   uint32_t rlen = 0;
2520   brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);
2521
2522   send->bits3.ud |= msg_type << 14;
2523   send->bits3.ud |= 0 << 13; /* no return data */
2524   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2525   send->bits3.ud |= BRW_AOP_ADD << 8;
2526   send->bits3.ud |= surf_index << 0;
2527}
2528