brw_eu_emit.c revision eaa63cbbc2f5ae415fc89ef6fd74c5b26ad622fd
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  GLuint msg_reg_nr)
65{
66   struct brw_context *brw = p->brw;
67   if (brw->gen < 6)
68      return;
69
70   if (src->file == BRW_MESSAGE_REGISTER_FILE)
71      return;
72
73   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74      brw_push_insn_state(p);
75      brw_set_mask_control(p, BRW_MASK_DISABLE);
76      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78	      retype(*src, BRW_REGISTER_TYPE_UD));
79      brw_pop_insn_state(p);
80   }
81   *src = brw_message_reg(msg_reg_nr);
82}
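/* A minimal usage sketch (not code from this file; register numbers are
 * illustrative): callers that are about to emit a SEND resolve the implied
 * move first, so that on Gen6+ the source already names the message register.
 *
 *    struct brw_reg src = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &src, 1);   // src is now m1 on Gen6+
 *    brw_set_dest(p, insn, dest);
 *    brw_set_src0(p, insn, src);
 *    // ...then one of the brw_set_*_message() helpers fills the descriptor.
 */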
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88    * "The send with EOT should use register space R112-R127 for <src>. This is
89    *  to enable loading of a new thread into the same slot while the message
90    *  with EOT for current thread is pending dispatch."
91    *
92    * Since we're pretending to have 16 MRFs anyway, we may as well use the
93    * registers required for messages with EOT.
94    */
95   struct brw_context *brw = p->brw;
96   if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97      reg->file = BRW_GENERAL_REGISTER_FILE;
98      reg->nr += GEN7_MRF_HACK_START;
99   }
100}
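/* Sketch of the effect on Gen7 (the register number is illustrative): a
 * message register is rewritten in place as a GRF.
 *
 *    struct brw_reg reg = brw_message_reg(4);      // m4
 *    gen7_convert_mrf_to_grf(p, &reg);
 *    // reg.file == BRW_GENERAL_REGISTER_FILE
 *    // reg.nr   == GEN7_MRF_HACK_START + 4
 *
 * On earlier generations the register is left untouched.
 */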
101
102
103void
104brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
105	     struct brw_reg dest)
106{
107   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
108       dest.file != BRW_MESSAGE_REGISTER_FILE)
109      assert(dest.nr < 128);
110
111   gen7_convert_mrf_to_grf(p, &dest);
112
113   insn->bits1.da1.dest_reg_file = dest.file;
114   insn->bits1.da1.dest_reg_type = dest.type;
115   insn->bits1.da1.dest_address_mode = dest.address_mode;
116
117   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
118      insn->bits1.da1.dest_reg_nr = dest.nr;
119
120      if (insn->header.access_mode == BRW_ALIGN_1) {
121	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
122	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
123	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
124	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
125      }
126      else {
127	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
128	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
129	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
130	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
131	  *    this to be programmed as "01".
132	  */
133	 insn->bits1.da16.dest_horiz_stride = 1;
134      }
135   }
136   else {
137      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
138
139      /* These are different sizes in align1 vs align16:
140       */
141      if (insn->header.access_mode == BRW_ALIGN_1) {
142	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
143	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
144	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
145	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
146      }
147      else {
148	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
149	 /* Even though this is ignored in Align16 mode, HW still needs it programmed as '01'. */
150	 insn->bits1.ia16.dest_horiz_stride = 1;
151      }
152   }
153
154   /* NEW: Set the execution size based on dest.width and
155    * insn->compression_control:
156    */
157   guess_execution_size(p, insn, dest);
158}
159
160extern int reg_type_size[];
161
162static void
163validate_reg(struct brw_instruction *insn, struct brw_reg reg)
164{
165   int hstride_for_reg[] = {0, 1, 2, 4};
166   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
167   int width_for_reg[] = {1, 2, 4, 8, 16};
168   int execsize_for_reg[] = {1, 2, 4, 8, 16};
169   int width, hstride, vstride, execsize;
170
171   if (reg.file == BRW_IMMEDIATE_VALUE) {
172      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
173       * mean the destination has to be 128-bit aligned and the
174       * destination horiz stride has to be a word.
175       */
176      if (reg.type == BRW_REGISTER_TYPE_V) {
177	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
178		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
179      }
180
181      return;
182   }
183
184   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
185       reg.nr == BRW_ARF_NULL)
186      return;
187
188   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
189   hstride = hstride_for_reg[reg.hstride];
190
191   if (reg.vstride == 0xf) {
192      vstride = -1;
193   } else {
194      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
195      vstride = vstride_for_reg[reg.vstride];
196   }
197
198   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
199   width = width_for_reg[reg.width];
200
201   assert(insn->header.execution_size >= 0 &&
202	  insn->header.execution_size < Elements(execsize_for_reg));
203   execsize = execsize_for_reg[insn->header.execution_size];
204
205   /* Restrictions from 3.3.10: Register Region Restrictions. */
206   /* 3. */
207   assert(execsize >= width);
208
209   /* 4. */
210   if (execsize == width && hstride != 0) {
211      assert(vstride == -1 || vstride == width * hstride);
212   }
213
214   /* 5. */
215   if (execsize == width && hstride == 0) {
216      /* no restriction on vstride. */
217   }
218
219   /* 6. */
220   if (width == 1) {
221      assert(hstride == 0);
222   }
223
224   /* 7. */
225   if (execsize == 1 && width == 1) {
226      assert(hstride == 0);
227      assert(vstride == 0);
228   }
229
230   /* 8. */
231   if (vstride == 0 && hstride == 0) {
232      assert(width == 1);
233   }
234
235   /* 10. Check destination issues. */
236}
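/* Worked example of the region checks above (the regions are illustrative):
 * an <8;8,1> source in an execsize-8 instruction decodes to
 *
 *    vstride = 8, width = 8, hstride = 1, execsize = 8
 *
 * which satisfies rule 3 (execsize >= width) and rule 4
 * (execsize == width and hstride != 0, so vstride == width * hstride).
 * A <0;1,0> scalar instead exercises rules 6-8: width == 1 requires
 * hstride == 0, and vstride == 0 with hstride == 0 requires width == 1.
 */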
237
238void
239brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
240	     struct brw_reg reg)
241{
242   struct brw_context *brw = p->brw;
243
244   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
245      assert(reg.nr < 128);
246
247   gen7_convert_mrf_to_grf(p, &reg);
248
249   if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
250                           insn->header.opcode == BRW_OPCODE_SENDC)) {
251      /* Any source modifiers or regions will be ignored, since this just
252       * identifies the MRF/GRF to start reading the message contents from.
253       * Check for some likely failures.
254       */
255      assert(!reg.negate);
256      assert(!reg.abs);
257      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
258   }
259
260   validate_reg(insn, reg);
261
262   insn->bits1.da1.src0_reg_file = reg.file;
263   insn->bits1.da1.src0_reg_type = reg.type;
264   insn->bits2.da1.src0_abs = reg.abs;
265   insn->bits2.da1.src0_negate = reg.negate;
266   insn->bits2.da1.src0_address_mode = reg.address_mode;
267
268   if (reg.file == BRW_IMMEDIATE_VALUE) {
269      insn->bits3.ud = reg.dw1.ud;
270
271      /* Required to set some fields in src1 as well:
272       */
273      insn->bits1.da1.src1_reg_file = 0; /* arf */
274      insn->bits1.da1.src1_reg_type = reg.type;
275   }
276   else
277   {
278      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
279	 if (insn->header.access_mode == BRW_ALIGN_1) {
280	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
281	    insn->bits2.da1.src0_reg_nr = reg.nr;
282	 }
283	 else {
284	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
285	    insn->bits2.da16.src0_reg_nr = reg.nr;
286	 }
287      }
288      else {
289	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
290
291	 if (insn->header.access_mode == BRW_ALIGN_1) {
292	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
293	 }
294	 else {
295	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
296	 }
297      }
298
299      if (insn->header.access_mode == BRW_ALIGN_1) {
300	 if (reg.width == BRW_WIDTH_1 &&
301	     insn->header.execution_size == BRW_EXECUTE_1) {
302	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
303	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
304	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
305	 }
306	 else {
307	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
308	    insn->bits2.da1.src0_width = reg.width;
309	    insn->bits2.da1.src0_vert_stride = reg.vstride;
310	 }
311      }
312      else {
313	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
314	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
315	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
316	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
317
318	 /* This is an oddity of the fact that we're using the same
319	  * register descriptions in align_16 as in align_1:
320	  */
321	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
322	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
323	 else
324	    insn->bits2.da16.src0_vert_stride = reg.vstride;
325      }
326   }
327}
328
329
330void brw_set_src1(struct brw_compile *p,
331		  struct brw_instruction *insn,
332		  struct brw_reg reg)
333{
334   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
335
336   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
337      assert(reg.nr < 128);
338
339   gen7_convert_mrf_to_grf(p, &reg);
340
341   validate_reg(insn, reg);
342
343   insn->bits1.da1.src1_reg_file = reg.file;
344   insn->bits1.da1.src1_reg_type = reg.type;
345   insn->bits3.da1.src1_abs = reg.abs;
346   insn->bits3.da1.src1_negate = reg.negate;
347
348   /* Only src1 can be immediate in two-argument instructions.
349    */
350   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
351
352   if (reg.file == BRW_IMMEDIATE_VALUE) {
353      insn->bits3.ud = reg.dw1.ud;
354   }
355   else {
356      /* This is a hardware restriction, which may or may not be lifted
357       * in the future:
358       */
359      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
360      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
361
362      if (insn->header.access_mode == BRW_ALIGN_1) {
363	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
364	 insn->bits3.da1.src1_reg_nr = reg.nr;
365      }
366      else {
367	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
368	 insn->bits3.da16.src1_reg_nr = reg.nr;
369      }
370
371      if (insn->header.access_mode == BRW_ALIGN_1) {
372	 if (reg.width == BRW_WIDTH_1 &&
373	     insn->header.execution_size == BRW_EXECUTE_1) {
374	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
375	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
376	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
377	 }
378	 else {
379	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
380	    insn->bits3.da1.src1_width = reg.width;
381	    insn->bits3.da1.src1_vert_stride = reg.vstride;
382	 }
383      }
384      else {
385	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
386	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
387	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
388	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
389
390	 /* This is an oddity of the fact that we're using the same
391	  * register descriptions in align_16 as in align_1:
392	  */
393	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
394	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
395	 else
396	    insn->bits3.da16.src1_vert_stride = reg.vstride;
397      }
398   }
399}
400
401/**
402 * Set the Message Descriptor and Extended Message Descriptor fields
403 * for SEND messages.
404 *
405 * \note This zeroes out the Function Control bits, so it must be called
406 *       \b before filling out any message-specific data.  Callers can
407 *       choose not to fill in irrelevant bits; they will be zero.
408 */
409static void
410brw_set_message_descriptor(struct brw_compile *p,
411			   struct brw_instruction *inst,
412			   enum brw_message_target sfid,
413			   unsigned msg_length,
414			   unsigned response_length,
415			   bool header_present,
416			   bool end_of_thread)
417{
418   struct brw_context *brw = p->brw;
419
420   brw_set_src1(p, inst, brw_imm_d(0));
421
422   if (brw->gen >= 5) {
423      inst->bits3.generic_gen5.header_present = header_present;
424      inst->bits3.generic_gen5.response_length = response_length;
425      inst->bits3.generic_gen5.msg_length = msg_length;
426      inst->bits3.generic_gen5.end_of_thread = end_of_thread;
427
428      if (brw->gen >= 6) {
429	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
430	 inst->header.destreg__conditionalmod = sfid;
431      } else {
432	 /* Set Extended Message Descriptor (ex_desc) */
433	 inst->bits2.send_gen5.sfid = sfid;
434	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
435      }
436   } else {
437      inst->bits3.generic.response_length = response_length;
438      inst->bits3.generic.msg_length = msg_length;
439      inst->bits3.generic.msg_target = sfid;
440      inst->bits3.generic.end_of_thread = end_of_thread;
441   }
442}
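/* A minimal sketch of the intended call order (see the note above): the
 * descriptor is written first, because brw_set_src1(p, inst, brw_imm_d(0))
 * zeroes bits3 and would wipe out any function control bits set earlier.
 *
 *    brw_set_message_descriptor(p, insn, BRW_SFID_URB,
 *                               msg_length, response_length,
 *                               true,    // header present
 *                               false);  // not end of thread
 *    insn->bits3.urb_gen5.opcode = 0;    // message-specific bits afterwards
 */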
443
444static void brw_set_math_message( struct brw_compile *p,
445				  struct brw_instruction *insn,
446				  GLuint function,
447				  GLuint integer_type,
448				  bool low_precision,
449				  GLuint dataType )
450{
451   struct brw_context *brw = p->brw;
452   unsigned msg_length;
453   unsigned response_length;
454
455   /* Infer message length from the function */
456   switch (function) {
457   case BRW_MATH_FUNCTION_POW:
458   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
459   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
460   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
461      msg_length = 2;
462      break;
463   default:
464      msg_length = 1;
465      break;
466   }
467
468   /* Infer response length from the function */
469   switch (function) {
470   case BRW_MATH_FUNCTION_SINCOS:
471   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
472      response_length = 2;
473      break;
474   default:
475      response_length = 1;
476      break;
477   }
478
479
480   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
481			      msg_length, response_length, false, false);
482   if (brw->gen == 5) {
483      insn->bits3.math_gen5.function = function;
484      insn->bits3.math_gen5.int_type = integer_type;
485      insn->bits3.math_gen5.precision = low_precision;
486      insn->bits3.math_gen5.saturate = insn->header.saturate;
487      insn->bits3.math_gen5.data_type = dataType;
488      insn->bits3.math_gen5.snapshot = 0;
489   } else {
490      insn->bits3.math.function = function;
491      insn->bits3.math.int_type = integer_type;
492      insn->bits3.math.precision = low_precision;
493      insn->bits3.math.saturate = insn->header.saturate;
494      insn->bits3.math.data_type = dataType;
495   }
496   insn->header.saturate = 0;
497}
498
499
500static void brw_set_ff_sync_message(struct brw_compile *p,
501				    struct brw_instruction *insn,
502				    bool allocate,
503				    GLuint response_length,
504				    bool end_of_thread)
505{
506   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
507			      1, response_length, true, end_of_thread);
508   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
509   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
510   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
511   insn->bits3.urb_gen5.allocate = allocate;
512   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
513   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
514}
515
516static void brw_set_urb_message( struct brw_compile *p,
517				 struct brw_instruction *insn,
518                                 unsigned flags,
519				 GLuint msg_length,
520				 GLuint response_length,
521				 GLuint offset,
522				 GLuint swizzle_control )
523{
524   struct brw_context *brw = p->brw;
525
526   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
527			      msg_length, response_length, true,
528                              flags & BRW_URB_WRITE_EOT);
529   if (brw->gen == 7) {
530      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
531      insn->bits3.urb_gen7.offset = offset;
532      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
533      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
534      insn->bits3.urb_gen7.per_slot_offset =
535         flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
536      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
537   } else if (brw->gen >= 5) {
538      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
539      insn->bits3.urb_gen5.offset = offset;
540      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
541      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
542      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
543      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
544   } else {
545      insn->bits3.urb.opcode = 0;	/* ? */
546      insn->bits3.urb.offset = offset;
547      insn->bits3.urb.swizzle_control = swizzle_control;
548      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
549      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
550      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
551   }
552}
553
554void
555brw_set_dp_write_message(struct brw_compile *p,
556			 struct brw_instruction *insn,
557			 GLuint binding_table_index,
558			 GLuint msg_control,
559			 GLuint msg_type,
560			 GLuint msg_length,
561			 bool header_present,
562			 GLuint last_render_target,
563			 GLuint response_length,
564			 GLuint end_of_thread,
565			 GLuint send_commit_msg)
566{
567   struct brw_context *brw = p->brw;
568   unsigned sfid;
569
570   if (brw->gen >= 7) {
571      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
572      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
573	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
574      else
575	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
576   } else if (brw->gen == 6) {
577      /* Use the render cache for all write messages. */
578      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
579   } else {
580      sfid = BRW_SFID_DATAPORT_WRITE;
581   }
582
583   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
584			      header_present, end_of_thread);
585
586   if (brw->gen >= 7) {
587      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
588      insn->bits3.gen7_dp.msg_control = msg_control;
589      insn->bits3.gen7_dp.last_render_target = last_render_target;
590      insn->bits3.gen7_dp.msg_type = msg_type;
591   } else if (brw->gen == 6) {
592      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
593      insn->bits3.gen6_dp.msg_control = msg_control;
594      insn->bits3.gen6_dp.last_render_target = last_render_target;
595      insn->bits3.gen6_dp.msg_type = msg_type;
596      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
597   } else if (brw->gen == 5) {
598      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
599      insn->bits3.dp_write_gen5.msg_control = msg_control;
600      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
601      insn->bits3.dp_write_gen5.msg_type = msg_type;
602      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
603   } else {
604      insn->bits3.dp_write.binding_table_index = binding_table_index;
605      insn->bits3.dp_write.msg_control = msg_control;
606      insn->bits3.dp_write.last_render_target = last_render_target;
607      insn->bits3.dp_write.msg_type = msg_type;
608      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
609   }
610}
611
612void
613brw_set_dp_read_message(struct brw_compile *p,
614			struct brw_instruction *insn,
615			GLuint binding_table_index,
616			GLuint msg_control,
617			GLuint msg_type,
618			GLuint target_cache,
619			GLuint msg_length,
620                        bool header_present,
621			GLuint response_length)
622{
623   struct brw_context *brw = p->brw;
624   unsigned sfid;
625
626   if (brw->gen >= 7) {
627      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
628   } else if (brw->gen == 6) {
629      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
630	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
631      else
632	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
633   } else {
634      sfid = BRW_SFID_DATAPORT_READ;
635   }
636
637   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
638			      header_present, false);
639
640   if (brw->gen >= 7) {
641      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
642      insn->bits3.gen7_dp.msg_control = msg_control;
643      insn->bits3.gen7_dp.last_render_target = 0;
644      insn->bits3.gen7_dp.msg_type = msg_type;
645   } else if (brw->gen == 6) {
646      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
647      insn->bits3.gen6_dp.msg_control = msg_control;
648      insn->bits3.gen6_dp.last_render_target = 0;
649      insn->bits3.gen6_dp.msg_type = msg_type;
650      insn->bits3.gen6_dp.send_commit_msg = 0;
651   } else if (brw->gen == 5) {
652      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
653      insn->bits3.dp_read_gen5.msg_control = msg_control;
654      insn->bits3.dp_read_gen5.msg_type = msg_type;
655      insn->bits3.dp_read_gen5.target_cache = target_cache;
656   } else if (brw->is_g4x) {
657      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
658      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
659      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
660      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
661   } else {
662      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
663      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
664      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
665      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
666   }
667}
668
669void
670brw_set_sampler_message(struct brw_compile *p,
671                        struct brw_instruction *insn,
672                        GLuint binding_table_index,
673                        GLuint sampler,
674                        GLuint msg_type,
675                        GLuint response_length,
676                        GLuint msg_length,
677                        GLuint header_present,
678                        GLuint simd_mode,
679                        GLuint return_format)
680{
681   struct brw_context *brw = p->brw;
682
683   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
684			      response_length, header_present, false);
685
686   if (brw->gen >= 7) {
687      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
688      insn->bits3.sampler_gen7.sampler = sampler;
689      insn->bits3.sampler_gen7.msg_type = msg_type;
690      insn->bits3.sampler_gen7.simd_mode = simd_mode;
691   } else if (brw->gen >= 5) {
692      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
693      insn->bits3.sampler_gen5.sampler = sampler;
694      insn->bits3.sampler_gen5.msg_type = msg_type;
695      insn->bits3.sampler_gen5.simd_mode = simd_mode;
696   } else if (brw->is_g4x) {
697      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
698      insn->bits3.sampler_g4x.sampler = sampler;
699      insn->bits3.sampler_g4x.msg_type = msg_type;
700   } else {
701      insn->bits3.sampler.binding_table_index = binding_table_index;
702      insn->bits3.sampler.sampler = sampler;
703      insn->bits3.sampler.msg_type = msg_type;
704      insn->bits3.sampler.return_format = return_format;
705   }
706}
707
708
709#define next_insn brw_next_insn
710struct brw_instruction *
711brw_next_insn(struct brw_compile *p, GLuint opcode)
712{
713   struct brw_instruction *insn;
714
715   if (p->nr_insn + 1 > p->store_size) {
716      if (0)
717         printf("increasing the store size to %d\n", p->store_size << 1);
718      p->store_size <<= 1;
719      p->store = reralloc(p->mem_ctx, p->store,
720                          struct brw_instruction, p->store_size);
721      if (!p->store)
722         assert(!"realloc eu store memory failed");
723   }
724
725   p->next_insn_offset += 16;
726   insn = &p->store[p->nr_insn++];
727   memcpy(insn, p->current, sizeof(*insn));
728
729   /* Reset this one-shot flag:
730    */
731
732   if (p->current->header.destreg__conditionalmod) {
733      p->current->header.destreg__conditionalmod = 0;
734      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
735   }
736
737   insn->header.opcode = opcode;
738   return insn;
739}
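/* Each new instruction starts as a copy of p->current, so state configured
 * through the brw_set_*_control() helpers (predication, compression, mask
 * control, ...) applies to every instruction emitted afterwards until it is
 * changed again.  Sketch:
 *
 *    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 *    brw_MOV(p, dst, src);   // this MOV is emitted uncompressed
 */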
740
741static struct brw_instruction *brw_alu1( struct brw_compile *p,
742					 GLuint opcode,
743					 struct brw_reg dest,
744					 struct brw_reg src )
745{
746   struct brw_instruction *insn = next_insn(p, opcode);
747   brw_set_dest(p, insn, dest);
748   brw_set_src0(p, insn, src);
749   return insn;
750}
751
752static struct brw_instruction *brw_alu2(struct brw_compile *p,
753					GLuint opcode,
754					struct brw_reg dest,
755					struct brw_reg src0,
756					struct brw_reg src1 )
757{
758   struct brw_instruction *insn = next_insn(p, opcode);
759   brw_set_dest(p, insn, dest);
760   brw_set_src0(p, insn, src0);
761   brw_set_src1(p, insn, src1);
762   return insn;
763}
764
765static int
766get_3src_subreg_nr(struct brw_reg reg)
767{
768   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
769      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
770      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
771   } else {
772      return reg.subnr / 4;
773   }
774}
775
776static struct brw_instruction *brw_alu3(struct brw_compile *p,
777					GLuint opcode,
778					struct brw_reg dest,
779					struct brw_reg src0,
780					struct brw_reg src1,
781					struct brw_reg src2)
782{
783   struct brw_context *brw = p->brw;
784   struct brw_instruction *insn = next_insn(p, opcode);
785
786   gen7_convert_mrf_to_grf(p, &dest);
787
788   assert(insn->header.access_mode == BRW_ALIGN_16);
789
790   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
791	  dest.file == BRW_MESSAGE_REGISTER_FILE);
792   assert(dest.nr < 128);
793   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
794   assert(dest.type == BRW_REGISTER_TYPE_F ||
795          dest.type == BRW_REGISTER_TYPE_D ||
796          dest.type == BRW_REGISTER_TYPE_UD);
797   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
798   insn->bits1.da3src.dest_reg_nr = dest.nr;
799   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
800   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
801   guess_execution_size(p, insn, dest);
802
803   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
804   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
805   assert(src0.nr < 128);
806   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
807   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
808   insn->bits2.da3src.src0_reg_nr = src0.nr;
809   insn->bits1.da3src.src0_abs = src0.abs;
810   insn->bits1.da3src.src0_negate = src0.negate;
811   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
812
813   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
814   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
815   assert(src1.nr < 128);
816   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
817   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
818   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
819   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
820   insn->bits3.da3src.src1_reg_nr = src1.nr;
821   insn->bits1.da3src.src1_abs = src1.abs;
822   insn->bits1.da3src.src1_negate = src1.negate;
823
824   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
825   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
826   assert(src2.nr < 128);
827   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
828   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
829   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
830   insn->bits3.da3src.src2_reg_nr = src2.nr;
831   insn->bits1.da3src.src2_abs = src2.abs;
832   insn->bits1.da3src.src2_negate = src2.negate;
833
834   if (brw->gen >= 7) {
835      /* Set both the source and destination types based on dest.type,
836       * ignoring the source register types.  The MAD and LRP emitters ensure
837       * that all four types are float.  The BFE and BFI2 emitters, however,
838       * may send us mixed D and UD types and want us to ignore that and use
839       * the destination type.
840       */
841      switch (dest.type) {
842      case BRW_REGISTER_TYPE_F:
843         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
844         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
845         break;
846      case BRW_REGISTER_TYPE_D:
847         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
848         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
849         break;
850      case BRW_REGISTER_TYPE_UD:
851         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
852         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
853         break;
854      }
855   }
856
857   return insn;
858}
859
860
861/***********************************************************************
862 * Convenience routines.
863 */
864#define ALU1(OP)					\
865struct brw_instruction *brw_##OP(struct brw_compile *p,	\
866	      struct brw_reg dest,			\
867	      struct brw_reg src0)   			\
868{							\
869   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
870}
871
872#define ALU2(OP)					\
873struct brw_instruction *brw_##OP(struct brw_compile *p,	\
874	      struct brw_reg dest,			\
875	      struct brw_reg src0,			\
876	      struct brw_reg src1)   			\
877{							\
878   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
879}
880
881#define ALU3(OP)					\
882struct brw_instruction *brw_##OP(struct brw_compile *p,	\
883	      struct brw_reg dest,			\
884	      struct brw_reg src0,			\
885	      struct brw_reg src1,			\
886	      struct brw_reg src2)   			\
887{							\
888   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
889}
890
891#define ALU3F(OP)                                               \
892struct brw_instruction *brw_##OP(struct brw_compile *p,         \
893                                 struct brw_reg dest,           \
894                                 struct brw_reg src0,           \
895                                 struct brw_reg src1,           \
896                                 struct brw_reg src2)           \
897{                                                               \
898   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
899   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
900   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
901   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
902   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
903}
904
905/* Rounding operations (other than RNDD) require two instructions - the first
906 * stores a rounded value (possibly the wrong way) in the dest register, but
907 * also sets a per-channel "increment bit" in the flag register.  A predicated
908 * add of 1.0 fixes dest to contain the desired result.
909 *
910 * Sandybridge and later appear to round correctly without an ADD.
911 */
912#define ROUND(OP)							      \
913void brw_##OP(struct brw_compile *p,					      \
914	      struct brw_reg dest,					      \
915	      struct brw_reg src)					      \
916{									      \
917   struct brw_instruction *rnd, *add;					      \
918   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
919   brw_set_dest(p, rnd, dest);						      \
920   brw_set_src0(p, rnd, src);						      \
921									      \
922   if (p->brw->gen < 6) {						      \
923      /* turn on round-increments */					      \
924      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
925      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
926      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
927   }									      \
928}
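/* For example, brw_RNDZ(p, dest, src) on pre-gen6 hardware expands to
 * roughly this two-instruction sequence (sketch only):
 *
 *    rndz(8)       dest  src          { writes the round-increment flag }
 *    (+f0) add(8)  dest  dest  1.0F   { predicated fix-up }
 *
 * while on Sandybridge and later only the first instruction is emitted.
 */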
929
930
931ALU1(MOV)
932ALU2(SEL)
933ALU1(NOT)
934ALU2(AND)
935ALU2(OR)
936ALU2(XOR)
937ALU2(SHR)
938ALU2(SHL)
939ALU2(RSR)
940ALU2(RSL)
941ALU2(ASR)
942ALU1(F32TO16)
943ALU1(F16TO32)
944ALU1(FRC)
945ALU1(RNDD)
946ALU2(MAC)
947ALU2(MACH)
948ALU1(LZD)
949ALU2(DP4)
950ALU2(DPH)
951ALU2(DP3)
952ALU2(DP2)
953ALU2(LINE)
954ALU2(PLN)
955ALU3F(MAD)
956ALU3F(LRP)
957ALU1(BFREV)
958ALU3(BFE)
959ALU2(BFI1)
960ALU3(BFI2)
961ALU1(FBH)
962ALU1(FBL)
963ALU1(CBIT)
964
965ROUND(RNDZ)
966ROUND(RNDE)
967
968
969struct brw_instruction *brw_ADD(struct brw_compile *p,
970				struct brw_reg dest,
971				struct brw_reg src0,
972				struct brw_reg src1)
973{
974   /* 6.2.2: add */
975   if (src0.type == BRW_REGISTER_TYPE_F ||
976       (src0.file == BRW_IMMEDIATE_VALUE &&
977	src0.type == BRW_REGISTER_TYPE_VF)) {
978      assert(src1.type != BRW_REGISTER_TYPE_UD);
979      assert(src1.type != BRW_REGISTER_TYPE_D);
980   }
981
982   if (src1.type == BRW_REGISTER_TYPE_F ||
983       (src1.file == BRW_IMMEDIATE_VALUE &&
984	src1.type == BRW_REGISTER_TYPE_VF)) {
985      assert(src0.type != BRW_REGISTER_TYPE_UD);
986      assert(src0.type != BRW_REGISTER_TYPE_D);
987   }
988
989   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
990}
991
992struct brw_instruction *brw_AVG(struct brw_compile *p,
993                                struct brw_reg dest,
994                                struct brw_reg src0,
995                                struct brw_reg src1)
996{
997   assert(dest.type == src0.type);
998   assert(src0.type == src1.type);
999   switch (src0.type) {
1000   case BRW_REGISTER_TYPE_B:
1001   case BRW_REGISTER_TYPE_UB:
1002   case BRW_REGISTER_TYPE_W:
1003   case BRW_REGISTER_TYPE_UW:
1004   case BRW_REGISTER_TYPE_D:
1005   case BRW_REGISTER_TYPE_UD:
1006      break;
1007   default:
1008      assert(!"Bad type for brw_AVG");
1009   }
1010
1011   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1012}
1013
1014struct brw_instruction *brw_MUL(struct brw_compile *p,
1015				struct brw_reg dest,
1016				struct brw_reg src0,
1017				struct brw_reg src1)
1018{
1019   /* 6.32.38: mul */
1020   if (src0.type == BRW_REGISTER_TYPE_D ||
1021       src0.type == BRW_REGISTER_TYPE_UD ||
1022       src1.type == BRW_REGISTER_TYPE_D ||
1023       src1.type == BRW_REGISTER_TYPE_UD) {
1024      assert(dest.type != BRW_REGISTER_TYPE_F);
1025   }
1026
1027   if (src0.type == BRW_REGISTER_TYPE_F ||
1028       (src0.file == BRW_IMMEDIATE_VALUE &&
1029	src0.type == BRW_REGISTER_TYPE_VF)) {
1030      assert(src1.type != BRW_REGISTER_TYPE_UD);
1031      assert(src1.type != BRW_REGISTER_TYPE_D);
1032   }
1033
1034   if (src1.type == BRW_REGISTER_TYPE_F ||
1035       (src1.file == BRW_IMMEDIATE_VALUE &&
1036	src1.type == BRW_REGISTER_TYPE_VF)) {
1037      assert(src0.type != BRW_REGISTER_TYPE_UD);
1038      assert(src0.type != BRW_REGISTER_TYPE_D);
1039   }
1040
1041   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1042	  src0.nr != BRW_ARF_ACCUMULATOR);
1043   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1044	  src1.nr != BRW_ARF_ACCUMULATOR);
1045
1046   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1047}
1048
1049
1050void brw_NOP(struct brw_compile *p)
1051{
1052   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1053   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1054   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1055   brw_set_src1(p, insn, brw_imm_ud(0x0));
1056}
1057
1058
1059
1060
1061
1062/***********************************************************************
1063 * Comparisons, if/else/endif
1064 */
1065
1066struct brw_instruction *brw_JMPI(struct brw_compile *p,
1067                                 struct brw_reg dest,
1068                                 struct brw_reg src0,
1069                                 struct brw_reg src1)
1070{
1071   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1072
1073   insn->header.execution_size = 1;
1074   insn->header.compression_control = BRW_COMPRESSION_NONE;
1075   insn->header.mask_control = BRW_MASK_DISABLE;
1076
1077   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1078
1079   return insn;
1080}
1081
1082static void
1083push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1084{
1085   p->if_stack[p->if_stack_depth] = inst - p->store;
1086
1087   p->if_stack_depth++;
1088   if (p->if_stack_array_size <= p->if_stack_depth) {
1089      p->if_stack_array_size *= 2;
1090      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1091			     p->if_stack_array_size);
1092   }
1093}
1094
1095static struct brw_instruction *
1096pop_if_stack(struct brw_compile *p)
1097{
1098   p->if_stack_depth--;
1099   return &p->store[p->if_stack[p->if_stack_depth]];
1100}
1101
1102static void
1103push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1104{
1105   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1106      p->loop_stack_array_size *= 2;
1107      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1108			       p->loop_stack_array_size);
1109      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1110				     p->loop_stack_array_size);
1111   }
1112
1113   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1114   p->loop_stack_depth++;
1115   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1116}
1117
1118static struct brw_instruction *
1119get_inner_do_insn(struct brw_compile *p)
1120{
1121   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1122}
1123
1124/* EU takes the value from the flag register and pushes it onto some
1125 * sort of a stack (presumably merging with any flag value already on
1126 * the stack).  Within an if block, the flags at the top of the stack
1127 * control execution on each channel of the unit, eg. on each of the
1128 * 16 pixel values in our wm programs.
1129 *
1130 * When the matching 'else' instruction is reached (presumably by
1131 * countdown of the instruction count patched in by our ELSE/ENDIF
1132 * functions), the relevant flags are inverted.
1133 *
1134 * When the matching 'endif' instruction is reached, the flags are
1135 * popped off.  If the stack is now empty, normal execution resumes.
1136 */
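/* Typical usage from a code generator (a sketch; the execute size depends on
 * the caller's dispatch width):
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *       ... instructions for channels where the flag is set ...
 *    brw_ELSE(p);
 *       ... instructions for the remaining channels ...
 *    brw_ENDIF(p);   // pops the if-stack and patches the jump targets
 */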
1137struct brw_instruction *
1138brw_IF(struct brw_compile *p, GLuint execute_size)
1139{
1140   struct brw_context *brw = p->brw;
1141   struct brw_instruction *insn;
1142
1143   insn = next_insn(p, BRW_OPCODE_IF);
1144
1145   /* Override the defaults for this instruction:
1146    */
1147   if (brw->gen < 6) {
1148      brw_set_dest(p, insn, brw_ip_reg());
1149      brw_set_src0(p, insn, brw_ip_reg());
1150      brw_set_src1(p, insn, brw_imm_d(0x0));
1151   } else if (brw->gen == 6) {
1152      brw_set_dest(p, insn, brw_imm_w(0));
1153      insn->bits1.branch_gen6.jump_count = 0;
1154      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1155      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1156   } else {
1157      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1158      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1159      brw_set_src1(p, insn, brw_imm_ud(0));
1160      insn->bits3.break_cont.jip = 0;
1161      insn->bits3.break_cont.uip = 0;
1162   }
1163
1164   insn->header.execution_size = execute_size;
1165   insn->header.compression_control = BRW_COMPRESSION_NONE;
1166   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
1167   insn->header.mask_control = BRW_MASK_ENABLE;
1168   if (!p->single_program_flow)
1169      insn->header.thread_control = BRW_THREAD_SWITCH;
1170
1171   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1172
1173   push_if_stack(p, insn);
1174   p->if_depth_in_loop[p->loop_stack_depth]++;
1175   return insn;
1176}
1177
1178/* This function is only used for gen6-style IF instructions with an
1179 * embedded comparison (conditional modifier).  It is not used on gen7.
1180 */
1181struct brw_instruction *
1182gen6_IF(struct brw_compile *p, uint32_t conditional,
1183	struct brw_reg src0, struct brw_reg src1)
1184{
1185   struct brw_instruction *insn;
1186
1187   insn = next_insn(p, BRW_OPCODE_IF);
1188
1189   brw_set_dest(p, insn, brw_imm_w(0));
1190   if (p->compressed) {
1191      insn->header.execution_size = BRW_EXECUTE_16;
1192   } else {
1193      insn->header.execution_size = BRW_EXECUTE_8;
1194   }
1195   insn->bits1.branch_gen6.jump_count = 0;
1196   brw_set_src0(p, insn, src0);
1197   brw_set_src1(p, insn, src1);
1198
1199   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1200   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1201   insn->header.destreg__conditionalmod = conditional;
1202
1203   if (!p->single_program_flow)
1204      insn->header.thread_control = BRW_THREAD_SWITCH;
1205
1206   push_if_stack(p, insn);
1207   return insn;
1208}
1209
1210/**
1211 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1212 */
1213static void
1214convert_IF_ELSE_to_ADD(struct brw_compile *p,
1215		       struct brw_instruction *if_inst,
1216		       struct brw_instruction *else_inst)
1217{
1218   /* The next instruction (where the ENDIF would be, if it existed) */
1219   struct brw_instruction *next_inst = &p->store[p->nr_insn];
1220
1221   assert(p->single_program_flow);
1222   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1223   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1224   assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1225
1226   /* Convert IF to an ADD instruction that moves the instruction pointer
1227    * to the first instruction of the ELSE block.  If there is no ELSE
1228    * block, point to where ENDIF would be.  Reverse the predicate.
1229    *
1230    * There's no need to execute an ENDIF since we don't need to do any
1231    * stack operations, and if we're currently executing, we just want to
1232    * continue normally.
1233    */
1234   if_inst->header.opcode = BRW_OPCODE_ADD;
1235   if_inst->header.predicate_inverse = 1;
1236
1237   if (else_inst != NULL) {
1238      /* Convert ELSE to an ADD instruction that points where the ENDIF
1239       * would be.
1240       */
1241      else_inst->header.opcode = BRW_OPCODE_ADD;
1242
1243      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1244      else_inst->bits3.ud = (next_inst - else_inst) * 16;
1245   } else {
1246      if_inst->bits3.ud = (next_inst - if_inst) * 16;
1247   }
1248}
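/* Note: the "* 16" above converts an instruction count into a byte offset,
 * since each native instruction is 16 bytes (see the next_insn_offset
 * bookkeeping in brw_next_insn).
 */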
1249
1250/**
1251 * Patch IF and ELSE instructions with appropriate jump targets.
1252 */
1253static void
1254patch_IF_ELSE(struct brw_compile *p,
1255	      struct brw_instruction *if_inst,
1256	      struct brw_instruction *else_inst,
1257	      struct brw_instruction *endif_inst)
1258{
1259   struct brw_context *brw = p->brw;
1260
1261   /* We shouldn't be patching IF and ELSE instructions in single program flow
1262    * mode when gen < 6, because in single program flow mode on those
1263    * platforms, we convert flow control instructions to conditional ADDs that
1264    * operate on IP (see brw_ENDIF).
1265    *
1266    * However, on Gen6, writing to IP doesn't work in single program flow mode
1267    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1268    * not be updated by non-flow control instructions.").  And on later
1269    * platforms, there is no significant benefit to converting control flow
1270    * instructions to conditional ADDs.  So we do patch IF and ELSE
1271    * instructions in single program flow mode on those platforms.
1272    */
1273   if (brw->gen < 6)
1274      assert(!p->single_program_flow);
1275
1276   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1277   assert(endif_inst != NULL);
1278   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1279
1280   unsigned br = 1;
1281   /* The jump count is in units of 64-bit data chunks, so one 128-bit
1282    * instruction requires 2 chunks.
1283    */
1284   if (brw->gen >= 5)
1285      br = 2;
1286
1287   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1288   endif_inst->header.execution_size = if_inst->header.execution_size;
1289
1290   if (else_inst == NULL) {
1291      /* Patch IF -> ENDIF */
1292      if (brw->gen < 6) {
1293	 /* Turn it into an IFF, which means no mask stack operations for
1294	  * all-false and jumping past the ENDIF.
1295	  */
1296	 if_inst->header.opcode = BRW_OPCODE_IFF;
1297	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1298	 if_inst->bits3.if_else.pop_count = 0;
1299	 if_inst->bits3.if_else.pad0 = 0;
1300      } else if (brw->gen == 6) {
1301	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1302	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1303      } else {
1304	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1305	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1306      }
1307   } else {
1308      else_inst->header.execution_size = if_inst->header.execution_size;
1309
1310      /* Patch IF -> ELSE */
1311      if (brw->gen < 6) {
1312	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1313	 if_inst->bits3.if_else.pop_count = 0;
1314	 if_inst->bits3.if_else.pad0 = 0;
1315      } else if (brw->gen == 6) {
1316	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1317      }
1318
1319      /* Patch ELSE -> ENDIF */
1320      if (brw->gen < 6) {
1321	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1322	  * matching ENDIF.
1323	  */
1324	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1325	 else_inst->bits3.if_else.pop_count = 1;
1326	 else_inst->bits3.if_else.pad0 = 0;
1327      } else if (brw->gen == 6) {
1328	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1329	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1330      } else {
1331	 /* The IF instruction's JIP should point just past the ELSE */
1332	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1333	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1334	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1335	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1336      }
1337   }
1338}
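/* Worked example (instruction indices are hypothetical): with an IF at
 * slot 10, an ELSE at slot 20 and an ENDIF at slot 30 on gen6 (br == 2),
 * the IF gets jump_count = 2 * (20 - 10 + 1) and the ELSE gets
 * jump_count = 2 * (30 - 20), matching the patching above.
 */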
1339
1340void
1341brw_ELSE(struct brw_compile *p)
1342{
1343   struct brw_context *brw = p->brw;
1344   struct brw_instruction *insn;
1345
1346   insn = next_insn(p, BRW_OPCODE_ELSE);
1347
1348   if (brw->gen < 6) {
1349      brw_set_dest(p, insn, brw_ip_reg());
1350      brw_set_src0(p, insn, brw_ip_reg());
1351      brw_set_src1(p, insn, brw_imm_d(0x0));
1352   } else if (brw->gen == 6) {
1353      brw_set_dest(p, insn, brw_imm_w(0));
1354      insn->bits1.branch_gen6.jump_count = 0;
1355      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1356      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1357   } else {
1358      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1359      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1360      brw_set_src1(p, insn, brw_imm_ud(0));
1361      insn->bits3.break_cont.jip = 0;
1362      insn->bits3.break_cont.uip = 0;
1363   }
1364
1365   insn->header.compression_control = BRW_COMPRESSION_NONE;
1366   insn->header.mask_control = BRW_MASK_ENABLE;
1367   if (!p->single_program_flow)
1368      insn->header.thread_control = BRW_THREAD_SWITCH;
1369
1370   push_if_stack(p, insn);
1371}
1372
1373void
1374brw_ENDIF(struct brw_compile *p)
1375{
1376   struct brw_context *brw = p->brw;
1377   struct brw_instruction *insn = NULL;
1378   struct brw_instruction *else_inst = NULL;
1379   struct brw_instruction *if_inst = NULL;
1380   struct brw_instruction *tmp;
1381   bool emit_endif = true;
1382
1383   /* In single program flow mode, we can express IF and ELSE instructions
1384    * equivalently as ADD instructions that operate on IP.  On platforms prior
1385    * to Gen6, flow control instructions cause an implied thread switch, so
1386    * this is a significant savings.
1387    *
1388    * However, on Gen6, writing to IP doesn't work in single program flow mode
1389    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1390    * not be updated by non-flow control instructions.").  And on later
1391    * platforms, there is no significant benefit to converting control flow
1392    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
1393    * Gen5.
1394    */
1395   if (brw->gen < 6 && p->single_program_flow)
1396      emit_endif = false;
1397
1398   /*
1399    * A single next_insn() may change the base address of the instruction
1400    * store memory (p->store), so call it first, before referencing the
1401    * instruction store pointer through an index.
1402    */
1403   if (emit_endif)
1404      insn = next_insn(p, BRW_OPCODE_ENDIF);
1405
1406   /* Pop the IF and (optional) ELSE instructions from the stack */
1407   p->if_depth_in_loop[p->loop_stack_depth]--;
1408   tmp = pop_if_stack(p);
1409   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
1410      else_inst = tmp;
1411      tmp = pop_if_stack(p);
1412   }
1413   if_inst = tmp;
1414
1415   if (!emit_endif) {
1416      /* ENDIF is useless; don't bother emitting it. */
1417      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1418      return;
1419   }
1420
1421   if (brw->gen < 6) {
1422      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1423      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1424      brw_set_src1(p, insn, brw_imm_d(0x0));
1425   } else if (brw->gen == 6) {
1426      brw_set_dest(p, insn, brw_imm_w(0));
1427      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1428      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1429   } else {
1430      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1431      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1432      brw_set_src1(p, insn, brw_imm_ud(0));
1433   }
1434
1435   insn->header.compression_control = BRW_COMPRESSION_NONE;
1436   insn->header.mask_control = BRW_MASK_ENABLE;
1437   insn->header.thread_control = BRW_THREAD_SWITCH;
1438
1439   /* Also pop item off the stack in the endif instruction: */
1440   if (brw->gen < 6) {
1441      insn->bits3.if_else.jump_count = 0;
1442      insn->bits3.if_else.pop_count = 1;
1443      insn->bits3.if_else.pad0 = 0;
1444   } else if (brw->gen == 6) {
1445      insn->bits1.branch_gen6.jump_count = 2;
1446   } else {
1447      insn->bits3.break_cont.jip = 2;
1448   }
1449   patch_IF_ELSE(p, if_inst, else_inst, insn);
1450}
1451
1452struct brw_instruction *brw_BREAK(struct brw_compile *p)
1453{
1454   struct brw_context *brw = p->brw;
1455   struct brw_instruction *insn;
1456
1457   insn = next_insn(p, BRW_OPCODE_BREAK);
1458   if (brw->gen >= 6) {
1459      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1460      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1461      brw_set_src1(p, insn, brw_imm_d(0x0));
1462   } else {
1463      brw_set_dest(p, insn, brw_ip_reg());
1464      brw_set_src0(p, insn, brw_ip_reg());
1465      brw_set_src1(p, insn, brw_imm_d(0x0));
1466      insn->bits3.if_else.pad0 = 0;
1467      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1468   }
1469   insn->header.compression_control = BRW_COMPRESSION_NONE;
1470   insn->header.execution_size = BRW_EXECUTE_8;
1471
1472   return insn;
1473}
1474
1475struct brw_instruction *gen6_CONT(struct brw_compile *p)
1476{
1477   struct brw_instruction *insn;
1478
1479   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1480   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1481   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1482   brw_set_dest(p, insn, brw_ip_reg());
1483   brw_set_src0(p, insn, brw_ip_reg());
1484   brw_set_src1(p, insn, brw_imm_d(0x0));
1485
1486   insn->header.compression_control = BRW_COMPRESSION_NONE;
1487   insn->header.execution_size = BRW_EXECUTE_8;
1488   return insn;
1489}
1490
1491struct brw_instruction *brw_CONT(struct brw_compile *p)
1492{
1493   struct brw_instruction *insn;
1494   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1495   brw_set_dest(p, insn, brw_ip_reg());
1496   brw_set_src0(p, insn, brw_ip_reg());
1497   brw_set_src1(p, insn, brw_imm_d(0x0));
1498   insn->header.compression_control = BRW_COMPRESSION_NONE;
1499   insn->header.execution_size = BRW_EXECUTE_8;
1500   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1501   insn->bits3.if_else.pad0 = 0;
1502   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1503   return insn;
1504}
1505
1506struct brw_instruction *gen6_HALT(struct brw_compile *p)
1507{
1508   struct brw_instruction *insn;
1509
1510   insn = next_insn(p, BRW_OPCODE_HALT);
1511   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1512   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1513   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1514
1515   if (p->compressed) {
1516      insn->header.execution_size = BRW_EXECUTE_16;
1517   } else {
1518      insn->header.compression_control = BRW_COMPRESSION_NONE;
1519      insn->header.execution_size = BRW_EXECUTE_8;
1520   }
1521   return insn;
1522}
1523
1524/* DO/WHILE loop:
1525 *
1526 * The DO/WHILE is just an unterminated loop -- break or continue are
1527 * used for control within the loop.  We have a few ways they can be
1528 * done.
1529 *
1530 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1531 * jip and no DO instruction.
1532 *
1533 * For non-uniform control flow pre-gen6, there's a DO instruction to
1534 * push the mask, and a WHILE to jump back, and BREAK to get out and
1535 * pop the mask.
1536 *
1537 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1538 * just points back to the first instruction of the loop.
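 *
 * As an illustrative sketch (not lifted from a real caller; execution
 * widths and register choices vary), a non-uniform loop is typically
 * emitted as:
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body, which may contain ...
 *       brw_BREAK(p);     // patched later to jump past the WHILE
 *       brw_CONT(p);      // patched later to jump to the WHILE
 *    brw_WHILE(p);        // pre-gen6: patches the BREAK/CONT above
 *
 * On gen6+ the BREAK/CONT offsets are instead fixed up afterwards by
 * brw_set_uip_jip().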
1539 */
1540struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1541{
1542   struct brw_context *brw = p->brw;
1543
1544   if (brw->gen >= 6 || p->single_program_flow) {
1545      push_loop_stack(p, &p->store[p->nr_insn]);
1546      return &p->store[p->nr_insn];
1547   } else {
1548      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1549
1550      push_loop_stack(p, insn);
1551
1552      /* Override the defaults for this instruction:
1553       */
1554      brw_set_dest(p, insn, brw_null_reg());
1555      brw_set_src0(p, insn, brw_null_reg());
1556      brw_set_src1(p, insn, brw_null_reg());
1557
1558      insn->header.compression_control = BRW_COMPRESSION_NONE;
1559      insn->header.execution_size = execute_size;
1560      insn->header.predicate_control = BRW_PREDICATE_NONE;
1561      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1562      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1563
1564      return insn;
1565   }
1566}
1567
1568/**
1569 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1570 * instruction here.
1571 *
1572 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1573 * nesting, since it can always just point to the end of the block/current loop.
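 *
 * Worked example for the arithmetic below (numbers are illustrative): with
 * br == 1 (gen4), a BREAK sitting three instructions above the WHILE is
 * patched to jump_count == (while_inst - inst) + 1 == 4, while a CONTINUE
 * in the same spot gets 3 -- BREAK always lands one instruction further
 * than CONTINUE, past the WHILE rather than on it.  On gen5, br == 2 simply
 * doubles both values, since jumps there are counted in 64-bit halves of an
 * instruction rather than whole 128-bit instructions.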
1574 */
1575static void
1576brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1577{
1578   struct brw_context *brw = p->brw;
1579   struct brw_instruction *do_inst = get_inner_do_insn(p);
1580   struct brw_instruction *inst;
1581   int br = (brw->gen == 5) ? 2 : 1;
1582
1583   for (inst = while_inst - 1; inst != do_inst; inst--) {
1584      /* If the jump count is != 0, that means that this instruction has already
1585       * been patched because it's part of a loop inside of the one we're
1586       * patching.
1587       */
1588      if (inst->header.opcode == BRW_OPCODE_BREAK &&
1589	  inst->bits3.if_else.jump_count == 0) {
1590	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1591      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1592		 inst->bits3.if_else.jump_count == 0) {
1593	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1594      }
1595   }
1596}
1597
1598struct brw_instruction *brw_WHILE(struct brw_compile *p)
1599{
1600   struct brw_context *brw = p->brw;
1601   struct brw_instruction *insn, *do_insn;
1602   GLuint br = 1;
1603
1604   if (brw->gen >= 5)
1605      br = 2;
1606
1607   if (brw->gen >= 7) {
1608      insn = next_insn(p, BRW_OPCODE_WHILE);
1609      do_insn = get_inner_do_insn(p);
1610
1611      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1612      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1613      brw_set_src1(p, insn, brw_imm_ud(0));
1614      insn->bits3.break_cont.jip = br * (do_insn - insn);
1615
1616      insn->header.execution_size = BRW_EXECUTE_8;
1617   } else if (brw->gen == 6) {
1618      insn = next_insn(p, BRW_OPCODE_WHILE);
1619      do_insn = get_inner_do_insn(p);
1620
1621      brw_set_dest(p, insn, brw_imm_w(0));
1622      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1623      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1624      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1625
1626      insn->header.execution_size = BRW_EXECUTE_8;
1627   } else {
1628      if (p->single_program_flow) {
1629	 insn = next_insn(p, BRW_OPCODE_ADD);
1630         do_insn = get_inner_do_insn(p);
1631
1632	 brw_set_dest(p, insn, brw_ip_reg());
1633	 brw_set_src0(p, insn, brw_ip_reg());
1634	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1635	 insn->header.execution_size = BRW_EXECUTE_1;
1636      } else {
1637	 insn = next_insn(p, BRW_OPCODE_WHILE);
1638         do_insn = get_inner_do_insn(p);
1639
1640	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1641
1642	 brw_set_dest(p, insn, brw_ip_reg());
1643	 brw_set_src0(p, insn, brw_ip_reg());
1644	 brw_set_src1(p, insn, brw_imm_d(0));
1645
1646	 insn->header.execution_size = do_insn->header.execution_size;
1647	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1648	 insn->bits3.if_else.pop_count = 0;
1649	 insn->bits3.if_else.pad0 = 0;
1650
1651	 brw_patch_break_cont(p, insn);
1652      }
1653   }
1654   insn->header.compression_control = BRW_COMPRESSION_NONE;
1655   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1656
1657   p->loop_stack_depth--;
1658
1659   return insn;
1660}
1661
1662
1663/* FORWARD JUMPS:
1664 */
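/* Illustrative use of brw_land_fwd_jump() (the flow below is a sketch, not
 * copied from a real caller): remember the index at which the JMPI was
 * emitted, emit the instructions to be skipped, then patch the jump
 * distance once the landing point is known:
 *
 *    int jmp_idx = p->nr_insn;
 *    brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    ... instructions that the jump skips over ...
 *    brw_land_fwd_jump(p, jmp_idx);
 */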
1665void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1666{
1667   struct brw_context *brw = p->brw;
1668   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1669   GLuint jmpi = 1;
1670
1671   if (brw->gen >= 5)
1672      jmpi = 2;
1673
1674   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1675   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1676
1677   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1678}
1679
1680
1681
1682/* To integrate with the above, it makes sense that the comparison
1683 * instruction should populate the flag register.  It might be simpler
1684 * just to use the flag reg for most WM tasks?
1685 */
1686void brw_CMP(struct brw_compile *p,
1687	     struct brw_reg dest,
1688	     GLuint conditional,
1689	     struct brw_reg src0,
1690	     struct brw_reg src1)
1691{
1692   struct brw_context *brw = p->brw;
1693   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1694
1695   insn->header.destreg__conditionalmod = conditional;
1696   brw_set_dest(p, insn, dest);
1697   brw_set_src0(p, insn, src0);
1698   brw_set_src1(p, insn, src1);
1699
1700/*    guess_execution_size(insn, src0); */
1701
1703   /* Make it so that future instructions will use the computed flag
1704    * value until brw_set_predicate_control_flag_value() is called
1705    * again.
1706    */
1707   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1708       dest.nr == 0) {
1709      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1710      p->flag_value = 0xff;
1711   }
1712
1713   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1714    * page says:
1715    *    "Any CMP instruction with a null destination must use a {switch}."
1716    *
1717    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1718    * mentioned on their work-arounds pages.
1719    */
1720   if (brw->gen == 7) {
1721      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1722          dest.nr == BRW_ARF_NULL) {
1723         insn->header.thread_control = BRW_THREAD_SWITCH;
1724      }
1725   }
1726}
1727
1728/* Issue a 'wait' instruction on notification register n1; the host can
1729   program MMIO to wake up the thread. */
1730void brw_WAIT (struct brw_compile *p)
1731{
1732   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1733   struct brw_reg src = brw_notification_1_reg();
1734
1735   brw_set_dest(p, insn, src);
1736   brw_set_src0(p, insn, src);
1737   brw_set_src1(p, insn, brw_null_reg());
1738   insn->header.execution_size = 0; /* must */
1739   insn->header.predicate_control = 0;
1740   insn->header.compression_control = 0;
1741}
1742
1743
1744/***********************************************************************
1745 * Helpers for the various SEND message types:
1746 */
1747
1748/** Extended math function, float[8].
1749 */
1750void brw_math( struct brw_compile *p,
1751	       struct brw_reg dest,
1752	       GLuint function,
1753	       GLuint msg_reg_nr,
1754	       struct brw_reg src,
1755	       GLuint data_type,
1756	       GLuint precision )
1757{
1758   struct brw_context *brw = p->brw;
1759
1760   if (brw->gen >= 6) {
1761      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1762
1763      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1764             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1765      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1766
1767      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1768      if (brw->gen == 6)
1769	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1770
1771      /* Source modifiers are ignored for extended math instructions on Gen6. */
1772      if (brw->gen == 6) {
1773	 assert(!src.negate);
1774	 assert(!src.abs);
1775      }
1776
1777      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1778	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1779	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1780	 assert(src.type != BRW_REGISTER_TYPE_F);
1781      } else {
1782	 assert(src.type == BRW_REGISTER_TYPE_F);
1783      }
1784
1785      /* Math is the same ISA format as other opcodes, except that CondModifier
1786       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1787       */
1788      insn->header.destreg__conditionalmod = function;
1789
1790      brw_set_dest(p, insn, dest);
1791      brw_set_src0(p, insn, src);
1792      brw_set_src1(p, insn, brw_null_reg());
1793   } else {
1794      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1795
1796      /* Example code doesn't set predicate_control for send
1797       * instructions.
1798       */
1799      insn->header.predicate_control = 0;
1800      insn->header.destreg__conditionalmod = msg_reg_nr;
1801
1802      brw_set_dest(p, insn, dest);
1803      brw_set_src0(p, insn, src);
1804      brw_set_math_message(p,
1805			   insn,
1806			   function,
1807			   src.type == BRW_REGISTER_TYPE_D,
1808			   precision,
1809			   data_type);
1810   }
1811}
1812
1813/** Extended math function, float[8].
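 *
 * For example (illustrative), brw_math2(p, dst, BRW_MATH_FUNCTION_POW, x, y)
 * emits a gen6+ MATH instruction computing dst = pow(x, y) on the EU's
 * extended math unit, with the function selector packed into
 * destreg__conditionalmod as noted below.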
1814 */
1815void brw_math2(struct brw_compile *p,
1816	       struct brw_reg dest,
1817	       GLuint function,
1818	       struct brw_reg src0,
1819	       struct brw_reg src1)
1820{
1821   struct brw_context *brw = p->brw;
1822   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1823
1824   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1825          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1826   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1827   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1828
1829   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1830   if (brw->gen == 6) {
1831      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1832      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1833   }
1834
1835   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1836       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1837       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1838      assert(src0.type != BRW_REGISTER_TYPE_F);
1839      assert(src1.type != BRW_REGISTER_TYPE_F);
1840   } else {
1841      assert(src0.type == BRW_REGISTER_TYPE_F);
1842      assert(src1.type == BRW_REGISTER_TYPE_F);
1843   }
1844
1845   /* Source modifiers are ignored for extended math instructions on Gen6. */
1846   if (brw->gen == 6) {
1847      assert(!src0.negate);
1848      assert(!src0.abs);
1849      assert(!src1.negate);
1850      assert(!src1.abs);
1851   }
1852
1853   /* Math is the same ISA format as other opcodes, except that CondModifier
1854    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1855    */
1856   insn->header.destreg__conditionalmod = function;
1857
1858   brw_set_dest(p, insn, dest);
1859   brw_set_src0(p, insn, src0);
1860   brw_set_src1(p, insn, src1);
1861}
1862
1863
1864/**
1865 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1866 * using a constant offset per channel.
1867 *
1868 * The offset must be aligned to oword size (16 bytes).  Used for
1869 * register spilling.
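 *
 * Illustrative call (MRF number and offset are made-up values): having
 * staged the data to spill in m3, a caller might issue
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(2), 1, 64);
 *
 * which builds the message header in m2 and writes one GRF (two owords) to
 * byte offset 64 of the stateless scratch space.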
1870 */
1871void brw_oword_block_write_scratch(struct brw_compile *p,
1872				   struct brw_reg mrf,
1873				   int num_regs,
1874				   GLuint offset)
1875{
1876   struct brw_context *brw = p->brw;
1877   uint32_t msg_control, msg_type;
1878   int mlen;
1879
1880   if (brw->gen >= 6)
1881      offset /= 16;
1882
1883   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1884
1885   if (num_regs == 1) {
1886      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1887      mlen = 2;
1888   } else {
1889      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1890      mlen = 3;
1891   }
1892
1893   /* Set up the message header.  This is g0, with g0.2 filled with
1894    * the offset.  We don't want to leave our offset around in g0 or
1895    * it'll screw up texture samples, so set it up inside the message
1896    * reg.
1897    */
1898   {
1899      brw_push_insn_state(p);
1900      brw_set_mask_control(p, BRW_MASK_DISABLE);
1901      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1902
1903      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1904
1905      /* set message header global offset field (reg 0, element 2) */
1906      brw_MOV(p,
1907	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1908				  mrf.nr,
1909				  2), BRW_REGISTER_TYPE_UD),
1910	      brw_imm_ud(offset));
1911
1912      brw_pop_insn_state(p);
1913   }
1914
1915   {
1916      struct brw_reg dest;
1917      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1918      int send_commit_msg;
1919      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1920					 BRW_REGISTER_TYPE_UW);
1921
1922      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1923	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1924	 src_header = vec16(src_header);
1925      }
1926      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1927      insn->header.destreg__conditionalmod = mrf.nr;
1928
1929      /* Until gen6, writes followed by reads from the same location
1930       * are not guaranteed to be ordered unless write_commit is set.
1931       * If set, then a no-op write is issued to the destination
1932       * register to set a dependency, and a read from the destination
1933       * can be used to ensure the ordering.
1934       *
1935       * For gen6, only writes between different threads need ordering
1936       * protection.  Our use of DP writes is all about register
1937       * spilling within a thread.
1938       */
1939      if (brw->gen >= 6) {
1940	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1941	 send_commit_msg = 0;
1942      } else {
1943	 dest = src_header;
1944	 send_commit_msg = 1;
1945      }
1946
1947      brw_set_dest(p, insn, dest);
1948      if (brw->gen >= 6) {
1949	 brw_set_src0(p, insn, mrf);
1950      } else {
1951	 brw_set_src0(p, insn, brw_null_reg());
1952      }
1953
1954      if (brw->gen >= 6)
1955	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1956      else
1957	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1958
1959      brw_set_dp_write_message(p,
1960			       insn,
1961			       255, /* binding table index (255=stateless) */
1962			       msg_control,
1963			       msg_type,
1964			       mlen,
1965			       true, /* header_present */
1966			       0, /* not a render target */
1967			       send_commit_msg, /* response_length */
1968			       0, /* eot */
1969			       send_commit_msg);
1970   }
1971}
1972
1973
1974/**
1975 * Read a block of owords (half a GRF each) from the scratch buffer
1976 * using a constant index per channel.
1977 *
1978 * Offset must be aligned to oword size (16 bytes).  Used for register
1979 * spilling.
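 *
 * Illustrative call mirroring the write example above (register numbers and
 * offset are made up):
 *
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(10, 0),
 *                                 brw_message_reg(2), 1, 64);
 *
 * unspills one GRF from byte offset 64 of the scratch space into g10.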
1980 */
1981void
1982brw_oword_block_read_scratch(struct brw_compile *p,
1983			     struct brw_reg dest,
1984			     struct brw_reg mrf,
1985			     int num_regs,
1986			     GLuint offset)
1987{
1988   struct brw_context *brw = p->brw;
1989   uint32_t msg_control;
1990   int rlen;
1991
1992   if (brw->gen >= 6)
1993      offset /= 16;
1994
1995   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1996   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1997
1998   if (num_regs == 1) {
1999      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2000      rlen = 1;
2001   } else {
2002      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2003      rlen = 2;
2004   }
2005
2006   {
2007      brw_push_insn_state(p);
2008      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2009      brw_set_mask_control(p, BRW_MASK_DISABLE);
2010
2011      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2012
2013      /* set message header global offset field (reg 0, element 2) */
2014      brw_MOV(p,
2015	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2016				  mrf.nr,
2017				  2), BRW_REGISTER_TYPE_UD),
2018	      brw_imm_ud(offset));
2019
2020      brw_pop_insn_state(p);
2021   }
2022
2023   {
2024      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2025
2026      assert(insn->header.predicate_control == 0);
2027      insn->header.compression_control = BRW_COMPRESSION_NONE;
2028      insn->header.destreg__conditionalmod = mrf.nr;
2029
2030      brw_set_dest(p, insn, dest);	/* UW? */
2031      if (brw->gen >= 6) {
2032	 brw_set_src0(p, insn, mrf);
2033      } else {
2034	 brw_set_src0(p, insn, brw_null_reg());
2035      }
2036
2037      brw_set_dp_read_message(p,
2038			      insn,
2039			      255, /* binding table index (255=stateless) */
2040			      msg_control,
2041			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2042			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2043			      1, /* msg_length */
2044                              true, /* header_present */
2045			      rlen);
2046   }
2047}
2048
2049/**
2050 * Read a float[4] vector from the data port Data Cache (const buffer).
2051 * Location (in buffer) should be a multiple of 16.
2052 * Used for fetching shader constants.
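 *
 * Illustrative call (the binding table slot and offset are placeholders):
 *
 *    brw_oword_block_read(p, brw_vec8_grf(4, 0), brw_message_reg(2), 16, 0);
 *
 * reads one oword (a float[4]) from 16 bytes into the buffer bound at
 * binding table index 0 and lands it in g4.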
2053 */
2054void brw_oword_block_read(struct brw_compile *p,
2055			  struct brw_reg dest,
2056			  struct brw_reg mrf,
2057			  uint32_t offset,
2058			  uint32_t bind_table_index)
2059{
2060   struct brw_context *brw = p->brw;
2061
2062   /* On newer hardware, offset is in units of owords. */
2063   if (brw->gen >= 6)
2064      offset /= 16;
2065
2066   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2067
2068   brw_push_insn_state(p);
2069   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2070   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2071   brw_set_mask_control(p, BRW_MASK_DISABLE);
2072
2073   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2074
2075   /* set message header global offset field (reg 0, element 2) */
2076   brw_MOV(p,
2077	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2078			       mrf.nr,
2079			       2), BRW_REGISTER_TYPE_UD),
2080	   brw_imm_ud(offset));
2081
2082   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2083   insn->header.destreg__conditionalmod = mrf.nr;
2084
2085   /* cast dest to a uword[8] vector */
2086   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2087
2088   brw_set_dest(p, insn, dest);
2089   if (brw->gen >= 6) {
2090      brw_set_src0(p, insn, mrf);
2091   } else {
2092      brw_set_src0(p, insn, brw_null_reg());
2093   }
2094
2095   brw_set_dp_read_message(p,
2096			   insn,
2097			   bind_table_index,
2098			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2099			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2100			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2101			   1, /* msg_length */
2102                           true, /* header_present */
2103			   1); /* response_length (1 reg, 2 owords!) */
2104
2105   brw_pop_insn_state(p);
2106}
2107
2108
2109void brw_fb_WRITE(struct brw_compile *p,
2110		  int dispatch_width,
2111                  GLuint msg_reg_nr,
2112                  struct brw_reg src0,
2113                  GLuint msg_control,
2114                  GLuint binding_table_index,
2115                  GLuint msg_length,
2116                  GLuint response_length,
2117                  bool eot,
2118                  bool header_present)
2119{
2120   struct brw_context *brw = p->brw;
2121   struct brw_instruction *insn;
2122   GLuint msg_type;
2123   struct brw_reg dest;
2124
2125   if (dispatch_width == 16)
2126      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2127   else
2128      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2129
2130   if (brw->gen >= 6) {
2131      insn = next_insn(p, BRW_OPCODE_SENDC);
2132   } else {
2133      insn = next_insn(p, BRW_OPCODE_SEND);
2134   }
2135   /* The execution mask is ignored for render target writes. */
2136   insn->header.predicate_control = 0;
2137   insn->header.compression_control = BRW_COMPRESSION_NONE;
2138
2139   if (brw->gen >= 6) {
2140      /* headerless version, just submit color payload */
2141      src0 = brw_message_reg(msg_reg_nr);
2142
2143      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2144   } else {
2145      insn->header.destreg__conditionalmod = msg_reg_nr;
2146
2147      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2148   }
2149
2150   brw_set_dest(p, insn, dest);
2151   brw_set_src0(p, insn, src0);
2152   brw_set_dp_write_message(p,
2153			    insn,
2154			    binding_table_index,
2155			    msg_control,
2156			    msg_type,
2157			    msg_length,
2158			    header_present,
2159			    eot, /* last render target write */
2160			    response_length,
2161			    eot,
2162			    0 /* send_commit_msg */);
2163}
2164
2165
2166/**
2167 * Texture sample instruction.
2168 * Note: the msg_type plus msg_length values determine exactly what kind
2169 * of sampling operation is performed.  See volume 4, page 161 of docs.
2170 */
2171void brw_SAMPLE(struct brw_compile *p,
2172		struct brw_reg dest,
2173		GLuint msg_reg_nr,
2174		struct brw_reg src0,
2175		GLuint binding_table_index,
2176		GLuint sampler,
2177		GLuint msg_type,
2178		GLuint response_length,
2179		GLuint msg_length,
2180		GLuint header_present,
2181		GLuint simd_mode,
2182		GLuint return_format)
2183{
2184   struct brw_context *brw = p->brw;
2185   struct brw_instruction *insn;
2186
2187   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2188
2189   insn = next_insn(p, BRW_OPCODE_SEND);
2190   insn->header.predicate_control = 0; /* XXX */
2191   insn->header.compression_control = BRW_COMPRESSION_NONE;
2192   if (brw->gen < 6)
2193      insn->header.destreg__conditionalmod = msg_reg_nr;
2194
2195   brw_set_dest(p, insn, dest);
2196   brw_set_src0(p, insn, src0);
2197   brw_set_sampler_message(p, insn,
2198                           binding_table_index,
2199                           sampler,
2200                           msg_type,
2201                           response_length,
2202                           msg_length,
2203                           header_present,
2204                           simd_mode,
2205                           return_format);
2206}
2207
2208/* All these variables are pretty confusing - we might be better off
2209 * using bitmasks and macros for this, in the old style.  Or perhaps
2210 * just having the caller instantiate the fields in dword3 itself.
2211 */
2212void brw_urb_WRITE(struct brw_compile *p,
2213		   struct brw_reg dest,
2214		   GLuint msg_reg_nr,
2215		   struct brw_reg src0,
2216                   unsigned flags,
2217		   GLuint msg_length,
2218		   GLuint response_length,
2219		   GLuint offset,
2220		   GLuint swizzle)
2221{
2222   struct brw_context *brw = p->brw;
2223   struct brw_instruction *insn;
2224
2225   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2226
2227   if (brw->gen == 7) {
2228      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2229      brw_push_insn_state(p);
2230      brw_set_access_mode(p, BRW_ALIGN_1);
2231      brw_set_mask_control(p, BRW_MASK_DISABLE);
2232      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2233		       BRW_REGISTER_TYPE_UD),
2234	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2235		brw_imm_ud(0xff00));
2236      brw_pop_insn_state(p);
2237   }
2238
2239   insn = next_insn(p, BRW_OPCODE_SEND);
2240
2241   assert(msg_length < BRW_MAX_MRF);
2242
2243   brw_set_dest(p, insn, dest);
2244   brw_set_src0(p, insn, src0);
2245   brw_set_src1(p, insn, brw_imm_d(0));
2246
2247   if (brw->gen < 6)
2248      insn->header.destreg__conditionalmod = msg_reg_nr;
2249
2250   brw_set_urb_message(p,
2251		       insn,
2252		       flags,
2253		       msg_length,
2254		       response_length,
2255		       offset,
2256		       swizzle);
2257}
2258
2259static int
2260next_ip(struct brw_compile *p, int ip)
2261{
2262   struct brw_instruction *insn = (void *)p->store + ip;
2263
2264   if (insn->header.cmpt_control)
2265      return ip + 8;
2266   else
2267      return ip + 16;
2268}
2269
2270static int
2271brw_find_next_block_end(struct brw_compile *p, int start)
2272{
2273   int ip;
2274   void *store = p->store;
2275
2276   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2277      struct brw_instruction *insn = store + ip;
2278
2279      switch (insn->header.opcode) {
2280      case BRW_OPCODE_ENDIF:
2281      case BRW_OPCODE_ELSE:
2282      case BRW_OPCODE_WHILE:
2283      case BRW_OPCODE_HALT:
2284	 return ip;
2285      }
2286   }
2287
2288   return 0;
2289}
2290
2291/* There is no DO instruction on gen6, so to find the end of the loop
2292 * we have to see if the loop is jumping back before our start
2293 * instruction.
2294 */
2295static int
2296brw_find_loop_end(struct brw_compile *p, int start)
2297{
2298   struct brw_context *brw = p->brw;
2299   int ip;
2300   int scale = 8;
2301   void *store = p->store;
2302
2303   /* Always start after the instruction (such as a WHILE) we're trying to fix
2304    * up.
2305    */
2306   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2307      struct brw_instruction *insn = store + ip;
2308
2309      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2310	 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2311				   : insn->bits3.break_cont.jip;
2312	 if (ip + jip * scale <= start)
2313	    return ip;
2314      }
2315   }
2316   assert(!"not reached");
2317   return start;
2318}
2319
2320/* After program generation, go back and update the UIP and JIP of
2321 * BREAK, CONT, and HALT instructions to their correct locations.
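 *
 * JIP/UIP are expressed in units of 8 bytes (scale == 8 below), so e.g. a
 * BREAK whose enclosing block ends three uncompacted (16-byte) instructions
 * later gets jip == 48 / 8 == 6.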
2322 */
2323void
2324brw_set_uip_jip(struct brw_compile *p)
2325{
2326   struct brw_context *brw = p->brw;
2327   int ip;
2328   int scale = 8;
2329   void *store = p->store;
2330
2331   if (brw->gen < 6)
2332      return;
2333
2334   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2335      struct brw_instruction *insn = store + ip;
2336
2337      if (insn->header.cmpt_control) {
2338	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2339	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2340		insn->header.opcode != BRW_OPCODE_CONTINUE &&
2341		insn->header.opcode != BRW_OPCODE_HALT);
2342	 continue;
2343      }
2344
2345      int block_end_ip = brw_find_next_block_end(p, ip);
2346      switch (insn->header.opcode) {
2347      case BRW_OPCODE_BREAK:
2348         assert(block_end_ip != 0);
2349	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2350	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2351	 insn->bits3.break_cont.uip =
2352	    (brw_find_loop_end(p, ip) - ip +
2353             (brw->gen == 6 ? 16 : 0)) / scale;
2354	 break;
2355      case BRW_OPCODE_CONTINUE:
2356         assert(block_end_ip != 0);
2357	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2358	 insn->bits3.break_cont.uip =
2359            (brw_find_loop_end(p, ip) - ip) / scale;
2360
2361	 assert(insn->bits3.break_cont.uip != 0);
2362	 assert(insn->bits3.break_cont.jip != 0);
2363	 break;
2364
2365      case BRW_OPCODE_ENDIF:
2366         if (block_end_ip == 0)
2367            insn->bits3.break_cont.jip = 2;
2368         else
2369            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2370	 break;
2371
2372      case BRW_OPCODE_HALT:
2373	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2374	  *
2375	  *    "In case of the halt instruction not inside any conditional
2376	  *     code block, the value of <JIP> and <UIP> should be the
2377	  *     same. In case of the halt instruction inside conditional code
2378	  *     block, the <UIP> should be the end of the program, and the
2379	  *     <JIP> should be end of the most inner conditional code block."
2380	  *
2381	  * The uip will have already been set by whoever set up the
2382	  * instruction.
2383	  */
2384	 if (block_end_ip == 0) {
2385	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2386	 } else {
2387	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
2388	 }
2389	 assert(insn->bits3.break_cont.uip != 0);
2390	 assert(insn->bits3.break_cont.jip != 0);
2391	 break;
2392      }
2393   }
2394}
2395
2396void brw_ff_sync(struct brw_compile *p,
2397		   struct brw_reg dest,
2398		   GLuint msg_reg_nr,
2399		   struct brw_reg src0,
2400		   bool allocate,
2401		   GLuint response_length,
2402		   bool eot)
2403{
2404   struct brw_context *brw = p->brw;
2405   struct brw_instruction *insn;
2406
2407   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2408
2409   insn = next_insn(p, BRW_OPCODE_SEND);
2410   brw_set_dest(p, insn, dest);
2411   brw_set_src0(p, insn, src0);
2412   brw_set_src1(p, insn, brw_imm_d(0));
2413
2414   if (brw->gen < 6)
2415      insn->header.destreg__conditionalmod = msg_reg_nr;
2416
2417   brw_set_ff_sync_message(p,
2418			   insn,
2419			   allocate,
2420			   response_length,
2421			   eot);
2422}
2423
2424/**
2425 * Emit the SEND instruction necessary to generate stream output data on Gen6
2426 * (for transform feedback).
2427 *
2428 * If send_commit_msg is true, this is the last piece of stream output data
2429 * from this thread, so send the data as a committed write.  According to the
2430 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2431 *
2432 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2433 *   writes are complete by sending the final write as a committed write."
2434 */
2435void
2436brw_svb_write(struct brw_compile *p,
2437              struct brw_reg dest,
2438              GLuint msg_reg_nr,
2439              struct brw_reg src0,
2440              GLuint binding_table_index,
2441              bool   send_commit_msg)
2442{
2443   struct brw_instruction *insn;
2444
2445   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2446
2447   insn = next_insn(p, BRW_OPCODE_SEND);
2448   brw_set_dest(p, insn, dest);
2449   brw_set_src0(p, insn, src0);
2450   brw_set_src1(p, insn, brw_imm_d(0));
2451   brw_set_dp_write_message(p, insn,
2452                            binding_table_index,
2453                            0, /* msg_control: ignored */
2454                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2455                            1, /* msg_length */
2456                            true, /* header_present */
2457                            0, /* last_render_target: ignored */
2458                            send_commit_msg, /* response_length */
2459                            0, /* end_of_thread */
2460                            send_commit_msg); /* send_commit_msg */
2461}
2462
2463/**
2464 * This instruction is generated as a single-channel align1 instruction by
2465 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2466 *
2467 * We can't use the typed atomic op in the FS because that has the execution
2468 * mask ANDed with the pixel mask, but we just want to write the one dword for
2469 * all the pixels.
2470 *
2471 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
2472 * one u32.  So we use the same untyped atomic write message as the pixel
2473 * shader.
2474 *
2475 * The untyped atomic operation requires a BUFFER surface type with RAW
2476 * format, and is only accessible through the legacy DATA_CACHE dataport
2477 * messages.
2478 */
2479void brw_shader_time_add(struct brw_compile *p,
2480                         struct brw_reg payload,
2481                         uint32_t surf_index)
2482{
2483   struct brw_context *brw = p->brw;
2484   assert(brw->gen >= 7);
2485
2486   brw_push_insn_state(p);
2487   brw_set_access_mode(p, BRW_ALIGN_1);
2488   brw_set_mask_control(p, BRW_MASK_DISABLE);
2489   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2490   brw_pop_insn_state(p);
2491
2492   /* We use brw_vec1_reg and unmasked because we want to increment the given
2493    * offset only once.
2494    */
2495   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2496                                      BRW_ARF_NULL, 0));
2497   brw_set_src0(p, send, brw_vec1_reg(payload.file,
2498                                      payload.nr, 0));
2499
2500   uint32_t sfid, msg_type;
2501   if (brw->is_haswell) {
2502      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2503      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2504   } else {
2505      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
2506      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2507   }
2508
2509   bool header_present = false;
2510   bool eot = false;
2511   uint32_t mlen = 2; /* offset, value */
2512   uint32_t rlen = 0;
2513   brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);
2514
2515   send->bits3.ud |= msg_type << 14;
2516   send->bits3.ud |= 0 << 13; /* no return data */
2517   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2518   send->bits3.ud |= BRW_AOP_ADD << 8;
2519   send->bits3.ud |= surf_index << 0;
2520}
2521