brw_eu_emit.c revision 9b4053cabd8bda180b352d2d2047209f6ca5f6e8
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 *
 * On return, \p src is rewritten to point at message register
 * \p msg_reg_nr regardless of where the payload originally lived.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;

   /* Pre-gen6 hardware still performs the implied move; nothing to do. */
   if (intel->gen < 6)
      return;

   /* The payload is already in a message register; no copy needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Copy the payload into the MRF with masking and compression
       * disabled, retyping both sides to UD so the raw bits are moved
       * unmodified whatever the source's actual type is.  The insn
       * state is saved/restored around the MOV so the caller's state
       * is unaffected.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Point the caller's source at the message register we (possibly)
    * just filled.
    */
   *src = brw_message_reg(msg_reg_nr);
}
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   /* From the BSpec / ISA Reference / send - [DevIVB+]:
88    * "The send with EOT should use register space R112-R127 for <src>. This is
89    *  to enable loading of a new thread into the same slot while the message
90    *  with EOT for current thread is pending dispatch."
91    *
92    * Since we're pretending to have 16 MRFs anyway, we may as well use the
93    * registers required for messages with EOT.
94    */
95   struct intel_context *intel = &p->brw->intel;
96   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97      reg->file = BRW_GENERAL_REGISTER_FILE;
98      reg->nr += GEN7_MRF_HACK_START;
99   }
100}
101
102
/**
 * Encode \p dest as the destination operand of \p insn (the bits1
 * dword), and set the instruction's execution size from the
 * destination width.
 *
 * Handles all four encoding layouts: direct/indirect addressing
 * crossed with align1/align16 access mode.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* ARF and MRF numbers have their own namespaces; only GRF-style
    * register numbers are bounded by the 128-register file.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A stride-0 destination is not expressible; encode it as
	  * stride 1 instead.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16: subregister is in units of 16 bytes and writes are
	  * channel-masked.
	  */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
156
157extern int reg_type_size[];
158
159static void
160validate_reg(struct brw_instruction *insn, struct brw_reg reg)
161{
162   int hstride_for_reg[] = {0, 1, 2, 4};
163   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
164   int width_for_reg[] = {1, 2, 4, 8, 16};
165   int execsize_for_reg[] = {1, 2, 4, 8, 16};
166   int width, hstride, vstride, execsize;
167
168   if (reg.file == BRW_IMMEDIATE_VALUE) {
169      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
170       * mean the destination has to be 128-bit aligned and the
171       * destination horiz stride has to be a word.
172       */
173      if (reg.type == BRW_REGISTER_TYPE_V) {
174	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
175		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
176      }
177
178      return;
179   }
180
181   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
182       reg.file == BRW_ARF_NULL)
183      return;
184
185   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
186   hstride = hstride_for_reg[reg.hstride];
187
188   if (reg.vstride == 0xf) {
189      vstride = -1;
190   } else {
191      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
192      vstride = vstride_for_reg[reg.vstride];
193   }
194
195   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
196   width = width_for_reg[reg.width];
197
198   assert(insn->header.execution_size >= 0 &&
199	  insn->header.execution_size < Elements(execsize_for_reg));
200   execsize = execsize_for_reg[insn->header.execution_size];
201
202   /* Restrictions from 3.3.10: Register Region Restrictions. */
203   /* 3. */
204   assert(execsize >= width);
205
206   /* 4. */
207   if (execsize == width && hstride != 0) {
208      assert(vstride == -1 || vstride == width * hstride);
209   }
210
211   /* 5. */
212   if (execsize == width && hstride == 0) {
213      /* no restriction on vstride. */
214   }
215
216   /* 6. */
217   if (width == 1) {
218      assert(hstride == 0);
219   }
220
221   /* 7. */
222   if (execsize == 1 && width == 1) {
223      assert(hstride == 0);
224      assert(vstride == 0);
225   }
226
227   /* 8. */
228   if (vstride == 0 && hstride == 0) {
229      assert(width == 1);
230   }
231
232   /* 10. Check destination issues. */
233}
234
235void
236brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
237	     struct brw_reg reg)
238{
239   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
240      assert(reg.nr < 128);
241
242   gen7_convert_mrf_to_grf(p, &reg);
243
244   validate_reg(insn, reg);
245
246   insn->bits1.da1.src0_reg_file = reg.file;
247   insn->bits1.da1.src0_reg_type = reg.type;
248   insn->bits2.da1.src0_abs = reg.abs;
249   insn->bits2.da1.src0_negate = reg.negate;
250   insn->bits2.da1.src0_address_mode = reg.address_mode;
251
252   if (reg.file == BRW_IMMEDIATE_VALUE) {
253      insn->bits3.ud = reg.dw1.ud;
254
255      /* Required to set some fields in src1 as well:
256       */
257      insn->bits1.da1.src1_reg_file = 0; /* arf */
258      insn->bits1.da1.src1_reg_type = reg.type;
259   }
260   else
261   {
262      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
263	 if (insn->header.access_mode == BRW_ALIGN_1) {
264	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
265	    insn->bits2.da1.src0_reg_nr = reg.nr;
266	 }
267	 else {
268	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
269	    insn->bits2.da16.src0_reg_nr = reg.nr;
270	 }
271      }
272      else {
273	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
274
275	 if (insn->header.access_mode == BRW_ALIGN_1) {
276	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
277	 }
278	 else {
279	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
280	 }
281      }
282
283      if (insn->header.access_mode == BRW_ALIGN_1) {
284	 if (reg.width == BRW_WIDTH_1 &&
285	     insn->header.execution_size == BRW_EXECUTE_1) {
286	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
287	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
288	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
289	 }
290	 else {
291	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
292	    insn->bits2.da1.src0_width = reg.width;
293	    insn->bits2.da1.src0_vert_stride = reg.vstride;
294	 }
295      }
296      else {
297	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
298	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
299	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
300	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
301
302	 /* This is an oddity of the fact we're using the same
303	  * descriptions for registers in align_16 as align_1:
304	  */
305	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
306	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
307	 else
308	    insn->bits2.da16.src0_vert_stride = reg.vstride;
309      }
310   }
311}
312
313
/**
 * Encode \p reg as source operand 1 of \p insn (bits1/bits3).
 *
 * Mirrors brw_set_src0(), except that src1 cannot be an MRF or use
 * indirect addressing, and an immediate src1 may not be combined with
 * an immediate src0.
 */
void brw_set_src1(struct brw_compile *p,
		  struct brw_instruction *insn,
		  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* The immediate payload occupies the whole of bits3. */
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
	 insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
	 /* Align16: subregister number is in units of 16 bytes. */
	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
	 insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 /* A scalar operand of a scalar instruction is encoded with
	  * the canonical <0;1,0> region.
	  */
	 if (reg.width == BRW_WIDTH_1 &&
	     insn->header.execution_size == BRW_EXECUTE_1) {
	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
	 }
	 else {
	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
	    insn->bits3.da1.src1_width = reg.width;
	    insn->bits3.da1.src1_vert_stride = reg.vstride;
	 }
      }
      else {
	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

	 /* This is an oddity of the fact we're using the same
	  * descriptions for registers in align_16 as align_1:
	  */
	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
	 else
	    insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}
383
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   /* src1 holds the descriptor dword; writing a zero immediate clears
    * all Function Control bits before the per-gen fields are set.
    */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Pre-gen5 layout: SFID lives in the descriptor itself. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
426
/* Fill out the message descriptor for an extended-math SEND.
 *
 * Message and response lengths are inferred from the function: POW and
 * the integer-division variants take two source operands, while SINCOS
 * and INT_DIV_QUOTIENT_AND_REMAINDER produce two results.
 */
static void brw_set_math_message( struct brw_compile *p,
				  struct brw_instruction *insn,
				  GLuint function,
				  GLuint integer_type,
				  bool low_precision,
				  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
			      msg_length, response_length, false, false);
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = insn->header.saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = insn->header.saturate;
      insn->bits3.math.data_type = dataType;
   }
   /* Saturation is encoded in the message descriptor above, so the
    * instruction-level saturate bit is cleared here.
    */
   insn->header.saturate = 0;
}
482
483
/* Fill out the descriptor for an FF_SYNC URB message (gen5 layout).
 * Message length is always 1; only the allocate bit and response
 * length vary, the remaining URB fields are unused by FF_SYNC.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    GLuint response_length,
				    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
499
/* Fill out the descriptor for a URB write message, using the
 * generation-specific field layout (gen7, gen5/6, or pre-gen5).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 bool allocate,
				 bool used,
				 GLuint msg_length,
				 GLuint response_length,
				 bool end_of_thread,
				 bool complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes do not support the transpose swizzle mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
   }
}
540
/**
 * Fill out the descriptor for a data-port write message.
 *
 * Chooses the shared function (SFID) by generation — render cache on
 * gen6, render cache only for RT writes on gen7+, the dedicated
 * dataport-write SFID before that — then encodes the descriptor using
 * the generation's field layout.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
599
/**
 * Fill out the descriptor for a data-port read message.
 *
 * Chooses the shared function (SFID) by generation — the data cache on
 * gen7+, render or sampler cache per \p target_cache on gen6, and the
 * dedicated dataport-read SFID before that — then encodes the
 * descriptor using the generation's field layout.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   /* Reads always carry a header and never terminate the thread. */
   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      true, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
656
/**
 * Fill out the descriptor for a sampler message, using the
 * generation-specific field layout (gen7, gen5/6, g4x, or original).
 *
 * \note return_format is only encoded on the original (pre-g4x)
 *       layout; simd_mode only on gen5+.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
696
697
698#define next_insn brw_next_insn
699struct brw_instruction *
700brw_next_insn(struct brw_compile *p, GLuint opcode)
701{
702   struct brw_instruction *insn;
703
704   if (p->nr_insn + 1 > p->store_size) {
705      if (0)
706         printf("incresing the store size to %d\n", p->store_size << 1);
707      p->store_size <<= 1;
708      p->store = reralloc(p->mem_ctx, p->store,
709                          struct brw_instruction, p->store_size);
710      if (!p->store)
711         assert(!"realloc eu store memeory failed");
712   }
713
714   insn = &p->store[p->nr_insn++];
715   memcpy(insn, p->current, sizeof(*insn));
716
717   /* Reset this one-shot flag:
718    */
719
720   if (p->current->header.destreg__conditionalmod) {
721      p->current->header.destreg__conditionalmod = 0;
722      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
723   }
724
725   insn->header.opcode = opcode;
726   return insn;
727}
728
729static struct brw_instruction *brw_alu1( struct brw_compile *p,
730					 GLuint opcode,
731					 struct brw_reg dest,
732					 struct brw_reg src )
733{
734   struct brw_instruction *insn = next_insn(p, opcode);
735   brw_set_dest(p, insn, dest);
736   brw_set_src0(p, insn, src);
737   return insn;
738}
739
740static struct brw_instruction *brw_alu2(struct brw_compile *p,
741					GLuint opcode,
742					struct brw_reg dest,
743					struct brw_reg src0,
744					struct brw_reg src1 )
745{
746   struct brw_instruction *insn = next_insn(p, opcode);
747   brw_set_dest(p, insn, dest);
748   brw_set_src0(p, insn, src0);
749   brw_set_src1(p, insn, src1);
750   return insn;
751}
752
753static int
754get_3src_subreg_nr(struct brw_reg reg)
755{
756   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
757      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
758      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
759   } else {
760      return reg.subnr / 4;
761   }
762}
763
764static struct brw_instruction *brw_alu3(struct brw_compile *p,
765					GLuint opcode,
766					struct brw_reg dest,
767					struct brw_reg src0,
768					struct brw_reg src1,
769					struct brw_reg src2)
770{
771   struct brw_instruction *insn = next_insn(p, opcode);
772
773   gen7_convert_mrf_to_grf(p, &dest);
774
775   assert(insn->header.access_mode == BRW_ALIGN_16);
776
777   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
778	  dest.file == BRW_MESSAGE_REGISTER_FILE);
779   assert(dest.nr < 128);
780   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
781   assert(dest.type = BRW_REGISTER_TYPE_F);
782   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
783   insn->bits1.da3src.dest_reg_nr = dest.nr;
784   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
785   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
786   guess_execution_size(p, insn, dest);
787
788   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
789   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
790   assert(src0.nr < 128);
791   assert(src0.type == BRW_REGISTER_TYPE_F);
792   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
793   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
794   insn->bits2.da3src.src0_reg_nr = src0.nr;
795   insn->bits1.da3src.src0_abs = src0.abs;
796   insn->bits1.da3src.src0_negate = src0.negate;
797   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
798
799   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
800   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
801   assert(src1.nr < 128);
802   assert(src1.type == BRW_REGISTER_TYPE_F);
803   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
804   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
805   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
806   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
807   insn->bits3.da3src.src1_reg_nr = src1.nr;
808   insn->bits1.da3src.src1_abs = src1.abs;
809   insn->bits1.da3src.src1_negate = src1.negate;
810
811   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
812   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
813   assert(src2.nr < 128);
814   assert(src2.type == BRW_REGISTER_TYPE_F);
815   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
816   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
817   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
818   insn->bits3.da3src.src2_reg_nr = src2.nr;
819   insn->bits1.da3src.src2_abs = src2.abs;
820   insn->bits1.da3src.src2_negate = src2.negate;
821
822   return insn;
823}
824
825
/***********************************************************************
 * Convenience routines.
 *
 * These macros expand into the public brw_<OP>() emitters, each a thin
 * wrapper around the corresponding brw_aluN() helper with the matching
 * BRW_OPCODE_<OP> value.
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
855
856/* Rounding operations (other than RNDD) require two instructions - the first
857 * stores a rounded value (possibly the wrong way) in the dest register, but
858 * also sets a per-channel "increment bit" in the flag register.  A predicated
859 * add of 1.0 fixes dest to contain the desired result.
860 *
861 * Sandybridge and later appear to round correctly without an ADD.
862 */
/* Generate the emitter brw_<OP>(p, dest, src) for a rounding opcode.
 * On pre-gen6 the RNDx instruction's R conditional modifier writes the
 * per-channel "increment" bits to the flag register, which then
 * predicates the fix-up ADD of 1.0 described in the comment above.
 * On gen6+ only the RNDx instruction itself is emitted.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->intel.gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
880
881
/* Instantiate the brw_<OP>() convenience emitters for every simple ALU
 * opcode.  ADD, AVG and MUL are defined by hand below because they
 * carry extra operand-type assertions.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)

ROUND(RNDZ)
ROUND(RNDE)
908
909
910struct brw_instruction *brw_ADD(struct brw_compile *p,
911				struct brw_reg dest,
912				struct brw_reg src0,
913				struct brw_reg src1)
914{
915   /* 6.2.2: add */
916   if (src0.type == BRW_REGISTER_TYPE_F ||
917       (src0.file == BRW_IMMEDIATE_VALUE &&
918	src0.type == BRW_REGISTER_TYPE_VF)) {
919      assert(src1.type != BRW_REGISTER_TYPE_UD);
920      assert(src1.type != BRW_REGISTER_TYPE_D);
921   }
922
923   if (src1.type == BRW_REGISTER_TYPE_F ||
924       (src1.file == BRW_IMMEDIATE_VALUE &&
925	src1.type == BRW_REGISTER_TYPE_VF)) {
926      assert(src0.type != BRW_REGISTER_TYPE_UD);
927      assert(src0.type != BRW_REGISTER_TYPE_D);
928   }
929
930   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
931}
932
933struct brw_instruction *brw_AVG(struct brw_compile *p,
934                                struct brw_reg dest,
935                                struct brw_reg src0,
936                                struct brw_reg src1)
937{
938   assert(dest.type == src0.type);
939   assert(src0.type == src1.type);
940   switch (src0.type) {
941   case BRW_REGISTER_TYPE_B:
942   case BRW_REGISTER_TYPE_UB:
943   case BRW_REGISTER_TYPE_W:
944   case BRW_REGISTER_TYPE_UW:
945   case BRW_REGISTER_TYPE_D:
946   case BRW_REGISTER_TYPE_UD:
947      break;
948   default:
949      assert(!"Bad type for brw_AVG");
950   }
951
952   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
953}
954
955struct brw_instruction *brw_MUL(struct brw_compile *p,
956				struct brw_reg dest,
957				struct brw_reg src0,
958				struct brw_reg src1)
959{
960   /* 6.32.38: mul */
961   if (src0.type == BRW_REGISTER_TYPE_D ||
962       src0.type == BRW_REGISTER_TYPE_UD ||
963       src1.type == BRW_REGISTER_TYPE_D ||
964       src1.type == BRW_REGISTER_TYPE_UD) {
965      assert(dest.type != BRW_REGISTER_TYPE_F);
966   }
967
968   if (src0.type == BRW_REGISTER_TYPE_F ||
969       (src0.file == BRW_IMMEDIATE_VALUE &&
970	src0.type == BRW_REGISTER_TYPE_VF)) {
971      assert(src1.type != BRW_REGISTER_TYPE_UD);
972      assert(src1.type != BRW_REGISTER_TYPE_D);
973   }
974
975   if (src1.type == BRW_REGISTER_TYPE_F ||
976       (src1.file == BRW_IMMEDIATE_VALUE &&
977	src1.type == BRW_REGISTER_TYPE_VF)) {
978      assert(src0.type != BRW_REGISTER_TYPE_UD);
979      assert(src0.type != BRW_REGISTER_TYPE_D);
980   }
981
982   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
983	  src0.nr != BRW_ARF_ACCUMULATOR);
984   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
985	  src1.nr != BRW_ARF_ACCUMULATOR);
986
987   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
988}
989
990
991void brw_NOP(struct brw_compile *p)
992{
993   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
994   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
995   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
996   brw_set_src1(p, insn, brw_imm_ud(0x0));
997}
998
999
1000
1001
1002
1003/***********************************************************************
1004 * Comparisons, if/else/endif
1005 */
1006
1007struct brw_instruction *brw_JMPI(struct brw_compile *p,
1008                                 struct brw_reg dest,
1009                                 struct brw_reg src0,
1010                                 struct brw_reg src1)
1011{
1012   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1013
1014   insn->header.execution_size = 1;
1015   insn->header.compression_control = BRW_COMPRESSION_NONE;
1016   insn->header.mask_control = BRW_MASK_DISABLE;
1017
1018   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1019
1020   return insn;
1021}
1022
1023static void
1024push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1025{
1026   p->if_stack[p->if_stack_depth] = inst - p->store;
1027
1028   p->if_stack_depth++;
1029   if (p->if_stack_array_size <= p->if_stack_depth) {
1030      p->if_stack_array_size *= 2;
1031      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1032			     p->if_stack_array_size);
1033   }
1034}
1035
1036static struct brw_instruction *
1037pop_if_stack(struct brw_compile *p)
1038{
1039   p->if_stack_depth--;
1040   return &p->store[p->if_stack[p->if_stack_depth]];
1041}
1042
1043static void
1044push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1045{
1046   if (p->loop_stack_array_size < p->loop_stack_depth) {
1047      p->loop_stack_array_size *= 2;
1048      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1049			       p->loop_stack_array_size);
1050      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1051				     p->loop_stack_array_size);
1052   }
1053
1054   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1055   p->loop_stack_depth++;
1056   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1057}
1058
1059static struct brw_instruction *
1060get_inner_do_insn(struct brw_compile *p)
1061{
1062   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1063}
1064
1065/* EU takes the value from the flag register and pushes it onto some
1066 * sort of a stack (presumably merging with any flag value already on
1067 * the stack).  Within an if block, the flags at the top of the stack
1068 * control execution on each channel of the unit, eg. on each of the
1069 * 16 pixel values in our wm programs.
1070 *
1071 * When the matching 'else' instruction is reached (presumably by
1072 * countdown of the instruction count patched in by our ELSE/ENDIF
1073 * functions), the relevent flags are inverted.
1074 *
1075 * When the matching 'endif' instruction is reached, the flags are
1076 * popped off.  If the stack is now empty, normal execution resumes.
1077 */
/* Emit an IF instruction and push it onto the if-stack so that the
 * matching brw_ELSE()/brw_ENDIF() can patch its jump target(s) later
 * (see patch_IF_ELSE()).
 *
 * The operand encoding is generation-specific: pre-gen6 IF is an IP
 * operation, gen6 carries its jump count as an immediate word in the
 * destination, and gen7 uses the JIP/UIP fields in bits3.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;   /* patched by brw_ENDIF */
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;   /* patched by brw_ENDIF */
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let the predicate leak onto following instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1118
1119/* This function is only used for gen6-style IF instructions with an
1120 * embedded comparison (conditional modifier).  It is not used on gen7.
1121 */
1122struct brw_instruction *
1123gen6_IF(struct brw_compile *p, uint32_t conditional,
1124	struct brw_reg src0, struct brw_reg src1)
1125{
1126   struct brw_instruction *insn;
1127
1128   insn = next_insn(p, BRW_OPCODE_IF);
1129
1130   brw_set_dest(p, insn, brw_imm_w(0));
1131   if (p->compressed) {
1132      insn->header.execution_size = BRW_EXECUTE_16;
1133   } else {
1134      insn->header.execution_size = BRW_EXECUTE_8;
1135   }
1136   insn->bits1.branch_gen6.jump_count = 0;
1137   brw_set_src0(p, insn, src0);
1138   brw_set_src1(p, insn, src1);
1139
1140   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1141   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1142   insn->header.destreg__conditionalmod = conditional;
1143
1144   if (!p->single_program_flow)
1145      insn->header.thread_control = BRW_THREAD_SWITCH;
1146
1147   push_if_stack(p, insn);
1148   return insn;
1149}
1150
1151/**
1152 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1153 */
/* In SPF mode the mask stack is unused, so IF/ELSE can be replaced by
 * predicated ADDs that move the instruction pointer directly.  Jump
 * distances are in bytes: each instruction is 16 bytes, hence the
 * "* 16" scaling below.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* IF jumps past the ELSE; ELSE (unconditionally) jumps to the end. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1190
1191/**
1192 * Patch IF and ELSE instructions with appropriate jump targets.
1193 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   /* Scale factor converting an instruction count to a jump count. */
   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1280
/* Emit an ELSE instruction and push it onto the if-stack (on top of the
 * matching IF).  Its jump targets are left zero here and patched when
 * brw_ENDIF() calls patch_IF_ELSE().  The operand encoding follows the
 * same per-generation scheme as brw_IF().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;   /* patched by brw_ENDIF */
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;   /* patched by brw_ENDIF */
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1313
/* Close the innermost IF (and optional ELSE) block: emit an ENDIF where
 * needed, pop the if-stack, and patch the jump targets of the matching
 * IF/ELSE via patch_IF_ELSE().  On gen4/5 in SPF mode no ENDIF is
 * emitted; the IF/ELSE are rewritten into predicated ADDs instead.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encoding, as in brw_IF()/brw_ELSE(). */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1392
1393struct brw_instruction *brw_BREAK(struct brw_compile *p)
1394{
1395   struct intel_context *intel = &p->brw->intel;
1396   struct brw_instruction *insn;
1397
1398   insn = next_insn(p, BRW_OPCODE_BREAK);
1399   if (intel->gen >= 6) {
1400      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1401      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1402      brw_set_src1(p, insn, brw_imm_d(0x0));
1403   } else {
1404      brw_set_dest(p, insn, brw_ip_reg());
1405      brw_set_src0(p, insn, brw_ip_reg());
1406      brw_set_src1(p, insn, brw_imm_d(0x0));
1407      insn->bits3.if_else.pad0 = 0;
1408      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1409   }
1410   insn->header.compression_control = BRW_COMPRESSION_NONE;
1411   insn->header.execution_size = BRW_EXECUTE_8;
1412
1413   return insn;
1414}
1415
1416struct brw_instruction *gen6_CONT(struct brw_compile *p)
1417{
1418   struct brw_instruction *insn;
1419
1420   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1421   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1422   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1423   brw_set_dest(p, insn, brw_ip_reg());
1424   brw_set_src0(p, insn, brw_ip_reg());
1425   brw_set_src1(p, insn, brw_imm_d(0x0));
1426
1427   insn->header.compression_control = BRW_COMPRESSION_NONE;
1428   insn->header.execution_size = BRW_EXECUTE_8;
1429   return insn;
1430}
1431
/* Emit a pre-gen6 CONTINUE.  The jump target is left at zero here and
 * patched later by brw_patch_break_cont(); pop_count unwinds the mask
 * stack for any IF blocks open inside the loop.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1446
1447/* DO/WHILE loop:
1448 *
1449 * The DO/WHILE is just an unterminated loop -- break or continue are
1450 * used for control within the loop.  We have a few ways they can be
1451 * done.
1452 *
1453 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1454 * jip and no DO instruction.
1455 *
1456 * For non-uniform control flow pre-gen6, there's a DO instruction to
1457 * push the mask, and a WHILE to jump back, and BREAK to get out and
1458 * pop the mask.
1459 *
1460 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1461 * just points back to the first instruction of the loop.
1462 */
/* Open a loop.  On gen6+ (and in SPF mode) no DO instruction exists;
 * the position of the loop head is simply recorded on the loop stack
 * so that brw_WHILE() can jump back to it.  The returned pointer is
 * only valid until the next next_insn() call, since the store may be
 * reallocated.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1490
1491/**
1492 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1493 * instruction here.
1494 *
1495 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1496 * nesting, since it can always just point to the end of the block/current loop.
1497 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Jump counts are in 64-bit chunks on gen5 (2 per instruction). */
   int br = (intel->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to the matching DO, patching every
    * unpatched BREAK/CONT in between.  BREAK jumps past the WHILE;
    * CONT jumps to the WHILE itself.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1520
/* Close the innermost loop opened by brw_DO(): emit the WHILE (or, in
 * pre-gen6 SPF mode, an ADD on IP) with a backwards jump to the loop
 * head, patch any BREAK/CONT inside the loop on pre-gen6, and pop the
 * loop stack.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   /* Jump counts are in 64-bit chunks on gen5+ (2 per instruction). */
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      /* gen7: backwards jump goes in the JIP field. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      /* gen6: backwards jump goes in the gen6 branch jump count. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF: a plain ADD on IP jumps back (16 bytes per instruction). */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Point the loop's unpatched BREAK/CONT at this WHILE. */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1584
1585
1586/* FORWARD JUMPS:
1587 */
1588void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1589{
1590   struct intel_context *intel = &p->brw->intel;
1591   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1592   GLuint jmpi = 1;
1593
1594   if (intel->gen >= 5)
1595      jmpi = 2;
1596
1597   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1598   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1599
1600   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1601}
1602
1603
1604
1605/* To integrate with the above, it makes sense that the comparison
1606 * instruction should populate the flag register.  It might be simpler
1607 * just to use the flag reg for most WM tasks?
1608 */
/* Emit a CMP that writes per-channel results of "src0 <conditional>
 * src1" to both the flag register and, unless dest is null, the
 * destination.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   /* dest.nr == 0 in the ARF file is the null register. */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}
1635
1636/* Issue 'wait' instruction for n1, host could program MMIO
1637   to wake up thread. */
1638void brw_WAIT (struct brw_compile *p)
1639{
1640   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1641   struct brw_reg src = brw_notification_1_reg();
1642
1643   brw_set_dest(p, insn, src);
1644   brw_set_src0(p, insn, src);
1645   brw_set_src1(p, insn, brw_null_reg());
1646   insn->header.execution_size = 0; /* must */
1647   insn->header.predicate_control = 0;
1648   insn->header.compression_control = 0;
1649}
1650
1651
1652/***********************************************************************
1653 * Helpers for the various SEND message types:
1654 */
1655
/** Extended math function, float[8].
 *
 * On gen6+ this is a native MATH instruction (the conditional-modifier
 * and thread-control fields are repurposed as the function code); on
 * earlier parts it is a SEND to the shared extended-math unit.
 *
 * \param function    BRW_MATH_FUNCTION_* selector.
 * \param msg_reg_nr  message register for the pre-gen6 SEND (ignored on gen6+).
 * \param data_type   pre-gen6 message layout (scalar/vector); ignored on gen6+.
 * \param precision   pre-gen6 precision selector; ignored on gen6+.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* The integer-divide functions take integer sources; everything
       * else operates on floats.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1719
/** Two-source extended math function, float[8].
 *
 * Emits a Gen6+ MATH instruction.  The math function number is packed
 * into the destreg/conditionalmod field, so extended math can never
 * carry a conditional modifier.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   /* The two-source MATH opcode only exists on Gen6+; older parts use
    * math messages instead (see the SEND paths in brw_math/brw_math_16).
    */
   assert(intel->gen >= 6);
   (void) intel;  /* avoid an unused-variable warning in release builds */


   /* Math instructions can only operate on GRFs. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (intel->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* The integer-division functions take integer sources; every other
    * math function operates on floats.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (intel->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1772
1773/**
1774 * Extended math function, float[16].
1775 * Use 2 send instructions.
1776 */
1777void brw_math_16( struct brw_compile *p,
1778		  struct brw_reg dest,
1779		  GLuint function,
1780		  GLuint msg_reg_nr,
1781		  struct brw_reg src,
1782		  GLuint precision )
1783{
1784   struct intel_context *intel = &p->brw->intel;
1785   struct brw_instruction *insn;
1786
1787   if (intel->gen >= 6) {
1788      insn = next_insn(p, BRW_OPCODE_MATH);
1789
1790      /* Math is the same ISA format as other opcodes, except that CondModifier
1791       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1792       */
1793      insn->header.destreg__conditionalmod = function;
1794
1795      /* Source modifiers are ignored for extended math instructions. */
1796      assert(!src.negate);
1797      assert(!src.abs);
1798
1799      brw_set_dest(p, insn, dest);
1800      brw_set_src0(p, insn, src);
1801      brw_set_src1(p, insn, brw_null_reg());
1802      return;
1803   }
1804
1805   /* First instruction:
1806    */
1807   brw_push_insn_state(p);
1808   brw_set_predicate_control_flag_value(p, 0xff);
1809   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1810
1811   insn = next_insn(p, BRW_OPCODE_SEND);
1812   insn->header.destreg__conditionalmod = msg_reg_nr;
1813
1814   brw_set_dest(p, insn, dest);
1815   brw_set_src0(p, insn, src);
1816   brw_set_math_message(p,
1817			insn,
1818			function,
1819			BRW_MATH_INTEGER_UNSIGNED,
1820			precision,
1821			BRW_MATH_DATA_VECTOR);
1822
1823   /* Second instruction:
1824    */
1825   insn = next_insn(p, BRW_OPCODE_SEND);
1826   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1827   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1828
1829   brw_set_dest(p, insn, offset(dest,1));
1830   brw_set_src0(p, insn, src);
1831   brw_set_math_message(p,
1832			insn,
1833			function,
1834			BRW_MATH_INTEGER_UNSIGNED,
1835			precision,
1836			BRW_MATH_DATA_VECTOR);
1837
1838   brw_pop_insn_state(p);
1839}
1840
1841
1842/**
1843 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1844 * using a constant offset per channel.
1845 *
1846 * The offset must be aligned to oword size (16 bytes).  Used for
1847 * register spilling.
1848 */
1849void brw_oword_block_write_scratch(struct brw_compile *p,
1850				   struct brw_reg mrf,
1851				   int num_regs,
1852				   GLuint offset)
1853{
1854   struct intel_context *intel = &p->brw->intel;
1855   uint32_t msg_control, msg_type;
1856   int mlen;
1857
1858   if (intel->gen >= 6)
1859      offset /= 16;
1860
1861   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1862
1863   if (num_regs == 1) {
1864      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1865      mlen = 2;
1866   } else {
1867      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1868      mlen = 3;
1869   }
1870
1871   /* Set up the message header.  This is g0, with g0.2 filled with
1872    * the offset.  We don't want to leave our offset around in g0 or
1873    * it'll screw up texture samples, so set it up inside the message
1874    * reg.
1875    */
1876   {
1877      brw_push_insn_state(p);
1878      brw_set_mask_control(p, BRW_MASK_DISABLE);
1879      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1880
1881      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1882
1883      /* set message header global offset field (reg 0, element 2) */
1884      brw_MOV(p,
1885	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1886				  mrf.nr,
1887				  2), BRW_REGISTER_TYPE_UD),
1888	      brw_imm_ud(offset));
1889
1890      brw_pop_insn_state(p);
1891   }
1892
1893   {
1894      struct brw_reg dest;
1895      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1896      int send_commit_msg;
1897      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1898					 BRW_REGISTER_TYPE_UW);
1899
1900      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1901	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1902	 src_header = vec16(src_header);
1903      }
1904      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1905      insn->header.destreg__conditionalmod = mrf.nr;
1906
1907      /* Until gen6, writes followed by reads from the same location
1908       * are not guaranteed to be ordered unless write_commit is set.
1909       * If set, then a no-op write is issued to the destination
1910       * register to set a dependency, and a read from the destination
1911       * can be used to ensure the ordering.
1912       *
1913       * For gen6, only writes between different threads need ordering
1914       * protection.  Our use of DP writes is all about register
1915       * spilling within a thread.
1916       */
1917      if (intel->gen >= 6) {
1918	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1919	 send_commit_msg = 0;
1920      } else {
1921	 dest = src_header;
1922	 send_commit_msg = 1;
1923      }
1924
1925      brw_set_dest(p, insn, dest);
1926      if (intel->gen >= 6) {
1927	 brw_set_src0(p, insn, mrf);
1928      } else {
1929	 brw_set_src0(p, insn, brw_null_reg());
1930      }
1931
1932      if (intel->gen >= 6)
1933	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1934      else
1935	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1936
1937      brw_set_dp_write_message(p,
1938			       insn,
1939			       255, /* binding table index (255=stateless) */
1940			       msg_control,
1941			       msg_type,
1942			       mlen,
1943			       true, /* header_present */
1944			       0, /* not a render target */
1945			       send_commit_msg, /* response_length */
1946			       0, /* eot */
1947			       send_commit_msg);
1948   }
1949}
1950
1951
1952/**
1953 * Read a block of owords (half a GRF each) from the scratch buffer
1954 * using a constant index per channel.
1955 *
1956 * Offset must be aligned to oword size (16 bytes).  Used for register
1957 * spilling.
1958 */
1959void
1960brw_oword_block_read_scratch(struct brw_compile *p,
1961			     struct brw_reg dest,
1962			     struct brw_reg mrf,
1963			     int num_regs,
1964			     GLuint offset)
1965{
1966   struct intel_context *intel = &p->brw->intel;
1967   uint32_t msg_control;
1968   int rlen;
1969
1970   if (intel->gen >= 6)
1971      offset /= 16;
1972
1973   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1974   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1975
1976   if (num_regs == 1) {
1977      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1978      rlen = 1;
1979   } else {
1980      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1981      rlen = 2;
1982   }
1983
1984   {
1985      brw_push_insn_state(p);
1986      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1987      brw_set_mask_control(p, BRW_MASK_DISABLE);
1988
1989      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1990
1991      /* set message header global offset field (reg 0, element 2) */
1992      brw_MOV(p,
1993	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1994				  mrf.nr,
1995				  2), BRW_REGISTER_TYPE_UD),
1996	      brw_imm_ud(offset));
1997
1998      brw_pop_insn_state(p);
1999   }
2000
2001   {
2002      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2003
2004      assert(insn->header.predicate_control == 0);
2005      insn->header.compression_control = BRW_COMPRESSION_NONE;
2006      insn->header.destreg__conditionalmod = mrf.nr;
2007
2008      brw_set_dest(p, insn, dest);	/* UW? */
2009      if (intel->gen >= 6) {
2010	 brw_set_src0(p, insn, mrf);
2011      } else {
2012	 brw_set_src0(p, insn, brw_null_reg());
2013      }
2014
2015      brw_set_dp_read_message(p,
2016			      insn,
2017			      255, /* binding table index (255=stateless) */
2018			      msg_control,
2019			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2020			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2021			      1, /* msg_length */
2022			      rlen);
2023   }
2024}
2025
2026/**
2027 * Read a float[4] vector from the data port Data Cache (const buffer).
2028 * Location (in buffer) should be a multiple of 16.
2029 * Used for fetching shader constants.
2030 */
2031void brw_oword_block_read(struct brw_compile *p,
2032			  struct brw_reg dest,
2033			  struct brw_reg mrf,
2034			  uint32_t offset,
2035			  uint32_t bind_table_index)
2036{
2037   struct intel_context *intel = &p->brw->intel;
2038
2039   /* On newer hardware, offset is in units of owords. */
2040   if (intel->gen >= 6)
2041      offset /= 16;
2042
2043   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2044
2045   brw_push_insn_state(p);
2046   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2047   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2048   brw_set_mask_control(p, BRW_MASK_DISABLE);
2049
2050   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2051
2052   /* set message header global offset field (reg 0, element 2) */
2053   brw_MOV(p,
2054	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2055			       mrf.nr,
2056			       2), BRW_REGISTER_TYPE_UD),
2057	   brw_imm_ud(offset));
2058
2059   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2060   insn->header.destreg__conditionalmod = mrf.nr;
2061
2062   /* cast dest to a uword[8] vector */
2063   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2064
2065   brw_set_dest(p, insn, dest);
2066   if (intel->gen >= 6) {
2067      brw_set_src0(p, insn, mrf);
2068   } else {
2069      brw_set_src0(p, insn, brw_null_reg());
2070   }
2071
2072   brw_set_dp_read_message(p,
2073			   insn,
2074			   bind_table_index,
2075			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2076			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2077			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2078			   1, /* msg_length */
2079			   1); /* response_length (1 reg, 2 owords!) */
2080
2081   brw_pop_insn_state(p);
2082}
2083
2084/**
2085 * Read a set of dwords from the data port Data Cache (const buffer).
2086 *
2087 * Location (in buffer) appears as UD offsets in the register after
2088 * the provided mrf header reg.
2089 */
2090void brw_dword_scattered_read(struct brw_compile *p,
2091			      struct brw_reg dest,
2092			      struct brw_reg mrf,
2093			      uint32_t bind_table_index)
2094{
2095   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2096
2097   brw_push_insn_state(p);
2098   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2099   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2100   brw_set_mask_control(p, BRW_MASK_DISABLE);
2101   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2102   brw_pop_insn_state(p);
2103
2104   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2105   insn->header.destreg__conditionalmod = mrf.nr;
2106
2107   /* cast dest to a uword[8] vector */
2108   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2109
2110   brw_set_dest(p, insn, dest);
2111   brw_set_src0(p, insn, brw_null_reg());
2112
2113   brw_set_dp_read_message(p,
2114			   insn,
2115			   bind_table_index,
2116			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
2117			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
2118			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2119			   2, /* msg_length */
2120			   1); /* response_length */
2121}
2122
2123
2124
2125/**
2126 * Read float[4] constant(s) from VS constant buffer.
2127 * For relative addressing, two float[4] constants will be read into 'dest'.
2128 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
2129 */
2130void brw_dp_READ_4_vs(struct brw_compile *p,
2131                      struct brw_reg dest,
2132                      GLuint location,
2133                      GLuint bind_table_index)
2134{
2135   struct intel_context *intel = &p->brw->intel;
2136   struct brw_instruction *insn;
2137   GLuint msg_reg_nr = 1;
2138
2139   if (intel->gen >= 6)
2140      location /= 16;
2141
2142   /* Setup MRF[1] with location/offset into const buffer */
2143   brw_push_insn_state(p);
2144   brw_set_access_mode(p, BRW_ALIGN_1);
2145   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2146   brw_set_mask_control(p, BRW_MASK_DISABLE);
2147   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2148   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
2149		     BRW_REGISTER_TYPE_UD),
2150	   brw_imm_ud(location));
2151   brw_pop_insn_state(p);
2152
2153   insn = next_insn(p, BRW_OPCODE_SEND);
2154
2155   insn->header.predicate_control = BRW_PREDICATE_NONE;
2156   insn->header.compression_control = BRW_COMPRESSION_NONE;
2157   insn->header.destreg__conditionalmod = msg_reg_nr;
2158   insn->header.mask_control = BRW_MASK_DISABLE;
2159
2160   brw_set_dest(p, insn, dest);
2161   if (intel->gen >= 6) {
2162      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
2163   } else {
2164      brw_set_src0(p, insn, brw_null_reg());
2165   }
2166
2167   brw_set_dp_read_message(p,
2168			   insn,
2169			   bind_table_index,
2170			   0,
2171			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2172			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2173			   1, /* msg_length */
2174			   1); /* response_length (1 Oword) */
2175}
2176
2177/**
2178 * Read a float[4] constant per vertex from VS constant buffer, with
2179 * relative addressing.
2180 */
2181void brw_dp_READ_4_vs_relative(struct brw_compile *p,
2182			       struct brw_reg dest,
2183			       struct brw_reg addr_reg,
2184			       GLuint offset,
2185			       GLuint bind_table_index)
2186{
2187   struct intel_context *intel = &p->brw->intel;
2188   struct brw_reg src = brw_vec8_grf(0, 0);
2189   int msg_type;
2190
2191   /* Setup MRF[1] with offset into const buffer */
2192   brw_push_insn_state(p);
2193   brw_set_access_mode(p, BRW_ALIGN_1);
2194   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2195   brw_set_mask_control(p, BRW_MASK_DISABLE);
2196   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2197
2198   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2199    * fields ignored.
2200    */
2201   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2202	   addr_reg, brw_imm_d(offset));
2203   brw_pop_insn_state(p);
2204
2205   gen6_resolve_implied_move(p, &src, 0);
2206   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2207
2208   insn->header.predicate_control = BRW_PREDICATE_NONE;
2209   insn->header.compression_control = BRW_COMPRESSION_NONE;
2210   insn->header.destreg__conditionalmod = 0;
2211   insn->header.mask_control = BRW_MASK_DISABLE;
2212
2213   brw_set_dest(p, insn, dest);
2214   brw_set_src0(p, insn, src);
2215
2216   if (intel->gen >= 6)
2217      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2218   else if (intel->gen == 5 || intel->is_g4x)
2219      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2220   else
2221      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2222
2223   brw_set_dp_read_message(p,
2224			   insn,
2225			   bind_table_index,
2226			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2227			   msg_type,
2228			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2229			   2, /* msg_length */
2230			   1); /* response_length */
2231}
2232
2233
2234
/**
 * Emit a render target write message (fragment color output).
 *
 * dispatch_width selects an 8- or 16-wide null destination.  On gen6+
 * a SENDC opcode is used and the color payload is submitted headerless
 * straight from msg_reg_nr; earlier generations use SEND with the
 * message register named in destreg__conditionalmod.  'eot' marks this
 * as the thread's last render target write and ends the thread.
 *
 * NOTE(review): the SENDC choice on gen6+ presumably provides
 * render-target dependency checking — confirm against the PRM.
 */
void brw_fb_WRITE(struct brw_compile *p,
		  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint msg_control,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  bool eot,
                  bool header_present)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_type;
   struct brw_reg dest;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (intel->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    msg_length,
			    header_present,
			    eot, /* last render target write */
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2290
2291
2292/**
2293 * Texture sample instruction.
2294 * Note: the msg_type plus msg_length values determine exactly what kind
2295 * of sampling operation is performed.  See volume 4, page 161 of docs.
2296 */
2297void brw_SAMPLE(struct brw_compile *p,
2298		struct brw_reg dest,
2299		GLuint msg_reg_nr,
2300		struct brw_reg src0,
2301		GLuint binding_table_index,
2302		GLuint sampler,
2303		GLuint writemask,
2304		GLuint msg_type,
2305		GLuint response_length,
2306		GLuint msg_length,
2307		GLuint header_present,
2308		GLuint simd_mode,
2309		GLuint return_format)
2310{
2311   struct intel_context *intel = &p->brw->intel;
2312   bool need_stall = 0;
2313
2314   if (writemask == 0) {
2315      /*printf("%s: zero writemask??\n", __FUNCTION__); */
2316      return;
2317   }
2318
2319   /* Hardware doesn't do destination dependency checking on send
2320    * instructions properly.  Add a workaround which generates the
2321    * dependency by other means.  In practice it seems like this bug
2322    * only crops up for texture samples, and only where registers are
2323    * written by the send and then written again later without being
2324    * read in between.  Luckily for us, we already track that
2325    * information and use it to modify the writemask for the
2326    * instruction, so that is a guide for whether a workaround is
2327    * needed.
2328    */
2329   if (writemask != WRITEMASK_XYZW) {
2330      GLuint dst_offset = 0;
2331      GLuint i, newmask = 0, len = 0;
2332
2333      for (i = 0; i < 4; i++) {
2334	 if (writemask & (1<<i))
2335	    break;
2336	 dst_offset += 2;
2337      }
2338      for (; i < 4; i++) {
2339	 if (!(writemask & (1<<i)))
2340	    break;
2341	 newmask |= 1<<i;
2342	 len++;
2343      }
2344
2345      if (newmask != writemask) {
2346	 need_stall = 1;
2347         /* printf("need stall %x %x\n", newmask , writemask); */
2348      }
2349      else {
2350	 bool dispatch_16 = false;
2351
2352	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2353
2354	 guess_execution_size(p, p->current, dest);
2355	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2356	    dispatch_16 = true;
2357
2358	 newmask = ~newmask & WRITEMASK_XYZW;
2359
2360	 brw_push_insn_state(p);
2361
2362	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2363	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2364
2365	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2366		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2367  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2368
2369	 brw_pop_insn_state(p);
2370
2371  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2372	 dest = offset(dest, dst_offset);
2373
2374	 /* For 16-wide dispatch, masked channels are skipped in the
2375	  * response.  For 8-wide, masked channels still take up slots,
2376	  * and are just not written to.
2377	  */
2378	 if (dispatch_16)
2379	    response_length = len * 2;
2380      }
2381   }
2382
2383   {
2384      struct brw_instruction *insn;
2385
2386      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2387
2388      insn = next_insn(p, BRW_OPCODE_SEND);
2389      insn->header.predicate_control = 0; /* XXX */
2390      insn->header.compression_control = BRW_COMPRESSION_NONE;
2391      if (intel->gen < 6)
2392	  insn->header.destreg__conditionalmod = msg_reg_nr;
2393
2394      brw_set_dest(p, insn, dest);
2395      brw_set_src0(p, insn, src0);
2396      brw_set_sampler_message(p, insn,
2397			      binding_table_index,
2398			      sampler,
2399			      msg_type,
2400			      response_length,
2401			      msg_length,
2402			      header_present,
2403			      simd_mode,
2404			      return_format);
2405   }
2406
2407   if (need_stall) {
2408      struct brw_reg reg = vec8(offset(dest, response_length-1));
2409
2410      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2411       */
2412      brw_push_insn_state(p);
2413      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2414      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2415	      retype(reg, BRW_REGISTER_TYPE_UD));
2416      brw_pop_insn_state(p);
2417   }
2418
2419}
2420
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 *
 * Emits a SEND carrying a URB write message; the msg_length payload
 * registers starting at msg_reg_nr are written to the URB.  'eot'
 * ends the thread; see brw_set_urb_message for the remaining flags.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   bool used,
		   GLuint msg_length,
		   GLuint response_length,
		   bool eot,
		   bool writes_complete,
		   GLuint offset,
		   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   /* Gen6+ has no implied move of src0 into the message registers;
    * make it explicit if needed.
    */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the send reads its payload from the MRF number stored
    * in destreg__conditionalmod.
    */
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       allocate,
		       used,
		       msg_length,
		       response_length,
		       eot,
		       writes_complete,
		       offset,
		       swizzle);
}
2476
2477static int
2478brw_find_next_block_end(struct brw_compile *p, int start)
2479{
2480   int ip;
2481
2482   for (ip = start + 1; ip < p->nr_insn; ip++) {
2483      struct brw_instruction *insn = &p->store[ip];
2484
2485      switch (insn->header.opcode) {
2486      case BRW_OPCODE_ENDIF:
2487      case BRW_OPCODE_ELSE:
2488      case BRW_OPCODE_WHILE:
2489	 return ip;
2490      }
2491   }
2492   assert(!"not reached");
2493   return start + 1;
2494}
2495
2496/* There is no DO instruction on gen6, so to find the end of the loop
2497 * we have to see if the loop is jumping back before our start
2498 * instruction.
2499 */
2500static int
2501brw_find_loop_end(struct brw_compile *p, int start)
2502{
2503   struct intel_context *intel = &p->brw->intel;
2504   int ip;
2505   int br = 2;
2506
2507   for (ip = start + 1; ip < p->nr_insn; ip++) {
2508      struct brw_instruction *insn = &p->store[ip];
2509
2510      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2511	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2512				   : insn->bits3.break_cont.jip;
2513	 if (ip + jip / br <= start)
2514	    return ip;
2515      }
2516   }
2517   assert(!"not reached");
2518   return start + 1;
2519}
2520
/* After program generation, go back and update the UIP and JIP of
 * BREAK and CONT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int br = 2;	/* scale factor between instruction counts and encoded offsets */

   /* UIP/JIP patching only applies to gen6+. */
   if (intel->gen < 6)
      return;

   for (ip = 0; ip < p->nr_insn; ip++) {
      struct brw_instruction *insn = &p->store[ip];

      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
	 /* JIP: distance to the end of the current block. */
	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
	 insn->bits3.break_cont.uip =
	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
	 break;
      case BRW_OPCODE_CONTINUE:
	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);

	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2553}
2554
/**
 * Emit a SEND carrying an FF_SYNC URB message.
 *
 * The message payload starts at msg_reg_nr; 'eot' ends the thread.
 * NOTE(review): the precise FF_SYNC semantics (handle allocation via
 * 'allocate', etc.) are encoded by brw_set_ff_sync_message — see there.
 */
void brw_ff_sync(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   GLuint response_length,
		   bool eot)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   /* Gen6+ has no implied move of src0 into the message registers;
    * make it explicit if needed.
    */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the send reads its payload from the MRF number stored
    * in destreg__conditionalmod.
    */
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2582
2583/**
2584 * Emit the SEND instruction necessary to generate stream output data on Gen6
2585 * (for transform feedback).
2586 *
2587 * If send_commit_msg is true, this is the last piece of stream output data
2588 * from this thread, so send the data as a committed write.  According to the
2589 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2590 *
2591 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2592 *   writes are complete by sending the final write as a committed write."
2593 */
2594void
2595brw_svb_write(struct brw_compile *p,
2596              struct brw_reg dest,
2597              GLuint msg_reg_nr,
2598              struct brw_reg src0,
2599              GLuint binding_table_index,
2600              bool   send_commit_msg)
2601{
2602   struct brw_instruction *insn;
2603
2604   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2605
2606   insn = next_insn(p, BRW_OPCODE_SEND);
2607   brw_set_dest(p, insn, dest);
2608   brw_set_src0(p, insn, src0);
2609   brw_set_src1(p, insn, brw_imm_d(0));
2610   brw_set_dp_write_message(p, insn,
2611                            binding_table_index,
2612                            0, /* msg_control: ignored */
2613                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2614                            1, /* msg_length */
2615                            true, /* header_present */
2616                            0, /* last_render_target: ignored */
2617                            send_commit_msg, /* response_length */
2618                            0, /* end_of_thread */
2619                            send_commit_msg); /* send_commit_msg */
2620}
2621