brw_eu_emit.c revision 172bb92db1a3c317867d9cfec6f15c09c37a0f6c
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  GLuint msg_reg_nr)
65{
66   struct intel_context *intel = &p->brw->intel;
67   if (intel->gen < 6)
68      return;
69
70   if (src->file == BRW_MESSAGE_REGISTER_FILE)
71      return;
72
73   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74      brw_push_insn_state(p);
75      brw_set_mask_control(p, BRW_MASK_DISABLE);
76      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78	      retype(*src, BRW_REGISTER_TYPE_UD));
79      brw_pop_insn_state(p);
80   }
81   *src = brw_message_reg(msg_reg_nr);
82}
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   /* From the BSpec / ISA Reference / send - [DevIVB+]:
88    * "The send with EOT should use register space R112-R127 for <src>. This is
89    *  to enable loading of a new thread into the same slot while the message
90    *  with EOT for current thread is pending dispatch."
91    *
92    * Since we're pretending to have 16 MRFs anyway, we may as well use the
93    * registers required for messages with EOT.
94    */
95   struct intel_context *intel = &p->brw->intel;
96   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97      reg->file = BRW_GENERAL_REGISTER_FILE;
98      reg->nr += GEN7_MRF_HACK_START;
99   }
100}
101
102
/**
 * Encode the destination operand fields of \p insn from \p dest.
 *
 * Also sets the instruction's execution size from the destination width
 * (see guess_execution_size), so this should be called before any code
 * that depends on the final execution size.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* GRF/immediate-style files have a 7-bit register number; ARF and MRF
    * use their own encodings and are not range-checked here.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On Gen7, MRFs are actually GRFs — remap before encoding. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* Horizontal stride 0 is not valid for a destination; promote
	  * a scalar region to stride 1.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16: subregister number is counted in 16-byte units and
	  * the write is controlled by the writemask.
	  */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      /* Register-indirect addressing. */
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
156
157extern int reg_type_size[];
158
159static void
160validate_reg(struct brw_instruction *insn, struct brw_reg reg)
161{
162   int hstride_for_reg[] = {0, 1, 2, 4};
163   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
164   int width_for_reg[] = {1, 2, 4, 8, 16};
165   int execsize_for_reg[] = {1, 2, 4, 8, 16};
166   int width, hstride, vstride, execsize;
167
168   if (reg.file == BRW_IMMEDIATE_VALUE) {
169      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
170       * mean the destination has to be 128-bit aligned and the
171       * destination horiz stride has to be a word.
172       */
173      if (reg.type == BRW_REGISTER_TYPE_V) {
174	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
175		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
176      }
177
178      return;
179   }
180
181   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
182       reg.file == BRW_ARF_NULL)
183      return;
184
185   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
186   hstride = hstride_for_reg[reg.hstride];
187
188   if (reg.vstride == 0xf) {
189      vstride = -1;
190   } else {
191      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
192      vstride = vstride_for_reg[reg.vstride];
193   }
194
195   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
196   width = width_for_reg[reg.width];
197
198   assert(insn->header.execution_size >= 0 &&
199	  insn->header.execution_size < Elements(execsize_for_reg));
200   execsize = execsize_for_reg[insn->header.execution_size];
201
202   /* Restrictions from 3.3.10: Register Region Restrictions. */
203   /* 3. */
204   assert(execsize >= width);
205
206   /* 4. */
207   if (execsize == width && hstride != 0) {
208      assert(vstride == -1 || vstride == width * hstride);
209   }
210
211   /* 5. */
212   if (execsize == width && hstride == 0) {
213      /* no restriction on vstride. */
214   }
215
216   /* 6. */
217   if (width == 1) {
218      assert(hstride == 0);
219   }
220
221   /* 7. */
222   if (execsize == 1 && width == 1) {
223      assert(hstride == 0);
224      assert(vstride == 0);
225   }
226
227   /* 8. */
228   if (vstride == 0 && hstride == 0) {
229      assert(width == 1);
230   }
231
232   /* 10. Check destination issues. */
233}
234
235void
236brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
237	     struct brw_reg reg)
238{
239   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
240      assert(reg.nr < 128);
241
242   gen7_convert_mrf_to_grf(p, &reg);
243
244   validate_reg(insn, reg);
245
246   insn->bits1.da1.src0_reg_file = reg.file;
247   insn->bits1.da1.src0_reg_type = reg.type;
248   insn->bits2.da1.src0_abs = reg.abs;
249   insn->bits2.da1.src0_negate = reg.negate;
250   insn->bits2.da1.src0_address_mode = reg.address_mode;
251
252   if (reg.file == BRW_IMMEDIATE_VALUE) {
253      insn->bits3.ud = reg.dw1.ud;
254
255      /* Required to set some fields in src1 as well:
256       */
257      insn->bits1.da1.src1_reg_file = 0; /* arf */
258      insn->bits1.da1.src1_reg_type = reg.type;
259   }
260   else
261   {
262      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
263	 if (insn->header.access_mode == BRW_ALIGN_1) {
264	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
265	    insn->bits2.da1.src0_reg_nr = reg.nr;
266	 }
267	 else {
268	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
269	    insn->bits2.da16.src0_reg_nr = reg.nr;
270	 }
271      }
272      else {
273	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
274
275	 if (insn->header.access_mode == BRW_ALIGN_1) {
276	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
277	 }
278	 else {
279	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
280	 }
281      }
282
283      if (insn->header.access_mode == BRW_ALIGN_1) {
284	 if (reg.width == BRW_WIDTH_1 &&
285	     insn->header.execution_size == BRW_EXECUTE_1) {
286	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
287	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
288	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
289	 }
290	 else {
291	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
292	    insn->bits2.da1.src0_width = reg.width;
293	    insn->bits2.da1.src0_vert_stride = reg.vstride;
294	 }
295      }
296      else {
297	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
298	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
299	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
300	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
301
302	 /* This is an oddity of the fact we're using the same
303	  * descriptions for registers in align_16 as align_1:
304	  */
305	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
306	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
307	 else
308	    insn->bits2.da16.src0_vert_stride = reg.vstride;
309      }
310   }
311}
312
313
314void brw_set_src1(struct brw_compile *p,
315		  struct brw_instruction *insn,
316		  struct brw_reg reg)
317{
318   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
319
320   assert(reg.nr < 128);
321
322   gen7_convert_mrf_to_grf(p, &reg);
323
324   validate_reg(insn, reg);
325
326   insn->bits1.da1.src1_reg_file = reg.file;
327   insn->bits1.da1.src1_reg_type = reg.type;
328   insn->bits3.da1.src1_abs = reg.abs;
329   insn->bits3.da1.src1_negate = reg.negate;
330
331   /* Only src1 can be immediate in two-argument instructions.
332    */
333   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
334
335   if (reg.file == BRW_IMMEDIATE_VALUE) {
336      insn->bits3.ud = reg.dw1.ud;
337   }
338   else {
339      /* This is a hardware restriction, which may or may not be lifted
340       * in the future:
341       */
342      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
343      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
344
345      if (insn->header.access_mode == BRW_ALIGN_1) {
346	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
347	 insn->bits3.da1.src1_reg_nr = reg.nr;
348      }
349      else {
350	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
351	 insn->bits3.da16.src1_reg_nr = reg.nr;
352      }
353
354      if (insn->header.access_mode == BRW_ALIGN_1) {
355	 if (reg.width == BRW_WIDTH_1 &&
356	     insn->header.execution_size == BRW_EXECUTE_1) {
357	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
358	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
359	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
360	 }
361	 else {
362	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
363	    insn->bits3.da1.src1_width = reg.width;
364	    insn->bits3.da1.src1_vert_stride = reg.vstride;
365	 }
366      }
367      else {
368	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
369	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
370	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
371	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
372
373	 /* This is an oddity of the fact we're using the same
374	  * descriptions for registers in align_16 as align_1:
375	  */
376	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
377	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
378	 else
379	    insn->bits3.da16.src1_vert_stride = reg.vstride;
380      }
381   }
382}
383
384/**
385 * Set the Message Descriptor and Extended Message Descriptor fields
386 * for SEND messages.
387 *
388 * \note This zeroes out the Function Control bits, so it must be called
389 *       \b before filling out any message-specific data.  Callers can
390 *       choose not to fill in irrelevant bits; they will be zero.
391 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   /* Zero all of bits3 (the Function Control bits) before filling in the
    * descriptor fields — see the function comment above.
    */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Gen4 layout: no header_present bit; SFID lives in the descriptor. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
426
427static void brw_set_math_message( struct brw_compile *p,
428				  struct brw_instruction *insn,
429				  GLuint function,
430				  GLuint integer_type,
431				  bool low_precision,
432				  bool saturate,
433				  GLuint dataType )
434{
435   struct brw_context *brw = p->brw;
436   struct intel_context *intel = &brw->intel;
437   unsigned msg_length;
438   unsigned response_length;
439
440   /* Infer message length from the function */
441   switch (function) {
442   case BRW_MATH_FUNCTION_POW:
443   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
444   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
445   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
446      msg_length = 2;
447      break;
448   default:
449      msg_length = 1;
450      break;
451   }
452
453   /* Infer response length from the function */
454   switch (function) {
455   case BRW_MATH_FUNCTION_SINCOS:
456   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
457      response_length = 2;
458      break;
459   default:
460      response_length = 1;
461      break;
462   }
463
464   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
465			      msg_length, response_length, false, false);
466   if (intel->gen == 5) {
467      insn->bits3.math_gen5.function = function;
468      insn->bits3.math_gen5.int_type = integer_type;
469      insn->bits3.math_gen5.precision = low_precision;
470      insn->bits3.math_gen5.saturate = saturate;
471      insn->bits3.math_gen5.data_type = dataType;
472      insn->bits3.math_gen5.snapshot = 0;
473   } else {
474      insn->bits3.math.function = function;
475      insn->bits3.math.int_type = integer_type;
476      insn->bits3.math.precision = low_precision;
477      insn->bits3.math.saturate = saturate;
478      insn->bits3.math.data_type = dataType;
479   }
480}
481
482
/* Fill in the URB message descriptor for an FF_SYNC message (Gen5+
 * urb_gen5 layout; message length is always 1, header always present).
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    GLuint response_length,
				    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
498
/* Fill in the message descriptor for a URB write, selecting the
 * per-generation descriptor layout.
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 bool allocate,
				 bool used,
				 GLuint msg_length,
				 GLuint response_length,
				 bool end_of_thread,
				 bool complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes have no transpose mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
   }
}
539
/**
 * Fill in the message descriptor for a data-port write, choosing the
 * shared function (SFID) and descriptor layout for the target
 * generation.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* Descriptor bit layouts differ per generation. */
   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
598
/**
 * Fill in the message descriptor for a data-port read, choosing the
 * shared function (SFID) and descriptor layout for the target
 * generation.  Reads always carry a header and no EOT.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Gen6 routes reads through the render or sampler cache based on
       * the requested target cache.
       */
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      true, false);

   /* Descriptor bit layouts differ per generation. */
   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
655
/**
 * Fill in the message descriptor for a sampler message, choosing the
 * descriptor layout for the target generation.  \p return_format is
 * only encoded on original Gen4 hardware; \p simd_mode only on Gen5+.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
695
696
697#define next_insn brw_next_insn
698struct brw_instruction *
699brw_next_insn(struct brw_compile *p, GLuint opcode)
700{
701   struct brw_instruction *insn;
702
703   if (p->nr_insn + 1 > p->store_size) {
704      if (0)
705         printf("incresing the store size to %d\n", p->store_size << 1);
706      p->store_size <<= 1;
707      p->store = reralloc(p->mem_ctx, p->store,
708                          struct brw_instruction, p->store_size);
709      if (!p->store)
710         assert(!"realloc eu store memeory failed");
711   }
712
713   insn = &p->store[p->nr_insn++];
714   memcpy(insn, p->current, sizeof(*insn));
715
716   /* Reset this one-shot flag:
717    */
718
719   if (p->current->header.destreg__conditionalmod) {
720      p->current->header.destreg__conditionalmod = 0;
721      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
722   }
723
724   insn->header.opcode = opcode;
725   return insn;
726}
727
728static struct brw_instruction *brw_alu1( struct brw_compile *p,
729					 GLuint opcode,
730					 struct brw_reg dest,
731					 struct brw_reg src )
732{
733   struct brw_instruction *insn = next_insn(p, opcode);
734   brw_set_dest(p, insn, dest);
735   brw_set_src0(p, insn, src);
736   return insn;
737}
738
739static struct brw_instruction *brw_alu2(struct brw_compile *p,
740					GLuint opcode,
741					struct brw_reg dest,
742					struct brw_reg src0,
743					struct brw_reg src1 )
744{
745   struct brw_instruction *insn = next_insn(p, opcode);
746   brw_set_dest(p, insn, dest);
747   brw_set_src0(p, insn, src0);
748   brw_set_src1(p, insn, src1);
749   return insn;
750}
751
752static int
753get_3src_subreg_nr(struct brw_reg reg)
754{
755   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
756      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
757      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
758   } else {
759      return reg.subnr / 4;
760   }
761}
762
763static struct brw_instruction *brw_alu3(struct brw_compile *p,
764					GLuint opcode,
765					struct brw_reg dest,
766					struct brw_reg src0,
767					struct brw_reg src1,
768					struct brw_reg src2)
769{
770   struct brw_instruction *insn = next_insn(p, opcode);
771
772   gen7_convert_mrf_to_grf(p, &dest);
773
774   assert(insn->header.access_mode == BRW_ALIGN_16);
775
776   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
777	  dest.file == BRW_MESSAGE_REGISTER_FILE);
778   assert(dest.nr < 128);
779   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
780   assert(dest.type = BRW_REGISTER_TYPE_F);
781   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
782   insn->bits1.da3src.dest_reg_nr = dest.nr;
783   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
784   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
785   guess_execution_size(p, insn, dest);
786
787   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
788   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
789   assert(src0.nr < 128);
790   assert(src0.type == BRW_REGISTER_TYPE_F);
791   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
792   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
793   insn->bits2.da3src.src0_reg_nr = src0.nr;
794   insn->bits1.da3src.src0_abs = src0.abs;
795   insn->bits1.da3src.src0_negate = src0.negate;
796   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
797
798   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
799   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
800   assert(src1.nr < 128);
801   assert(src1.type == BRW_REGISTER_TYPE_F);
802   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
803   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
804   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
805   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
806   insn->bits3.da3src.src1_reg_nr = src1.nr;
807   insn->bits1.da3src.src1_abs = src1.abs;
808   insn->bits1.da3src.src1_negate = src1.negate;
809
810   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
811   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
812   assert(src2.nr < 128);
813   assert(src2.type == BRW_REGISTER_TYPE_F);
814   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
815   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
816   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
817   insn->bits3.da3src.src2_reg_nr = src2.nr;
818   insn->bits1.da3src.src2_abs = src2.abs;
819   insn->bits1.da3src.src2_negate = src2.negate;
820
821   return insn;
822}
823
824
825/***********************************************************************
826 * Convenience routines.
827 */
/* Define brw_<OP>() emitting a one-source ALU instruction. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Define brw_<OP>() emitting a two-source ALU instruction. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Define brw_<OP>() emitting a three-source instruction (align16 only). */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
854
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
/* Define brw_<OP>() emitting the round instruction plus, pre-Gen6, the
 * predicated corrective ADD described above.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->intel.gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
879
880
/* Instantiate the brw_<OP>() emitters from the ALU1/ALU2/ALU3 and ROUND
 * templates above.  brw_ADD and brw_MUL are written out by hand below
 * because they carry extra operand-type assertions.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)

ROUND(RNDZ)
ROUND(RNDE)
907
908
909struct brw_instruction *brw_ADD(struct brw_compile *p,
910				struct brw_reg dest,
911				struct brw_reg src0,
912				struct brw_reg src1)
913{
914   /* 6.2.2: add */
915   if (src0.type == BRW_REGISTER_TYPE_F ||
916       (src0.file == BRW_IMMEDIATE_VALUE &&
917	src0.type == BRW_REGISTER_TYPE_VF)) {
918      assert(src1.type != BRW_REGISTER_TYPE_UD);
919      assert(src1.type != BRW_REGISTER_TYPE_D);
920   }
921
922   if (src1.type == BRW_REGISTER_TYPE_F ||
923       (src1.file == BRW_IMMEDIATE_VALUE &&
924	src1.type == BRW_REGISTER_TYPE_VF)) {
925      assert(src0.type != BRW_REGISTER_TYPE_UD);
926      assert(src0.type != BRW_REGISTER_TYPE_D);
927   }
928
929   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
930}
931
932struct brw_instruction *brw_MUL(struct brw_compile *p,
933				struct brw_reg dest,
934				struct brw_reg src0,
935				struct brw_reg src1)
936{
937   /* 6.32.38: mul */
938   if (src0.type == BRW_REGISTER_TYPE_D ||
939       src0.type == BRW_REGISTER_TYPE_UD ||
940       src1.type == BRW_REGISTER_TYPE_D ||
941       src1.type == BRW_REGISTER_TYPE_UD) {
942      assert(dest.type != BRW_REGISTER_TYPE_F);
943   }
944
945   if (src0.type == BRW_REGISTER_TYPE_F ||
946       (src0.file == BRW_IMMEDIATE_VALUE &&
947	src0.type == BRW_REGISTER_TYPE_VF)) {
948      assert(src1.type != BRW_REGISTER_TYPE_UD);
949      assert(src1.type != BRW_REGISTER_TYPE_D);
950   }
951
952   if (src1.type == BRW_REGISTER_TYPE_F ||
953       (src1.file == BRW_IMMEDIATE_VALUE &&
954	src1.type == BRW_REGISTER_TYPE_VF)) {
955      assert(src0.type != BRW_REGISTER_TYPE_UD);
956      assert(src0.type != BRW_REGISTER_TYPE_D);
957   }
958
959   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
960	  src0.nr != BRW_ARF_ACCUMULATOR);
961   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
962	  src1.nr != BRW_ARF_ACCUMULATOR);
963
964   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
965}
966
967
968void brw_NOP(struct brw_compile *p)
969{
970   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
971   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
972   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
973   brw_set_src1(p, insn, brw_imm_ud(0x0));
974}
975
976
977
978
979
980/***********************************************************************
981 * Comparisons, if/else/endif
982 */
983
984struct brw_instruction *brw_JMPI(struct brw_compile *p,
985                                 struct brw_reg dest,
986                                 struct brw_reg src0,
987                                 struct brw_reg src1)
988{
989   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
990
991   insn->header.execution_size = 1;
992   insn->header.compression_control = BRW_COMPRESSION_NONE;
993   insn->header.mask_control = BRW_MASK_DISABLE;
994
995   p->current->header.predicate_control = BRW_PREDICATE_NONE;
996
997   return insn;
998}
999
1000static void
1001push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1002{
1003   p->if_stack[p->if_stack_depth] = inst - p->store;
1004
1005   p->if_stack_depth++;
1006   if (p->if_stack_array_size <= p->if_stack_depth) {
1007      p->if_stack_array_size *= 2;
1008      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1009			     p->if_stack_array_size);
1010   }
1011}
1012
1013static struct brw_instruction *
1014pop_if_stack(struct brw_compile *p)
1015{
1016   p->if_stack_depth--;
1017   return &p->store[p->if_stack[p->if_stack_depth]];
1018}
1019
1020static void
1021push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1022{
1023   if (p->loop_stack_array_size < p->loop_stack_depth) {
1024      p->loop_stack_array_size *= 2;
1025      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1026			       p->loop_stack_array_size);
1027      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1028				     p->loop_stack_array_size);
1029   }
1030
1031   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1032   p->loop_stack_depth++;
1033   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1034}
1035
1036static struct brw_instruction *
1037get_inner_do_insn(struct brw_compile *p)
1038{
1039   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1040}
1041
1042/* EU takes the value from the flag register and pushes it onto some
1043 * sort of a stack (presumably merging with any flag value already on
1044 * the stack).  Within an if block, the flags at the top of the stack
1045 * control execution on each channel of the unit, eg. on each of the
1046 * 16 pixel values in our wm programs.
1047 *
1048 * When the matching 'else' instruction is reached (presumably by
1049 * countdown of the instruction count patched in by our ELSE/ENDIF
1050 * functions), the relevent flags are inverted.
1051 *
1052 * When the matching 'endif' instruction is reached, the flags are
1053 * popped off.  If the stack is now empty, normal execution resumes.
1054 */
/* Emit an IF instruction with the given execution size and push it onto
 * the if-stack so brw_ELSE/brw_ENDIF can patch its jump targets later.
 * The jump fields are left zero here; they are filled in by
 * patch_IF_ELSE() (or the IF is converted to an ADD in SPF mode).
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      /* Pre-gen6 IF operates on the IP register. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      /* Gen6 carries the (not yet known) jump count in bits1. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7 uses separate JIP/UIP fields in bits3. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Code inside the IF must not inherit the IF's own predication. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1095
1096/* This function is only used for gen6-style IF instructions with an
1097 * embedded comparison (conditional modifier).  It is not used on gen7.
1098 */
1099struct brw_instruction *
1100gen6_IF(struct brw_compile *p, uint32_t conditional,
1101	struct brw_reg src0, struct brw_reg src1)
1102{
1103   struct brw_instruction *insn;
1104
1105   insn = next_insn(p, BRW_OPCODE_IF);
1106
1107   brw_set_dest(p, insn, brw_imm_w(0));
1108   if (p->compressed) {
1109      insn->header.execution_size = BRW_EXECUTE_16;
1110   } else {
1111      insn->header.execution_size = BRW_EXECUTE_8;
1112   }
1113   insn->bits1.branch_gen6.jump_count = 0;
1114   brw_set_src0(p, insn, src0);
1115   brw_set_src1(p, insn, src1);
1116
1117   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1118   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1119   insn->header.destreg__conditionalmod = conditional;
1120
1121   if (!p->single_program_flow)
1122      insn->header.thread_control = BRW_THREAD_SWITCH;
1123
1124   push_if_stack(p, insn);
1125   return insn;
1126}
1127
1128/**
1129 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1130 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* The "* 16" scales an instruction-count delta into a byte offset
       * for the IP add (one instruction is 128 bits = 16 bytes).
       */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1167
1168/**
1169 * Patch IF and ELSE instructions with appropriate jump targets.
1170 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 /* Gen7: both the "jump" (JIP) and "all-false" (UIP) targets are
	  * the ENDIF when there is no ELSE.
	  */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1257
/* Emit an ELSE instruction and push it onto the if-stack on top of the
 * matching IF.  Its jump targets are left zero and patched later by
 * patch_IF_ELSE() from brw_ENDIF().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      /* Pre-gen6 ELSE operates on the IP register. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      /* Gen6 carries the (not yet known) jump count in bits1. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7 uses separate JIP/UIP fields in bits3. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1290
/* Close the innermost IF/ELSE block: pop the corresponding instructions
 * off the if-stack, emit the ENDIF (or convert the whole construct to
 * predicated ADDs in pre-gen6 SPF mode), and patch the jump targets.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base adress of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1369
1370struct brw_instruction *brw_BREAK(struct brw_compile *p)
1371{
1372   struct intel_context *intel = &p->brw->intel;
1373   struct brw_instruction *insn;
1374
1375   insn = next_insn(p, BRW_OPCODE_BREAK);
1376   if (intel->gen >= 6) {
1377      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1378      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1379      brw_set_src1(p, insn, brw_imm_d(0x0));
1380   } else {
1381      brw_set_dest(p, insn, brw_ip_reg());
1382      brw_set_src0(p, insn, brw_ip_reg());
1383      brw_set_src1(p, insn, brw_imm_d(0x0));
1384      insn->bits3.if_else.pad0 = 0;
1385      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1386   }
1387   insn->header.compression_control = BRW_COMPRESSION_NONE;
1388   insn->header.execution_size = BRW_EXECUTE_8;
1389
1390   return insn;
1391}
1392
1393struct brw_instruction *gen6_CONT(struct brw_compile *p)
1394{
1395   struct brw_instruction *insn;
1396
1397   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1398   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1399   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1400   brw_set_dest(p, insn, brw_ip_reg());
1401   brw_set_src0(p, insn, brw_ip_reg());
1402   brw_set_src1(p, insn, brw_imm_d(0x0));
1403
1404   insn->header.compression_control = BRW_COMPRESSION_NONE;
1405   insn->header.execution_size = BRW_EXECUTE_8;
1406   return insn;
1407}
1408
1409struct brw_instruction *brw_CONT(struct brw_compile *p)
1410{
1411   struct brw_instruction *insn;
1412   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1413   brw_set_dest(p, insn, brw_ip_reg());
1414   brw_set_src0(p, insn, brw_ip_reg());
1415   brw_set_src1(p, insn, brw_imm_d(0x0));
1416   insn->header.compression_control = BRW_COMPRESSION_NONE;
1417   insn->header.execution_size = BRW_EXECUTE_8;
1418   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1419   insn->bits3.if_else.pad0 = 0;
1420   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1421   return insn;
1422}
1423
1424/* DO/WHILE loop:
1425 *
1426 * The DO/WHILE is just an unterminated loop -- break or continue are
1427 * used for control within the loop.  We have a few ways they can be
1428 * done.
1429 *
1430 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1431 * jip and no DO instruction.
1432 *
1433 * For non-uniform control flow pre-gen6, there's a DO instruction to
1434 * push the mask, and a WHILE to jump back, and BREAK to get out and
1435 * pop the mask.
1436 *
1437 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1438 * just points back to the first instruction of the loop.
1439 */
1440struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1441{
1442   struct intel_context *intel = &p->brw->intel;
1443
1444   if (intel->gen >= 6 || p->single_program_flow) {
1445      push_loop_stack(p, &p->store[p->nr_insn]);
1446      return &p->store[p->nr_insn];
1447   } else {
1448      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1449
1450      push_loop_stack(p, insn);
1451
1452      /* Override the defaults for this instruction:
1453       */
1454      brw_set_dest(p, insn, brw_null_reg());
1455      brw_set_src0(p, insn, brw_null_reg());
1456      brw_set_src1(p, insn, brw_null_reg());
1457
1458      insn->header.compression_control = BRW_COMPRESSION_NONE;
1459      insn->header.execution_size = execute_size;
1460      insn->header.predicate_control = BRW_PREDICATE_NONE;
1461      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1462      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1463
1464      return insn;
1465   }
1466}
1467
1468/**
1469 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1470 * instruction here.
1471 *
1472 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1473 * nesting, since it can always just point to the end of the block/current loop.
1474 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Jump counts are in 64-bit chunks on gen5 (2 per instruction), in
    * whole instructions before that.
    */
   int br = (intel->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to the matching DO, fixing up any
    * BREAK/CONT that still has a zero (unpatched) jump count.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 /* BREAK jumps just past the WHILE (out of the loop). */
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 /* CONTINUE jumps to the WHILE itself (next iteration test). */
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1497
/* Emit the WHILE that closes the innermost DO block (see the DO/WHILE
 * comment above brw_DO) and pop the loop stack.  In each path below,
 * next_insn() is called before get_inner_do_insn() because emitting an
 * instruction may reallocate p->store, invalidating the DO pointer.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   /* Jump offsets are counted in 64-bit chunks on gen5+ (2 per
    * 128-bit instruction), in whole instructions before that.
    */
   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Gen7: backwards jump is encoded in the JIP field. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Gen6: backwards jump is encoded in the bits1 jump count. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: the WHILE is just an unconditional backwards ADD on
	  * IP (16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Now that the loop body is complete, fix up any BREAK/CONT
	  * inside it to point at this WHILE.
	  */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1561
1562
1563/* FORWARD JUMPS:
1564 */
1565void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1566{
1567   struct intel_context *intel = &p->brw->intel;
1568   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1569   GLuint jmpi = 1;
1570
1571   if (intel->gen >= 5)
1572      jmpi = 2;
1573
1574   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1575   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1576
1577   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1578}
1579
1580
1581
1582/* To integrate with the above, it makes sense that the comparison
1583 * instruction should populate the flag register.  It might be simpler
1584 * just to use the flag reg for most WM tasks?
1585 */
1586void brw_CMP(struct brw_compile *p,
1587	     struct brw_reg dest,
1588	     GLuint conditional,
1589	     struct brw_reg src0,
1590	     struct brw_reg src1)
1591{
1592   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1593
1594   insn->header.destreg__conditionalmod = conditional;
1595   brw_set_dest(p, insn, dest);
1596   brw_set_src0(p, insn, src0);
1597   brw_set_src1(p, insn, src1);
1598
1599/*    guess_execution_size(insn, src0); */
1600
1601
1602   /* Make it so that future instructions will use the computed flag
1603    * value until brw_set_predicate_control_flag_value() is called
1604    * again.
1605    */
1606   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1607       dest.nr == 0) {
1608      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1609      p->flag_value = 0xff;
1610   }
1611}
1612
1613/* Issue 'wait' instruction for n1, host could program MMIO
1614   to wake up thread. */
1615void brw_WAIT (struct brw_compile *p)
1616{
1617   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1618   struct brw_reg src = brw_notification_1_reg();
1619
1620   brw_set_dest(p, insn, src);
1621   brw_set_src0(p, insn, src);
1622   brw_set_src1(p, insn, brw_null_reg());
1623   insn->header.execution_size = 0; /* must */
1624   insn->header.predicate_control = 0;
1625   insn->header.compression_control = 0;
1626}
1627
1628
1629/***********************************************************************
1630 * Helpers for the various SEND message types:
1631 */
1632
1633/** Extended math function, float[8].
1634 */
/* Emit an extended-math operation computing `function` of `src` into
 * `dest`.  On gen6+ this is a native MATH instruction (the function code
 * lives in the CondModifier bits); on gen4/5 it is a SEND to the shared
 * math unit through message register `msg_reg_nr` (data_type and
 * precision only apply to this path).
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint saturate,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer-divide variants take integer sources; everything else
       * operates on floats.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   saturate,
			   data_type);
   }
}
1699
1700/** Extended math function, float[8].
1701 */
/* Emit a two-source extended-math operation (gen6+ only, where math is a
 * native instruction): computes `function` of src0 and src1 into dest.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);
   /* Silences an "unused variable" warning when asserts are compiled out;
    * intel is still read below in !NDEBUG builds.
    */
   (void) intel;


   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (intel->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   /* Integer-divide variants take integer sources; everything else
    * operates on floats.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (intel->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1752
1753/**
1754 * Extended math function, float[16].
1755 * Use 2 send instructions.
1756 */
/* Emit a SIMD16 extended-math operation.  gen6+ handles SIMD16 with a
 * single native MATH instruction; earlier parts split the work into two
 * SEND messages to the shared math unit, one per instruction half.
 */
void brw_math_16( struct brw_compile *p,
		  struct brw_reg dest,
		  GLuint function,
		  GLuint saturate,
		  GLuint msg_reg_nr,
		  struct brw_reg src,
		  GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_MATH);

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
      return;
   }

   /* First instruction:
    */
   brw_push_insn_state(p);
   brw_set_predicate_control_flag_value(p, 0xff);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   /* Second instruction: handles the second half of the SIMD16 operation,
    * using the next message register and writing one register further on.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
   insn->header.destreg__conditionalmod = msg_reg_nr+1;

   brw_set_dest(p, insn, offset(dest,1));
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   brw_pop_insn_state(p);
}
1824
1825
1826/**
1827 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1828 * using a constant offset per channel.
1829 *
1830 * The offset must be aligned to oword size (16 bytes).  Used for
1831 * register spilling.
1832 */
1833void brw_oword_block_write_scratch(struct brw_compile *p,
1834				   struct brw_reg mrf,
1835				   int num_regs,
1836				   GLuint offset)
1837{
1838   struct intel_context *intel = &p->brw->intel;
1839   uint32_t msg_control, msg_type;
1840   int mlen;
1841
1842   if (intel->gen >= 6)
1843      offset /= 16;
1844
1845   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1846
1847   if (num_regs == 1) {
1848      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1849      mlen = 2;
1850   } else {
1851      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1852      mlen = 3;
1853   }
1854
1855   /* Set up the message header.  This is g0, with g0.2 filled with
1856    * the offset.  We don't want to leave our offset around in g0 or
1857    * it'll screw up texture samples, so set it up inside the message
1858    * reg.
1859    */
1860   {
1861      brw_push_insn_state(p);
1862      brw_set_mask_control(p, BRW_MASK_DISABLE);
1863      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1864
1865      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1866
1867      /* set message header global offset field (reg 0, element 2) */
1868      brw_MOV(p,
1869	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1870				  mrf.nr,
1871				  2), BRW_REGISTER_TYPE_UD),
1872	      brw_imm_ud(offset));
1873
1874      brw_pop_insn_state(p);
1875   }
1876
1877   {
1878      struct brw_reg dest;
1879      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1880      int send_commit_msg;
1881      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1882					 BRW_REGISTER_TYPE_UW);
1883
1884      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1885	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1886	 src_header = vec16(src_header);
1887      }
1888      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1889      insn->header.destreg__conditionalmod = mrf.nr;
1890
1891      /* Until gen6, writes followed by reads from the same location
1892       * are not guaranteed to be ordered unless write_commit is set.
1893       * If set, then a no-op write is issued to the destination
1894       * register to set a dependency, and a read from the destination
1895       * can be used to ensure the ordering.
1896       *
1897       * For gen6, only writes between different threads need ordering
1898       * protection.  Our use of DP writes is all about register
1899       * spilling within a thread.
1900       */
1901      if (intel->gen >= 6) {
1902	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1903	 send_commit_msg = 0;
1904      } else {
1905	 dest = src_header;
1906	 send_commit_msg = 1;
1907      }
1908
1909      brw_set_dest(p, insn, dest);
1910      if (intel->gen >= 6) {
1911	 brw_set_src0(p, insn, mrf);
1912      } else {
1913	 brw_set_src0(p, insn, brw_null_reg());
1914      }
1915
1916      if (intel->gen >= 6)
1917	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1918      else
1919	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1920
1921      brw_set_dp_write_message(p,
1922			       insn,
1923			       255, /* binding table index (255=stateless) */
1924			       msg_control,
1925			       msg_type,
1926			       mlen,
1927			       true, /* header_present */
1928			       0, /* not a render target */
1929			       send_commit_msg, /* response_length */
1930			       0, /* eot */
1931			       send_commit_msg);
1932   }
1933}
1934
1935
1936/**
1937 * Read a block of owords (half a GRF each) from the scratch buffer
1938 * using a constant index per channel.
1939 *
1940 * Offset must be aligned to oword size (16 bytes).  Used for register
1941 * spilling.
1942 */
1943void
1944brw_oword_block_read_scratch(struct brw_compile *p,
1945			     struct brw_reg dest,
1946			     struct brw_reg mrf,
1947			     int num_regs,
1948			     GLuint offset)
1949{
1950   struct intel_context *intel = &p->brw->intel;
1951   uint32_t msg_control;
1952   int rlen;
1953
1954   if (intel->gen >= 6)
1955      offset /= 16;
1956
1957   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1958   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1959
1960   if (num_regs == 1) {
1961      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1962      rlen = 1;
1963   } else {
1964      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1965      rlen = 2;
1966   }
1967
1968   {
1969      brw_push_insn_state(p);
1970      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1971      brw_set_mask_control(p, BRW_MASK_DISABLE);
1972
1973      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1974
1975      /* set message header global offset field (reg 0, element 2) */
1976      brw_MOV(p,
1977	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1978				  mrf.nr,
1979				  2), BRW_REGISTER_TYPE_UD),
1980	      brw_imm_ud(offset));
1981
1982      brw_pop_insn_state(p);
1983   }
1984
1985   {
1986      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1987
1988      assert(insn->header.predicate_control == 0);
1989      insn->header.compression_control = BRW_COMPRESSION_NONE;
1990      insn->header.destreg__conditionalmod = mrf.nr;
1991
1992      brw_set_dest(p, insn, dest);	/* UW? */
1993      if (intel->gen >= 6) {
1994	 brw_set_src0(p, insn, mrf);
1995      } else {
1996	 brw_set_src0(p, insn, brw_null_reg());
1997      }
1998
1999      brw_set_dp_read_message(p,
2000			      insn,
2001			      255, /* binding table index (255=stateless) */
2002			      msg_control,
2003			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2004			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2005			      1, /* msg_length */
2006			      rlen);
2007   }
2008}
2009
2010/**
2011 * Read a float[4] vector from the data port Data Cache (const buffer).
2012 * Location (in buffer) should be a multiple of 16.
2013 * Used for fetching shader constants.
2014 */
2015void brw_oword_block_read(struct brw_compile *p,
2016			  struct brw_reg dest,
2017			  struct brw_reg mrf,
2018			  uint32_t offset,
2019			  uint32_t bind_table_index)
2020{
2021   struct intel_context *intel = &p->brw->intel;
2022
2023   /* On newer hardware, offset is in units of owords. */
2024   if (intel->gen >= 6)
2025      offset /= 16;
2026
2027   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2028
2029   brw_push_insn_state(p);
2030   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2031   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2032   brw_set_mask_control(p, BRW_MASK_DISABLE);
2033
2034   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2035
2036   /* set message header global offset field (reg 0, element 2) */
2037   brw_MOV(p,
2038	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2039			       mrf.nr,
2040			       2), BRW_REGISTER_TYPE_UD),
2041	   brw_imm_ud(offset));
2042
2043   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2044   insn->header.destreg__conditionalmod = mrf.nr;
2045
2046   /* cast dest to a uword[8] vector */
2047   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2048
2049   brw_set_dest(p, insn, dest);
2050   if (intel->gen >= 6) {
2051      brw_set_src0(p, insn, mrf);
2052   } else {
2053      brw_set_src0(p, insn, brw_null_reg());
2054   }
2055
2056   brw_set_dp_read_message(p,
2057			   insn,
2058			   bind_table_index,
2059			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2060			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2061			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2062			   1, /* msg_length */
2063			   1); /* response_length (1 reg, 2 owords!) */
2064
2065   brw_pop_insn_state(p);
2066}
2067
2068/**
2069 * Read a set of dwords from the data port Data Cache (const buffer).
2070 *
2071 * Location (in buffer) appears as UD offsets in the register after
2072 * the provided mrf header reg.
2073 */
2074void brw_dword_scattered_read(struct brw_compile *p,
2075			      struct brw_reg dest,
2076			      struct brw_reg mrf,
2077			      uint32_t bind_table_index)
2078{
2079   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2080
2081   brw_push_insn_state(p);
2082   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2083   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2084   brw_set_mask_control(p, BRW_MASK_DISABLE);
2085   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2086   brw_pop_insn_state(p);
2087
2088   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2089   insn->header.destreg__conditionalmod = mrf.nr;
2090
2091   /* cast dest to a uword[8] vector */
2092   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2093
2094   brw_set_dest(p, insn, dest);
2095   brw_set_src0(p, insn, brw_null_reg());
2096
2097   brw_set_dp_read_message(p,
2098			   insn,
2099			   bind_table_index,
2100			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
2101			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
2102			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2103			   2, /* msg_length */
2104			   1); /* response_length */
2105}
2106
2107
2108
2109/**
2110 * Read float[4] constant(s) from VS constant buffer.
2111 * For relative addressing, two float[4] constants will be read into 'dest'.
2112 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
2113 */
2114void brw_dp_READ_4_vs(struct brw_compile *p,
2115                      struct brw_reg dest,
2116                      GLuint location,
2117                      GLuint bind_table_index)
2118{
2119   struct intel_context *intel = &p->brw->intel;
2120   struct brw_instruction *insn;
2121   GLuint msg_reg_nr = 1;
2122
2123   if (intel->gen >= 6)
2124      location /= 16;
2125
2126   /* Setup MRF[1] with location/offset into const buffer */
2127   brw_push_insn_state(p);
2128   brw_set_access_mode(p, BRW_ALIGN_1);
2129   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2130   brw_set_mask_control(p, BRW_MASK_DISABLE);
2131   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2132   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
2133		     BRW_REGISTER_TYPE_UD),
2134	   brw_imm_ud(location));
2135   brw_pop_insn_state(p);
2136
2137   insn = next_insn(p, BRW_OPCODE_SEND);
2138
2139   insn->header.predicate_control = BRW_PREDICATE_NONE;
2140   insn->header.compression_control = BRW_COMPRESSION_NONE;
2141   insn->header.destreg__conditionalmod = msg_reg_nr;
2142   insn->header.mask_control = BRW_MASK_DISABLE;
2143
2144   brw_set_dest(p, insn, dest);
2145   if (intel->gen >= 6) {
2146      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
2147   } else {
2148      brw_set_src0(p, insn, brw_null_reg());
2149   }
2150
2151   brw_set_dp_read_message(p,
2152			   insn,
2153			   bind_table_index,
2154			   0,
2155			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2156			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2157			   1, /* msg_length */
2158			   1); /* response_length (1 Oword) */
2159}
2160
2161/**
2162 * Read a float[4] constant per vertex from VS constant buffer, with
2163 * relative addressing.
2164 */
2165void brw_dp_READ_4_vs_relative(struct brw_compile *p,
2166			       struct brw_reg dest,
2167			       struct brw_reg addr_reg,
2168			       GLuint offset,
2169			       GLuint bind_table_index)
2170{
2171   struct intel_context *intel = &p->brw->intel;
2172   struct brw_reg src = brw_vec8_grf(0, 0);
2173   int msg_type;
2174
2175   /* Setup MRF[1] with offset into const buffer */
2176   brw_push_insn_state(p);
2177   brw_set_access_mode(p, BRW_ALIGN_1);
2178   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2179   brw_set_mask_control(p, BRW_MASK_DISABLE);
2180   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2181
2182   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2183    * fields ignored.
2184    */
2185   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2186	   addr_reg, brw_imm_d(offset));
2187   brw_pop_insn_state(p);
2188
2189   gen6_resolve_implied_move(p, &src, 0);
2190   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2191
2192   insn->header.predicate_control = BRW_PREDICATE_NONE;
2193   insn->header.compression_control = BRW_COMPRESSION_NONE;
2194   insn->header.destreg__conditionalmod = 0;
2195   insn->header.mask_control = BRW_MASK_DISABLE;
2196
2197   brw_set_dest(p, insn, dest);
2198   brw_set_src0(p, insn, src);
2199
2200   if (intel->gen >= 6)
2201      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2202   else if (intel->gen == 5 || intel->is_g4x)
2203      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2204   else
2205      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2206
2207   brw_set_dp_read_message(p,
2208			   insn,
2209			   bind_table_index,
2210			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2211			   msg_type,
2212			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2213			   2, /* msg_length */
2214			   1); /* response_length */
2215}
2216
2217
2218
2219void brw_fb_WRITE(struct brw_compile *p,
2220		  int dispatch_width,
2221                  GLuint msg_reg_nr,
2222                  struct brw_reg src0,
2223                  GLuint binding_table_index,
2224                  GLuint msg_length,
2225                  GLuint response_length,
2226                  bool eot,
2227                  bool header_present)
2228{
2229   struct intel_context *intel = &p->brw->intel;
2230   struct brw_instruction *insn;
2231   GLuint msg_control, msg_type;
2232   struct brw_reg dest;
2233
2234   if (dispatch_width == 16)
2235      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2236   else
2237      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2238
2239   if (intel->gen >= 6 && binding_table_index == 0) {
2240      insn = next_insn(p, BRW_OPCODE_SENDC);
2241   } else {
2242      insn = next_insn(p, BRW_OPCODE_SEND);
2243   }
2244   /* The execution mask is ignored for render target writes. */
2245   insn->header.predicate_control = 0;
2246   insn->header.compression_control = BRW_COMPRESSION_NONE;
2247
2248   if (intel->gen >= 6) {
2249      /* headerless version, just submit color payload */
2250      src0 = brw_message_reg(msg_reg_nr);
2251
2252      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2253   } else {
2254      insn->header.destreg__conditionalmod = msg_reg_nr;
2255
2256      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2257   }
2258
2259   if (dispatch_width == 16)
2260      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2261   else
2262      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2263
2264   brw_set_dest(p, insn, dest);
2265   brw_set_src0(p, insn, src0);
2266   brw_set_dp_write_message(p,
2267			    insn,
2268			    binding_table_index,
2269			    msg_control,
2270			    msg_type,
2271			    msg_length,
2272			    header_present,
2273			    eot, /* last render target write */
2274			    response_length,
2275			    eot,
2276			    0 /* send_commit_msg */);
2277}
2278
2279
2280/**
2281 * Texture sample instruction.
2282 * Note: the msg_type plus msg_length values determine exactly what kind
2283 * of sampling operation is performed.  See volume 4, page 161 of docs.
2284 */
2285void brw_SAMPLE(struct brw_compile *p,
2286		struct brw_reg dest,
2287		GLuint msg_reg_nr,
2288		struct brw_reg src0,
2289		GLuint binding_table_index,
2290		GLuint sampler,
2291		GLuint writemask,
2292		GLuint msg_type,
2293		GLuint response_length,
2294		GLuint msg_length,
2295		GLuint header_present,
2296		GLuint simd_mode,
2297		GLuint return_format)
2298{
2299   struct intel_context *intel = &p->brw->intel;
2300   bool need_stall = 0;
2301
2302   if (writemask == 0) {
2303      /*printf("%s: zero writemask??\n", __FUNCTION__); */
2304      return;
2305   }
2306
2307   /* Hardware doesn't do destination dependency checking on send
2308    * instructions properly.  Add a workaround which generates the
2309    * dependency by other means.  In practice it seems like this bug
2310    * only crops up for texture samples, and only where registers are
2311    * written by the send and then written again later without being
2312    * read in between.  Luckily for us, we already track that
2313    * information and use it to modify the writemask for the
2314    * instruction, so that is a guide for whether a workaround is
2315    * needed.
2316    */
2317   if (writemask != WRITEMASK_XYZW) {
2318      GLuint dst_offset = 0;
2319      GLuint i, newmask = 0, len = 0;
2320
2321      for (i = 0; i < 4; i++) {
2322	 if (writemask & (1<<i))
2323	    break;
2324	 dst_offset += 2;
2325      }
2326      for (; i < 4; i++) {
2327	 if (!(writemask & (1<<i)))
2328	    break;
2329	 newmask |= 1<<i;
2330	 len++;
2331      }
2332
2333      if (newmask != writemask) {
2334	 need_stall = 1;
2335         /* printf("need stall %x %x\n", newmask , writemask); */
2336      }
2337      else {
2338	 bool dispatch_16 = false;
2339
2340	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2341
2342	 guess_execution_size(p, p->current, dest);
2343	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2344	    dispatch_16 = true;
2345
2346	 newmask = ~newmask & WRITEMASK_XYZW;
2347
2348	 brw_push_insn_state(p);
2349
2350	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2351	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2352
2353	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2354		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2355  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2356
2357	 brw_pop_insn_state(p);
2358
2359  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2360	 dest = offset(dest, dst_offset);
2361
2362	 /* For 16-wide dispatch, masked channels are skipped in the
2363	  * response.  For 8-wide, masked channels still take up slots,
2364	  * and are just not written to.
2365	  */
2366	 if (dispatch_16)
2367	    response_length = len * 2;
2368      }
2369   }
2370
2371   {
2372      struct brw_instruction *insn;
2373
2374      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2375
2376      insn = next_insn(p, BRW_OPCODE_SEND);
2377      insn->header.predicate_control = 0; /* XXX */
2378      insn->header.compression_control = BRW_COMPRESSION_NONE;
2379      if (intel->gen < 6)
2380	  insn->header.destreg__conditionalmod = msg_reg_nr;
2381
2382      brw_set_dest(p, insn, dest);
2383      brw_set_src0(p, insn, src0);
2384      brw_set_sampler_message(p, insn,
2385			      binding_table_index,
2386			      sampler,
2387			      msg_type,
2388			      response_length,
2389			      msg_length,
2390			      header_present,
2391			      simd_mode,
2392			      return_format);
2393   }
2394
2395   if (need_stall) {
2396      struct brw_reg reg = vec8(offset(dest, response_length-1));
2397
2398      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2399       */
2400      brw_push_insn_state(p);
2401      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2402      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2403	      retype(reg, BRW_REGISTER_TYPE_UD));
2404      brw_pop_insn_state(p);
2405   }
2406
2407}
2408
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
/**
 * Emit a URB write SEND.  Message-descriptor details are handled by
 * brw_set_urb_message(); this wrapper handles the payload setup, the gen7
 * channel-mask header fixup, and the pre-gen6 message register field.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   bool used,
		   GLuint msg_length,
		   GLuint response_length,
		   bool eot,
		   bool writes_complete,
		   GLuint offset,
		   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   /* Gen6+ can't rely on the implied move of src0 into the MRF. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   /* The payload must fit in the message register file. */
   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register is encoded in the instruction. */
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       allocate,
		       used,
		       msg_length,
		       response_length,
		       eot,
		       writes_complete,
		       offset,
		       swizzle);
}
2464
2465static int
2466brw_find_next_block_end(struct brw_compile *p, int start)
2467{
2468   int ip;
2469
2470   for (ip = start + 1; ip < p->nr_insn; ip++) {
2471      struct brw_instruction *insn = &p->store[ip];
2472
2473      switch (insn->header.opcode) {
2474      case BRW_OPCODE_ENDIF:
2475      case BRW_OPCODE_ELSE:
2476      case BRW_OPCODE_WHILE:
2477	 return ip;
2478      }
2479   }
2480   assert(!"not reached");
2481   return start + 1;
2482}
2483
/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int br = 2;	/* branch offsets are encoded in units of br per instruction */

   for (ip = start + 1; ip < p->nr_insn; ip++) {
      struct brw_instruction *insn = &p->store[ip];

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
	 /* Gen6 stores the backward branch in jump_count; gen7 uses JIP. */
	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
				   : insn->bits3.break_cont.jip;
	 /* A WHILE whose target is at or before our start instruction
	  * closes the loop that contains it.
	  */
	 if (ip + jip / br <= start)
	    return ip;
      }
   }
   assert(!"not reached");
   return start + 1;
}
2508
2509/* After program generation, go back and update the UIP and JIP of
2510 * BREAK and CONT instructions to their correct locations.
2511 */
2512void
2513brw_set_uip_jip(struct brw_compile *p)
2514{
2515   struct intel_context *intel = &p->brw->intel;
2516   int ip;
2517   int br = 2;
2518
2519   if (intel->gen < 6)
2520      return;
2521
2522   for (ip = 0; ip < p->nr_insn; ip++) {
2523      struct brw_instruction *insn = &p->store[ip];
2524
2525      switch (insn->header.opcode) {
2526      case BRW_OPCODE_BREAK:
2527	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2528	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2529	 insn->bits3.break_cont.uip =
2530	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
2531	 break;
2532      case BRW_OPCODE_CONTINUE:
2533	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2534	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
2535
2536	 assert(insn->bits3.break_cont.uip != 0);
2537	 assert(insn->bits3.break_cont.jip != 0);
2538	 break;
2539      }
2540   }
2541}
2542
/**
 * Emit a SEND carrying an FF_SYNC message.
 *
 * Message-descriptor encoding lives in brw_set_ff_sync_message(); this
 * wrapper handles payload setup and the pre-gen6 message register field.
 */
void brw_ff_sync(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   GLuint response_length,
		   bool eot)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   /* Gen6+ can't rely on the implied move of src0 into the MRF. */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register is encoded in the instruction. */
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2570
2571/**
2572 * Emit the SEND instruction necessary to generate stream output data on Gen6
2573 * (for transform feedback).
2574 *
2575 * If send_commit_msg is true, this is the last piece of stream output data
2576 * from this thread, so send the data as a committed write.  According to the
2577 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2578 *
2579 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2580 *   writes are complete by sending the final write as a committed write."
2581 */
2582void
2583brw_svb_write(struct brw_compile *p,
2584              struct brw_reg dest,
2585              GLuint msg_reg_nr,
2586              struct brw_reg src0,
2587              GLuint binding_table_index,
2588              bool   send_commit_msg)
2589{
2590   struct brw_instruction *insn;
2591
2592   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2593
2594   insn = next_insn(p, BRW_OPCODE_SEND);
2595   brw_set_dest(p, insn, dest);
2596   brw_set_src0(p, insn, src0);
2597   brw_set_src1(p, insn, brw_imm_d(0));
2598   brw_set_dp_write_message(p, insn,
2599                            binding_table_index,
2600                            0, /* msg_control: ignored */
2601                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2602                            1, /* msg_length */
2603                            true, /* header_present */
2604                            0, /* last_render_target: ignored */
2605                            send_commit_msg, /* response_length */
2606                            0, /* end_of_thread */
2607                            send_commit_msg); /* send_commit_msg */
2608}
2609