brw_eu_emit.c revision 0a17093eaf84696b05d04a45d6d51281f7b2786b
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
void
gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   /* Pre-Gen6 hardware does the implied move itself; nothing to do. */
   if (intel->gen < 6)
      return;

   /* Already an MRF source: no move needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   /* Copy the operand into the message register, unless it is the null
    * register (reading null would be pointless; the MRF contents are then
    * don't-care).  The MOV is emitted unmasked and uncompressed so it
    * fills the payload regardless of channel-enable state.
    */
   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Rewrite the caller's source to reference the message register. */
   *src = brw_message_reg(msg_reg_nr);
}
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   struct intel_context *intel = &p->brw->intel;
88   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
89      reg->file = BRW_GENERAL_REGISTER_FILE;
90      reg->nr += 111;
91   }
92}
93
94
/* Encode @dest into the destination fields of @insn, and pick an execution
 * size from the destination width.  Must be called while insn->header
 * (access_mode in particular) is already set up.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   /* Only GRF/MRF register numbers are plain 0..127 indices; ARF numbers
    * encode the register class in their high bits, so skip the check.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         /* A zero horizontal stride is promoted to 1 for destinations. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         /* Align16 subregister numbers are in 16-byte units. */
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         /* even ignored in da16, still need to set as '01' */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* A zero horizontal stride is promoted to 1 for destinations. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* even ignored in da16, still need to set as '01' */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
148
149extern int reg_type_size[];
150
151static void
152validate_reg(struct brw_instruction *insn, struct brw_reg reg)
153{
154   int hstride_for_reg[] = {0, 1, 2, 4};
155   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
156   int width_for_reg[] = {1, 2, 4, 8, 16};
157   int execsize_for_reg[] = {1, 2, 4, 8, 16};
158   int width, hstride, vstride, execsize;
159
160   if (reg.file == BRW_IMMEDIATE_VALUE) {
161      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
162       * mean the destination has to be 128-bit aligned and the
163       * destination horiz stride has to be a word.
164       */
165      if (reg.type == BRW_REGISTER_TYPE_V) {
166	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
167		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
168      }
169
170      return;
171   }
172
173   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
174       reg.file == BRW_ARF_NULL)
175      return;
176
177   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
178   hstride = hstride_for_reg[reg.hstride];
179
180   if (reg.vstride == 0xf) {
181      vstride = -1;
182   } else {
183      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
184      vstride = vstride_for_reg[reg.vstride];
185   }
186
187   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
188   width = width_for_reg[reg.width];
189
190   assert(insn->header.execution_size >= 0 &&
191	  insn->header.execution_size < Elements(execsize_for_reg));
192   execsize = execsize_for_reg[insn->header.execution_size];
193
194   /* Restrictions from 3.3.10: Register Region Restrictions. */
195   /* 3. */
196   assert(execsize >= width);
197
198   /* 4. */
199   if (execsize == width && hstride != 0) {
200      assert(vstride == -1 || vstride == width * hstride);
201   }
202
203   /* 5. */
204   if (execsize == width && hstride == 0) {
205      /* no restriction on vstride. */
206   }
207
208   /* 6. */
209   if (width == 1) {
210      assert(hstride == 0);
211   }
212
213   /* 7. */
214   if (execsize == 1 && width == 1) {
215      assert(hstride == 0);
216      assert(vstride == 0);
217   }
218
219   /* 8. */
220   if (vstride == 0 && hstride == 0) {
221      assert(width == 1);
222   }
223
224   /* 10. Check destination issues. */
225}
226
227void
228brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
229	     struct brw_reg reg)
230{
231   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
232      assert(reg.nr < 128);
233
234   gen7_convert_mrf_to_grf(p, &reg);
235
236   validate_reg(insn, reg);
237
238   insn->bits1.da1.src0_reg_file = reg.file;
239   insn->bits1.da1.src0_reg_type = reg.type;
240   insn->bits2.da1.src0_abs = reg.abs;
241   insn->bits2.da1.src0_negate = reg.negate;
242   insn->bits2.da1.src0_address_mode = reg.address_mode;
243
244   if (reg.file == BRW_IMMEDIATE_VALUE) {
245      insn->bits3.ud = reg.dw1.ud;
246
247      /* Required to set some fields in src1 as well:
248       */
249      insn->bits1.da1.src1_reg_file = 0; /* arf */
250      insn->bits1.da1.src1_reg_type = reg.type;
251   }
252   else
253   {
254      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
255	 if (insn->header.access_mode == BRW_ALIGN_1) {
256	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
257	    insn->bits2.da1.src0_reg_nr = reg.nr;
258	 }
259	 else {
260	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
261	    insn->bits2.da16.src0_reg_nr = reg.nr;
262	 }
263      }
264      else {
265	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
266
267	 if (insn->header.access_mode == BRW_ALIGN_1) {
268	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
269	 }
270	 else {
271	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
272	 }
273      }
274
275      if (insn->header.access_mode == BRW_ALIGN_1) {
276	 if (reg.width == BRW_WIDTH_1 &&
277	     insn->header.execution_size == BRW_EXECUTE_1) {
278	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
279	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
280	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
281	 }
282	 else {
283	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
284	    insn->bits2.da1.src0_width = reg.width;
285	    insn->bits2.da1.src0_vert_stride = reg.vstride;
286	 }
287      }
288      else {
289	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
290	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
291	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
292	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
293
294	 /* This is an oddity of the fact we're using the same
295	  * descriptions for registers in align_16 as align_1:
296	  */
297	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
298	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
299	 else
300	    insn->bits2.da16.src0_vert_stride = reg.vstride;
301      }
302   }
303}
304
305
/* Encode @reg into the src1 fields of @insn.  src1 may be an immediate
 * (stored in bits3) but may not be an MRF or use indirect addressing.
 */
void brw_set_src1(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits3.da1.src1_subreg_nr = reg.subnr;
         insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
         /* Align16 subregister numbers are in 16-byte units. */
         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
         insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         /* A scalar source in a scalar instruction gets the canonical
          * <0;1,0> region regardless of the incoming region fields.
          */
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
            insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits3.da1.src1_horiz_stride = reg.hstride;
            insn->bits3.da1.src1_width = reg.width;
            insn->bits3.da1.src1_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}
375
376/**
377 * Set the Message Descriptor and Extended Message Descriptor fields
378 * for SEND messages.
379 *
380 * \note This zeroes out the Function Control bits, so it must be called
381 *       \b before filling out any message-specific data.  Callers can
382 *       choose not to fill in irrelevant bits; they will be zero.
383 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   /* Setting src1 to an immediate zero clears all of bits3 (the message
    * descriptor), which is why callers must fill Function Control bits
    * only after calling this.
    */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Pre-Gen5: everything lives in the basic descriptor in bits3. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
418
419static void brw_set_math_message( struct brw_compile *p,
420				  struct brw_instruction *insn,
421				  GLuint function,
422				  GLuint integer_type,
423				  bool low_precision,
424				  bool saturate,
425				  GLuint dataType )
426{
427   struct brw_context *brw = p->brw;
428   struct intel_context *intel = &brw->intel;
429   unsigned msg_length;
430   unsigned response_length;
431
432   /* Infer message length from the function */
433   switch (function) {
434   case BRW_MATH_FUNCTION_POW:
435   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
436   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
437   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
438      msg_length = 2;
439      break;
440   default:
441      msg_length = 1;
442      break;
443   }
444
445   /* Infer response length from the function */
446   switch (function) {
447   case BRW_MATH_FUNCTION_SINCOS:
448   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
449      response_length = 2;
450      break;
451   default:
452      response_length = 1;
453      break;
454   }
455
456   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
457			      msg_length, response_length, false, false);
458   if (intel->gen == 5) {
459      insn->bits3.math_gen5.function = function;
460      insn->bits3.math_gen5.int_type = integer_type;
461      insn->bits3.math_gen5.precision = low_precision;
462      insn->bits3.math_gen5.saturate = saturate;
463      insn->bits3.math_gen5.data_type = dataType;
464      insn->bits3.math_gen5.snapshot = 0;
465   } else {
466      insn->bits3.math.function = function;
467      insn->bits3.math.int_type = integer_type;
468      insn->bits3.math.precision = low_precision;
469      insn->bits3.math.saturate = saturate;
470      insn->bits3.math.data_type = dataType;
471   }
472}
473
474
475static void brw_set_ff_sync_message(struct brw_compile *p,
476				    struct brw_instruction *insn,
477				    bool allocate,
478				    GLuint response_length,
479				    bool end_of_thread)
480{
481   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
482			      1, response_length, true, end_of_thread);
483   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
484   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
485   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
486   insn->bits3.urb_gen5.allocate = allocate;
487   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
488   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
489}
490
/* Fill in the SEND descriptor for a URB write message, using the
 * generation-appropriate descriptor layout.
 */
static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 bool allocate,
                                 bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
                                 bool end_of_thread,
                                 bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB writes have no transpose mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
   }
}
531
/* Fill in the SEND descriptor for a data-port write message, choosing the
 * shared function (SFID) and descriptor layout by hardware generation.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         GLuint binding_table_index,
                         GLuint msg_control,
                         GLuint msg_type,
                         GLuint msg_length,
                         bool header_present,
                         GLuint last_render_target,
                         GLuint response_length,
                         GLuint end_of_thread,
                         GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   /* Per-generation descriptor field layouts for the same information. */
   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
590
/* Fill in the SEND descriptor for a data-port read message, choosing the
 * shared function (SFID) and descriptor layout by hardware generation.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint msg_control,
                        GLuint msg_type,
                        GLuint target_cache,
                        GLuint msg_length,
                        GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* On Gen6 the target cache is selected through the SFID itself. */
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              true, false);

   /* Per-generation descriptor field layouts for the same information. */
   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
647
/* Fill in the SEND descriptor for a sampler message, using the
 * generation-appropriate descriptor layout.  return_format only exists
 * in the original Gen4 layout; simd_mode only from G4X/Gen5 onward.
 */
static void brw_set_sampler_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    GLuint binding_table_index,
                                    GLuint sampler,
                                    GLuint msg_type,
                                    GLuint response_length,
                                    GLuint msg_length,
                                    GLuint header_present,
                                    GLuint simd_mode,
                                    GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
686
687
#define next_insn brw_next_insn
/**
 * Allocate the next instruction slot in the program store, initialize it
 * from the current default instruction state, and set its opcode.
 */
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);

   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
710
711static struct brw_instruction *brw_alu1( struct brw_compile *p,
712					 GLuint opcode,
713					 struct brw_reg dest,
714					 struct brw_reg src )
715{
716   struct brw_instruction *insn = next_insn(p, opcode);
717   brw_set_dest(p, insn, dest);
718   brw_set_src0(p, insn, src);
719   return insn;
720}
721
722static struct brw_instruction *brw_alu2(struct brw_compile *p,
723					GLuint opcode,
724					struct brw_reg dest,
725					struct brw_reg src0,
726					struct brw_reg src1 )
727{
728   struct brw_instruction *insn = next_insn(p, opcode);
729   brw_set_dest(p, insn, dest);
730   brw_set_src0(p, insn, src0);
731   brw_set_src1(p, insn, src1);
732   return insn;
733}
734
735
736/***********************************************************************
737 * Convenience routines.
738 */
/* Generate the public wrapper for a one-source instruction, e.g. brw_MOV. */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Generate the public wrapper for a two-source instruction, e.g. brw_AND. */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
755
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->intel.gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
780
781
/* Instantiate the simple ALU wrappers.  ADD and MUL are defined by hand
 * below because they carry extra operand-type assertions.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)

/* Two-instruction rounding sequences (see ROUND above). */
ROUND(RNDZ)
ROUND(RNDE)
808
809
810struct brw_instruction *brw_ADD(struct brw_compile *p,
811				struct brw_reg dest,
812				struct brw_reg src0,
813				struct brw_reg src1)
814{
815   /* 6.2.2: add */
816   if (src0.type == BRW_REGISTER_TYPE_F ||
817       (src0.file == BRW_IMMEDIATE_VALUE &&
818	src0.type == BRW_REGISTER_TYPE_VF)) {
819      assert(src1.type != BRW_REGISTER_TYPE_UD);
820      assert(src1.type != BRW_REGISTER_TYPE_D);
821   }
822
823   if (src1.type == BRW_REGISTER_TYPE_F ||
824       (src1.file == BRW_IMMEDIATE_VALUE &&
825	src1.type == BRW_REGISTER_TYPE_VF)) {
826      assert(src0.type != BRW_REGISTER_TYPE_UD);
827      assert(src0.type != BRW_REGISTER_TYPE_D);
828   }
829
830   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
831}
832
833struct brw_instruction *brw_MUL(struct brw_compile *p,
834				struct brw_reg dest,
835				struct brw_reg src0,
836				struct brw_reg src1)
837{
838   /* 6.32.38: mul */
839   if (src0.type == BRW_REGISTER_TYPE_D ||
840       src0.type == BRW_REGISTER_TYPE_UD ||
841       src1.type == BRW_REGISTER_TYPE_D ||
842       src1.type == BRW_REGISTER_TYPE_UD) {
843      assert(dest.type != BRW_REGISTER_TYPE_F);
844   }
845
846   if (src0.type == BRW_REGISTER_TYPE_F ||
847       (src0.file == BRW_IMMEDIATE_VALUE &&
848	src0.type == BRW_REGISTER_TYPE_VF)) {
849      assert(src1.type != BRW_REGISTER_TYPE_UD);
850      assert(src1.type != BRW_REGISTER_TYPE_D);
851   }
852
853   if (src1.type == BRW_REGISTER_TYPE_F ||
854       (src1.file == BRW_IMMEDIATE_VALUE &&
855	src1.type == BRW_REGISTER_TYPE_VF)) {
856      assert(src0.type != BRW_REGISTER_TYPE_UD);
857      assert(src0.type != BRW_REGISTER_TYPE_D);
858   }
859
860   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
861	  src0.nr != BRW_ARF_ACCUMULATOR);
862   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
863	  src1.nr != BRW_ARF_ACCUMULATOR);
864
865   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
866}
867
868
/* Emit a NOP.  The operand fields are still filled with harmless values
 * (g0 and an immediate zero) so the instruction encodes cleanly.
 */
void brw_NOP(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}
876
877
878
879
880
881/***********************************************************************
882 * Comparisons, if/else/endif
883 */
884
/* Emit a JMPI (jump-immediate).  JMPI executes as a scalar, unmasked,
 * uncompressed instruction regardless of the current default state.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* Clear any one-shot predication so it doesn't leak onto the next
    * emitted instruction.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
900
/* Push @inst (stored as an index into p->store, since the store may be
 * reallocated) onto the IF stack, growing the stack when full.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow now so the next push always has a free slot. */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
913
914static struct brw_instruction *
915pop_if_stack(struct brw_compile *p)
916{
917   p->if_stack_depth--;
918   return &p->store[p->if_stack[p->if_stack_depth]];
919}
920
921static void
922push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
923{
924   if (p->loop_stack_array_size < p->loop_stack_depth) {
925      p->loop_stack_array_size *= 2;
926      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
927			       p->loop_stack_array_size);
928      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
929				     p->loop_stack_array_size);
930   }
931
932   p->loop_stack[p->loop_stack_depth] = inst - p->store;
933   p->loop_stack_depth++;
934   p->if_depth_in_loop[p->loop_stack_depth] = 0;
935}
936
937static struct brw_instruction *
938get_inner_do_insn(struct brw_compile *p)
939{
940   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
941}
942
943/* EU takes the value from the flag register and pushes it onto some
944 * sort of a stack (presumably merging with any flag value already on
945 * the stack).  Within an if block, the flags at the top of the stack
946 * control execution on each channel of the unit, eg. on each of the
947 * 16 pixel values in our wm programs.
948 *
949 * When the matching 'else' instruction is reached (presumably by
950 * countdown of the instruction count patched in by our ELSE/ENDIF
951 * functions), the relevent flags are inverted.
952 *
953 * When the matching 'endif' instruction is reached, the flags are
954 * popped off.  If the stack is now empty, normal execution resumes.
955 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      /* Pre-gen6: IF operates on IP; the jump is patched into src1's
       * jump_count later by patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      /* Gen6: the branch offset lives in the 16-bit jump_count field and
       * is filled in by patch_IF_ELSE() once ELSE/ENDIF are known.
       */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+: the JIP/UIP branch offsets are patched by patch_IF_ELSE(). */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Predication applies to the IF itself; clear it from default state. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
996
997/* This function is only used for gen6-style IF instructions with an
998 * embedded comparison (conditional modifier).  It is not used on gen7.
999 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The branch offset (jump_count) is filled in by patch_IF_ELSE(). */
   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded comparison reuses the conditional-modifier field, so the
    * IF itself must not be predicated, and compression must be off.
    */
   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1028
1029/**
1030 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1031 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* IP offsets are in bytes; each instruction is 16 bytes (128 bits). */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1068
1069/**
1070 * Patch IF and ELSE instructions with appropriate jump targets.
1071 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   /* Propagate the IF's execution size to the ENDIF (and the ELSE below). */
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 /* Gen7+: with no ELSE, both JIP and UIP point at the ENDIF. */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1158
/* Emit the ELSE for the innermost open IF and push it onto the if-stack
 * so that brw_ENDIF() can patch both jump targets later.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      /* Pre-gen6: ELSE operates on IP; jump_count is patched later by
       * patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      /* Gen6: the branch offset is the 16-bit jump_count, patched later. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      /* Gen7+: JIP/UIP are patched later by patch_IF_ELSE(). */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1191
/* Emit the ENDIF for the innermost open IF/ELSE and patch the pending
 * jump targets — or, on gen4/5 in SPF mode, rewrite the IF/ELSE as ADDs
 * on IP and emit nothing at all.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   insn = next_insn(p, BRW_OPCODE_ENDIF);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1260
/* Emit a BREAK out of the innermost loop.
 *
 * Pre-gen6, jump_count is left at zero here and is patched to point past
 * the WHILE by brw_patch_break_cont(); pop_count records how many IFs are
 * open inside the loop, so the corresponding mask stack entries can be
 * popped.  For gen6+ jump patching, see the comment above
 * brw_patch_break_cont().
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1283
1284struct brw_instruction *gen6_CONT(struct brw_compile *p)
1285{
1286   struct brw_instruction *insn;
1287
1288   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1289   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1290   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1291   brw_set_dest(p, insn, brw_ip_reg());
1292   brw_set_src0(p, insn, brw_ip_reg());
1293   brw_set_src1(p, insn, brw_imm_d(0x0));
1294
1295   insn->header.compression_control = BRW_COMPRESSION_NONE;
1296   insn->header.execution_size = BRW_EXECUTE_8;
1297   return insn;
1298}
1299
1300struct brw_instruction *brw_CONT(struct brw_compile *p)
1301{
1302   struct brw_instruction *insn;
1303   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1304   brw_set_dest(p, insn, brw_ip_reg());
1305   brw_set_src0(p, insn, brw_ip_reg());
1306   brw_set_src1(p, insn, brw_imm_d(0x0));
1307   insn->header.compression_control = BRW_COMPRESSION_NONE;
1308   insn->header.execution_size = BRW_EXECUTE_8;
1309   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1310   insn->bits3.if_else.pad0 = 0;
1311   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1312   return insn;
1313}
1314
1315/* DO/WHILE loop:
1316 *
1317 * The DO/WHILE is just an unterminated loop -- break or continue are
1318 * used for control within the loop.  We have a few ways they can be
1319 * done.
1320 *
1321 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1322 * jip and no DO instruction.
1323 *
1324 * For non-uniform control flow pre-gen6, there's a DO instruction to
1325 * push the mask, and a WHILE to jump back, and BREAK to get out and
1326 * pop the mask.
1327 *
1328 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1329 * just points back to the first instruction of the loop.
1330 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; just remember where the loop body
       * starts so brw_WHILE() can jump back to it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1358
1359/**
1360 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1361 * instruction here.
1362 *
1363 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1364 * nesting, since it can always just point to the end of the block/current loop.
1365 */
1366static void
1367brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1368{
1369   struct intel_context *intel = &p->brw->intel;
1370   struct brw_instruction *do_inst = get_inner_do_insn(p);
1371   struct brw_instruction *inst;
1372   int br = (intel->gen == 5) ? 2 : 1;
1373
1374   for (inst = while_inst - 1; inst != do_inst; inst--) {
1375      /* If the jump count is != 0, that means that this instruction has already
1376       * been patched because it's part of a loop inside of the one we're
1377       * patching.
1378       */
1379      if (inst->header.opcode == BRW_OPCODE_BREAK &&
1380	  inst->bits3.if_else.jump_count == 0) {
1381	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1382      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1383		 inst->bits3.if_else.jump_count == 0) {
1384	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1385      }
1386   }
1387}
1388
/* Emit the WHILE that closes the innermost brw_DO() loop, pointing back
 * at the loop start, then pop the loop-stack entry.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   do_insn = get_inner_do_insn(p);

   /* Jump offsets are in 64-bit chunks on gen5+ (two per instruction). */
   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);

      /* Gen7+: JIP holds the (negative) offset back to the loop start. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);

      /* Gen6: the backwards branch lives in the 16-bit jump_count. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: a simple scalar ADD on IP jumps back to the start
	  * ((do_insn - insn) * 16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Point this loop's BREAK/CONT instructions at the WHILE. */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1450
1451
1452/* FORWARD JUMPS:
1453 */
1454void brw_land_fwd_jump(struct brw_compile *p,
1455		       struct brw_instruction *jmp_insn)
1456{
1457   struct intel_context *intel = &p->brw->intel;
1458   struct brw_instruction *landing = &p->store[p->nr_insn];
1459   GLuint jmpi = 1;
1460
1461   if (intel->gen >= 5)
1462      jmpi = 2;
1463
1464   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1465   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1466
1467   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1468}
1469
1470
1471
1472/* To integrate with the above, it makes sense that the comparison
1473 * instruction should populate the flag register.  It might be simpler
1474 * just to use the flag reg for most WM tasks?
1475 */
/* Emit a CMP, writing both the destination register and (through the
 * conditional modifier) the flag register.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      /* Destination is ARF register 0 (the null register), so this CMP
       * exists only for its flag write; enable predication by default.
       */
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}
1502
1503/* Issue 'wait' instruction for n1, host could program MMIO
1504   to wake up thread. */
1505void brw_WAIT (struct brw_compile *p)
1506{
1507   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1508   struct brw_reg src = brw_notification_1_reg();
1509
1510   brw_set_dest(p, insn, src);
1511   brw_set_src0(p, insn, src);
1512   brw_set_src1(p, insn, brw_null_reg());
1513   insn->header.execution_size = 0; /* must */
1514   insn->header.predicate_control = 0;
1515   insn->header.compression_control = 0;
1516}
1517
1518
1519/***********************************************************************
1520 * Helpers for the various SEND message types:
1521 */
1522
1523/** Extended math function, float[8].
1524 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint saturate,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* Gen6+: math is an ordinary EU instruction. */
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      /* Operands must be in the GRF with a contiguous destination. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer-divide functions take integer sources; everything else
       * is float-only.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      /* Pre-gen6: math is a SEND carrying a math message; msg_reg_nr
       * selects the message register holding the payload.
       */
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   saturate,
			   data_type);
   }
}
1589
1590/** Extended math function, float[8].
1591 */
1592void brw_math2(struct brw_compile *p,
1593	       struct brw_reg dest,
1594	       GLuint function,
1595	       struct brw_reg src0,
1596	       struct brw_reg src1)
1597{
1598   struct intel_context *intel = &p->brw->intel;
1599   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1600
1601   assert(intel->gen >= 6);
1602   (void) intel;
1603
1604
1605   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1606   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1607   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1608
1609   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1610   if (intel->gen == 6) {
1611      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1612      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1613   }
1614
1615   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1616       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1617       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1618      assert(src0.type != BRW_REGISTER_TYPE_F);
1619      assert(src1.type != BRW_REGISTER_TYPE_F);
1620   } else {
1621      assert(src0.type == BRW_REGISTER_TYPE_F);
1622      assert(src1.type == BRW_REGISTER_TYPE_F);
1623   }
1624
1625   /* Source modifiers are ignored for extended math instructions on Gen6. */
1626   if (intel->gen == 6) {
1627      assert(!src0.negate);
1628      assert(!src0.abs);
1629      assert(!src1.negate);
1630      assert(!src1.abs);
1631   }
1632
1633   /* Math is the same ISA format as other opcodes, except that CondModifier
1634    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1635    */
1636   insn->header.destreg__conditionalmod = function;
1637
1638   brw_set_dest(p, insn, dest);
1639   brw_set_src0(p, insn, src0);
1640   brw_set_src1(p, insn, src1);
1641}
1642
1643/**
1644 * Extended math function, float[16].
1645 * Use 2 send instructions.
1646 */
void brw_math_16( struct brw_compile *p,
		  struct brw_reg dest,
		  GLuint function,
		  GLuint saturate,
		  GLuint msg_reg_nr,
		  struct brw_reg src,
		  GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   if (intel->gen >= 6) {
      /* Gen6+: a single MATH instruction covers the whole execution. */
      insn = next_insn(p, BRW_OPCODE_MATH);

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
      return;
   }

   /* Pre-gen6: two uncompressed SENDs, one per half of the float[16]. */

   /* First instruction:
    */
   brw_push_insn_state(p);
   brw_set_predicate_control_flag_value(p, 0xff);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   /* Second instruction:
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
   insn->header.destreg__conditionalmod = msg_reg_nr+1;

   brw_set_dest(p, insn, offset(dest,1));
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   brw_pop_insn_state(p);
}
1714
1715
1716/**
1717 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1718 * using a constant offset per channel.
1719 *
1720 * The offset must be aligned to oword size (16 bytes).  Used for
1721 * register spilling.
1722 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   /* NOTE(review): gen6+ divides the byte offset by the oword size (16),
    * presumably because the message takes the offset in oword units —
    * confirm against the data port message descriptor docs.
    */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Message length is one register of header plus the data payload. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* The SEND itself is never compressed; widen the commit-write
       * destination instead if the default state was compressed.
       */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (intel->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1824
1825
1826/**
1827 * Read a block of owords (half a GRF each) from the scratch buffer
1828 * using a constant index per channel.
1829 *
1830 * Offset must be aligned to oword size (16 bytes).  Used for register
1831 * spilling.
1832 */
1833void
1834brw_oword_block_read_scratch(struct brw_compile *p,
1835			     struct brw_reg dest,
1836			     struct brw_reg mrf,
1837			     int num_regs,
1838			     GLuint offset)
1839{
1840   struct intel_context *intel = &p->brw->intel;
1841   uint32_t msg_control;
1842   int rlen;
1843
1844   if (intel->gen >= 6)
1845      offset /= 16;
1846
1847   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1848   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1849
1850   if (num_regs == 1) {
1851      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1852      rlen = 1;
1853   } else {
1854      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1855      rlen = 2;
1856   }
1857
1858   {
1859      brw_push_insn_state(p);
1860      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1861      brw_set_mask_control(p, BRW_MASK_DISABLE);
1862
1863      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1864
1865      /* set message header global offset field (reg 0, element 2) */
1866      brw_MOV(p,
1867	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1868				  mrf.nr,
1869				  2), BRW_REGISTER_TYPE_UD),
1870	      brw_imm_ud(offset));
1871
1872      brw_pop_insn_state(p);
1873   }
1874
1875   {
1876      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1877
1878      assert(insn->header.predicate_control == 0);
1879      insn->header.compression_control = BRW_COMPRESSION_NONE;
1880      insn->header.destreg__conditionalmod = mrf.nr;
1881
1882      brw_set_dest(p, insn, dest);	/* UW? */
1883      if (intel->gen >= 6) {
1884	 brw_set_src0(p, insn, mrf);
1885      } else {
1886	 brw_set_src0(p, insn, brw_null_reg());
1887      }
1888
1889      brw_set_dp_read_message(p,
1890			      insn,
1891			      255, /* binding table index (255=stateless) */
1892			      msg_control,
1893			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1894			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1895			      1, /* msg_length */
1896			      rlen);
1897   }
1898}
1899
1900/**
1901 * Read a float[4] vector from the data port Data Cache (const buffer).
1902 * Location (in buffer) should be a multiple of 16.
1903 * Used for fetching shader constants.
1904 */
1905void brw_oword_block_read(struct brw_compile *p,
1906			  struct brw_reg dest,
1907			  struct brw_reg mrf,
1908			  uint32_t offset,
1909			  uint32_t bind_table_index)
1910{
1911   struct intel_context *intel = &p->brw->intel;
1912
1913   /* On newer hardware, offset is in units of owords. */
1914   if (intel->gen >= 6)
1915      offset /= 16;
1916
1917   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1918
1919   brw_push_insn_state(p);
1920   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1921   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1922   brw_set_mask_control(p, BRW_MASK_DISABLE);
1923
1924   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1925
1926   /* set message header global offset field (reg 0, element 2) */
1927   brw_MOV(p,
1928	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1929			       mrf.nr,
1930			       2), BRW_REGISTER_TYPE_UD),
1931	   brw_imm_ud(offset));
1932
1933   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1934   insn->header.destreg__conditionalmod = mrf.nr;
1935
1936   /* cast dest to a uword[8] vector */
1937   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1938
1939   brw_set_dest(p, insn, dest);
1940   if (intel->gen >= 6) {
1941      brw_set_src0(p, insn, mrf);
1942   } else {
1943      brw_set_src0(p, insn, brw_null_reg());
1944   }
1945
1946   brw_set_dp_read_message(p,
1947			   insn,
1948			   bind_table_index,
1949			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1950			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1951			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1952			   1, /* msg_length */
1953			   1); /* response_length (1 reg, 2 owords!) */
1954
1955   brw_pop_insn_state(p);
1956}
1957
1958/**
1959 * Read a set of dwords from the data port Data Cache (const buffer).
1960 *
1961 * Location (in buffer) appears as UD offsets in the register after
1962 * the provided mrf header reg.
1963 */
1964void brw_dword_scattered_read(struct brw_compile *p,
1965			      struct brw_reg dest,
1966			      struct brw_reg mrf,
1967			      uint32_t bind_table_index)
1968{
1969   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1970
1971   brw_push_insn_state(p);
1972   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1973   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1974   brw_set_mask_control(p, BRW_MASK_DISABLE);
1975   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1976   brw_pop_insn_state(p);
1977
1978   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1979   insn->header.destreg__conditionalmod = mrf.nr;
1980
1981   /* cast dest to a uword[8] vector */
1982   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1983
1984   brw_set_dest(p, insn, dest);
1985   brw_set_src0(p, insn, brw_null_reg());
1986
1987   brw_set_dp_read_message(p,
1988			   insn,
1989			   bind_table_index,
1990			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1991			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1992			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1993			   2, /* msg_length */
1994			   1); /* response_length */
1995}
1996
1997
1998
1999/**
2000 * Read float[4] constant(s) from VS constant buffer.
2001 * For relative addressing, two float[4] constants will be read into 'dest'.
2002 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
2003 */
2004void brw_dp_READ_4_vs(struct brw_compile *p,
2005                      struct brw_reg dest,
2006                      GLuint location,
2007                      GLuint bind_table_index)
2008{
2009   struct intel_context *intel = &p->brw->intel;
2010   struct brw_instruction *insn;
2011   GLuint msg_reg_nr = 1;
2012
2013   if (intel->gen >= 6)
2014      location /= 16;
2015
2016   /* Setup MRF[1] with location/offset into const buffer */
2017   brw_push_insn_state(p);
2018   brw_set_access_mode(p, BRW_ALIGN_1);
2019   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2020   brw_set_mask_control(p, BRW_MASK_DISABLE);
2021   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2022   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
2023		     BRW_REGISTER_TYPE_UD),
2024	   brw_imm_ud(location));
2025   brw_pop_insn_state(p);
2026
2027   insn = next_insn(p, BRW_OPCODE_SEND);
2028
2029   insn->header.predicate_control = BRW_PREDICATE_NONE;
2030   insn->header.compression_control = BRW_COMPRESSION_NONE;
2031   insn->header.destreg__conditionalmod = msg_reg_nr;
2032   insn->header.mask_control = BRW_MASK_DISABLE;
2033
2034   brw_set_dest(p, insn, dest);
2035   if (intel->gen >= 6) {
2036      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
2037   } else {
2038      brw_set_src0(p, insn, brw_null_reg());
2039   }
2040
2041   brw_set_dp_read_message(p,
2042			   insn,
2043			   bind_table_index,
2044			   0,
2045			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2046			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2047			   1, /* msg_length */
2048			   1); /* response_length (1 Oword) */
2049}
2050
2051/**
2052 * Read a float[4] constant per vertex from VS constant buffer, with
2053 * relative addressing.
2054 */
2055void brw_dp_READ_4_vs_relative(struct brw_compile *p,
2056			       struct brw_reg dest,
2057			       struct brw_reg addr_reg,
2058			       GLuint offset,
2059			       GLuint bind_table_index)
2060{
2061   struct intel_context *intel = &p->brw->intel;
2062   struct brw_reg src = brw_vec8_grf(0, 0);
2063   int msg_type;
2064
2065   /* Setup MRF[1] with offset into const buffer */
2066   brw_push_insn_state(p);
2067   brw_set_access_mode(p, BRW_ALIGN_1);
2068   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2069   brw_set_mask_control(p, BRW_MASK_DISABLE);
2070   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2071
2072   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2073    * fields ignored.
2074    */
2075   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2076	   addr_reg, brw_imm_d(offset));
2077   brw_pop_insn_state(p);
2078
2079   gen6_resolve_implied_move(p, &src, 0);
2080   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2081
2082   insn->header.predicate_control = BRW_PREDICATE_NONE;
2083   insn->header.compression_control = BRW_COMPRESSION_NONE;
2084   insn->header.destreg__conditionalmod = 0;
2085   insn->header.mask_control = BRW_MASK_DISABLE;
2086
2087   brw_set_dest(p, insn, dest);
2088   brw_set_src0(p, insn, src);
2089
2090   if (intel->gen >= 6)
2091      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2092   else if (intel->gen == 5 || intel->is_g4x)
2093      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2094   else
2095      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2096
2097   brw_set_dp_read_message(p,
2098			   insn,
2099			   bind_table_index,
2100			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2101			   msg_type,
2102			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2103			   2, /* msg_length */
2104			   1); /* response_length */
2105}
2106
2107
2108
2109void brw_fb_WRITE(struct brw_compile *p,
2110		  int dispatch_width,
2111                  GLuint msg_reg_nr,
2112                  struct brw_reg src0,
2113                  GLuint binding_table_index,
2114                  GLuint msg_length,
2115                  GLuint response_length,
2116                  bool eot,
2117                  bool header_present)
2118{
2119   struct intel_context *intel = &p->brw->intel;
2120   struct brw_instruction *insn;
2121   GLuint msg_control, msg_type;
2122   struct brw_reg dest;
2123
2124   if (dispatch_width == 16)
2125      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2126   else
2127      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2128
2129   if (intel->gen >= 6 && binding_table_index == 0) {
2130      insn = next_insn(p, BRW_OPCODE_SENDC);
2131   } else {
2132      insn = next_insn(p, BRW_OPCODE_SEND);
2133   }
2134   /* The execution mask is ignored for render target writes. */
2135   insn->header.predicate_control = 0;
2136   insn->header.compression_control = BRW_COMPRESSION_NONE;
2137
2138   if (intel->gen >= 6) {
2139      /* headerless version, just submit color payload */
2140      src0 = brw_message_reg(msg_reg_nr);
2141
2142      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2143   } else {
2144      insn->header.destreg__conditionalmod = msg_reg_nr;
2145
2146      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2147   }
2148
2149   if (dispatch_width == 16)
2150      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2151   else
2152      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2153
2154   brw_set_dest(p, insn, dest);
2155   brw_set_src0(p, insn, src0);
2156   brw_set_dp_write_message(p,
2157			    insn,
2158			    binding_table_index,
2159			    msg_control,
2160			    msg_type,
2161			    msg_length,
2162			    header_present,
2163			    1, /* last render target write */
2164			    response_length,
2165			    eot,
2166			    0 /* send_commit_msg */);
2167}
2168
2169
2170/**
2171 * Texture sample instruction.
2172 * Note: the msg_type plus msg_length values determine exactly what kind
2173 * of sampling operation is performed.  See volume 4, page 161 of docs.
2174 */
2175void brw_SAMPLE(struct brw_compile *p,
2176		struct brw_reg dest,
2177		GLuint msg_reg_nr,
2178		struct brw_reg src0,
2179		GLuint binding_table_index,
2180		GLuint sampler,
2181		GLuint writemask,
2182		GLuint msg_type,
2183		GLuint response_length,
2184		GLuint msg_length,
2185		GLuint header_present,
2186		GLuint simd_mode,
2187		GLuint return_format)
2188{
2189   struct intel_context *intel = &p->brw->intel;
2190   bool need_stall = 0;
2191
2192   if (writemask == 0) {
2193      /*printf("%s: zero writemask??\n", __FUNCTION__); */
2194      return;
2195   }
2196
2197   /* Hardware doesn't do destination dependency checking on send
2198    * instructions properly.  Add a workaround which generates the
2199    * dependency by other means.  In practice it seems like this bug
2200    * only crops up for texture samples, and only where registers are
2201    * written by the send and then written again later without being
2202    * read in between.  Luckily for us, we already track that
2203    * information and use it to modify the writemask for the
2204    * instruction, so that is a guide for whether a workaround is
2205    * needed.
2206    */
2207   if (writemask != WRITEMASK_XYZW) {
2208      GLuint dst_offset = 0;
2209      GLuint i, newmask = 0, len = 0;
2210
2211      for (i = 0; i < 4; i++) {
2212	 if (writemask & (1<<i))
2213	    break;
2214	 dst_offset += 2;
2215      }
2216      for (; i < 4; i++) {
2217	 if (!(writemask & (1<<i)))
2218	    break;
2219	 newmask |= 1<<i;
2220	 len++;
2221      }
2222
2223      if (newmask != writemask) {
2224	 need_stall = 1;
2225         /* printf("need stall %x %x\n", newmask , writemask); */
2226      }
2227      else {
2228	 bool dispatch_16 = false;
2229
2230	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2231
2232	 guess_execution_size(p, p->current, dest);
2233	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2234	    dispatch_16 = true;
2235
2236	 newmask = ~newmask & WRITEMASK_XYZW;
2237
2238	 brw_push_insn_state(p);
2239
2240	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2241	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2242
2243	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2244		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2245  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2246
2247	 brw_pop_insn_state(p);
2248
2249  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2250	 dest = offset(dest, dst_offset);
2251
2252	 /* For 16-wide dispatch, masked channels are skipped in the
2253	  * response.  For 8-wide, masked channels still take up slots,
2254	  * and are just not written to.
2255	  */
2256	 if (dispatch_16)
2257	    response_length = len * 2;
2258      }
2259   }
2260
2261   {
2262      struct brw_instruction *insn;
2263
2264      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2265
2266      insn = next_insn(p, BRW_OPCODE_SEND);
2267      insn->header.predicate_control = 0; /* XXX */
2268      insn->header.compression_control = BRW_COMPRESSION_NONE;
2269      if (intel->gen < 6)
2270	  insn->header.destreg__conditionalmod = msg_reg_nr;
2271
2272      brw_set_dest(p, insn, dest);
2273      brw_set_src0(p, insn, src0);
2274      brw_set_sampler_message(p, insn,
2275			      binding_table_index,
2276			      sampler,
2277			      msg_type,
2278			      response_length,
2279			      msg_length,
2280			      header_present,
2281			      simd_mode,
2282			      return_format);
2283   }
2284
2285   if (need_stall) {
2286      struct brw_reg reg = vec8(offset(dest, response_length-1));
2287
2288      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2289       */
2290      brw_push_insn_state(p);
2291      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2292      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2293	      retype(reg, BRW_REGISTER_TYPE_UD));
2294      brw_pop_insn_state(p);
2295   }
2296
2297}
2298
2299/* All these variables are pretty confusing - we might be better off
2300 * using bitmasks and macros for this, in the old style.  Or perhaps
2301 * just having the caller instantiate the fields in dword3 itself.
2302 */
2303void brw_urb_WRITE(struct brw_compile *p,
2304		   struct brw_reg dest,
2305		   GLuint msg_reg_nr,
2306		   struct brw_reg src0,
2307		   bool allocate,
2308		   bool used,
2309		   GLuint msg_length,
2310		   GLuint response_length,
2311		   bool eot,
2312		   bool writes_complete,
2313		   GLuint offset,
2314		   GLuint swizzle)
2315{
2316   struct intel_context *intel = &p->brw->intel;
2317   struct brw_instruction *insn;
2318
2319   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2320
2321   if (intel->gen == 7) {
2322      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2323      brw_push_insn_state(p);
2324      brw_set_access_mode(p, BRW_ALIGN_1);
2325      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2326		       BRW_REGISTER_TYPE_UD),
2327	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2328		brw_imm_ud(0xff00));
2329      brw_pop_insn_state(p);
2330   }
2331
2332   insn = next_insn(p, BRW_OPCODE_SEND);
2333
2334   assert(msg_length < BRW_MAX_MRF);
2335
2336   brw_set_dest(p, insn, dest);
2337   brw_set_src0(p, insn, src0);
2338   brw_set_src1(p, insn, brw_imm_d(0));
2339
2340   if (intel->gen < 6)
2341      insn->header.destreg__conditionalmod = msg_reg_nr;
2342
2343   brw_set_urb_message(p,
2344		       insn,
2345		       allocate,
2346		       used,
2347		       msg_length,
2348		       response_length,
2349		       eot,
2350		       writes_complete,
2351		       offset,
2352		       swizzle);
2353}
2354
2355static int
2356brw_find_next_block_end(struct brw_compile *p, int start)
2357{
2358   int ip;
2359
2360   for (ip = start + 1; ip < p->nr_insn; ip++) {
2361      struct brw_instruction *insn = &p->store[ip];
2362
2363      switch (insn->header.opcode) {
2364      case BRW_OPCODE_ENDIF:
2365      case BRW_OPCODE_ELSE:
2366      case BRW_OPCODE_WHILE:
2367	 return ip;
2368      }
2369   }
2370   assert(!"not reached");
2371   return start + 1;
2372}
2373
2374/* There is no DO instruction on gen6, so to find the end of the loop
2375 * we have to see if the loop is jumping back before our start
2376 * instruction.
2377 */
2378static int
2379brw_find_loop_end(struct brw_compile *p, int start)
2380{
2381   struct intel_context *intel = &p->brw->intel;
2382   int ip;
2383   int br = 2;
2384
2385   for (ip = start + 1; ip < p->nr_insn; ip++) {
2386      struct brw_instruction *insn = &p->store[ip];
2387
2388      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2389	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2390				   : insn->bits3.break_cont.jip;
2391	 if (ip + jip / br <= start)
2392	    return ip;
2393      }
2394   }
2395   assert(!"not reached");
2396   return start + 1;
2397}
2398
2399/* After program generation, go back and update the UIP and JIP of
2400 * BREAK and CONT instructions to their correct locations.
2401 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   /* Scale factor between instruction-index deltas and the encoded
    * branch offsets (offsets are counted in half-instruction units --
    * TODO confirm against the PRM's JIP/UIP encoding).
    */
   int br = 2;

   /* Only gen6+ encodes JIP/UIP in BREAK/CONTINUE; nothing to patch
    * on earlier generations.
    */
   if (intel->gen < 6)
      return;

   for (ip = 0; ip < p->nr_insn; ip++) {
      struct brw_instruction *insn = &p->store[ip];

      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
	 /* JIP: distance to the end of the enclosing block
	  * (next ENDIF/ELSE/WHILE).
	  */
	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
	 insn->bits3.break_cont.uip =
	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
	 break;
      case BRW_OPCODE_CONTINUE:
	 /* CONTINUE: both offsets resolve relative to the loop's WHILE. */
	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);

	 /* A CONTINUE always sits strictly before its WHILE, so neither
	  * offset can legitimately be zero.
	  */
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2432
2433void brw_ff_sync(struct brw_compile *p,
2434		   struct brw_reg dest,
2435		   GLuint msg_reg_nr,
2436		   struct brw_reg src0,
2437		   bool allocate,
2438		   GLuint response_length,
2439		   bool eot)
2440{
2441   struct intel_context *intel = &p->brw->intel;
2442   struct brw_instruction *insn;
2443
2444   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2445
2446   insn = next_insn(p, BRW_OPCODE_SEND);
2447   brw_set_dest(p, insn, dest);
2448   brw_set_src0(p, insn, src0);
2449   brw_set_src1(p, insn, brw_imm_d(0));
2450
2451   if (intel->gen < 6)
2452      insn->header.destreg__conditionalmod = msg_reg_nr;
2453
2454   brw_set_ff_sync_message(p,
2455			   insn,
2456			   allocate,
2457			   response_length,
2458			   eot);
2459}
2460
2461/**
2462 * Emit the SEND instruction necessary to generate stream output data on Gen6
2463 * (for transform feedback).
2464 *
2465 * If send_commit_msg is true, this is the last piece of stream output data
2466 * from this thread, so send the data as a committed write.  According to the
2467 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2468 *
2469 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2470 *   writes are complete by sending the final write as a committed write."
2471 */
2472void
2473brw_svb_write(struct brw_compile *p,
2474              struct brw_reg dest,
2475              GLuint msg_reg_nr,
2476              struct brw_reg src0,
2477              GLuint binding_table_index,
2478              bool   send_commit_msg)
2479{
2480   struct brw_instruction *insn;
2481
2482   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2483
2484   insn = next_insn(p, BRW_OPCODE_SEND);
2485   brw_set_dest(p, insn, dest);
2486   brw_set_src0(p, insn, src0);
2487   brw_set_src1(p, insn, brw_imm_d(0));
2488   brw_set_dp_write_message(p, insn,
2489                            binding_table_index,
2490                            0, /* msg_control: ignored */
2491                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2492                            1, /* msg_length */
2493                            true, /* header_present */
2494                            0, /* last_render_target: ignored */
2495                            send_commit_msg, /* response_length */
2496                            0, /* end_of_thread */
2497                            send_commit_msg); /* send_commit_msg */
2498}
2499