brw_eu_emit.c revision bbea5c5a5a7fb327d4ef03f80fe19cfa8d8edccd
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  GLuint msg_reg_nr)
65{
66   struct intel_context *intel = &p->brw->intel;
67   if (intel->gen < 6)
68      return;
69
70   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
71      brw_push_insn_state(p);
72      brw_set_mask_control(p, BRW_MASK_DISABLE);
73      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
74      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
75	      retype(*src, BRW_REGISTER_TYPE_UD));
76      brw_pop_insn_state(p);
77   }
78   *src = brw_message_reg(msg_reg_nr);
79}
80
81static void
82gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
83{
84   struct intel_context *intel = &p->brw->intel;
85   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
86      reg->file = BRW_GENERAL_REGISTER_FILE;
87      reg->nr += 111;
88   }
89}
90
91
/**
 * Encode the destination operand of an instruction into bits1.
 *
 * Handles both direct and register-indirect addressing, in both align1
 * and align16 access modes, and finally derives the instruction's
 * execution size from the destination width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   /* GRF/immediate destinations are limited to 128 registers; ARF and
    * MRF registers use their own numbering and are exempt.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* Gen7 has no MRFs; remap an MRF destination onto the high GRFs. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A horizontal stride of 0 is not valid for a destination;
	  * silently promote it to 1.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16 addresses destinations at 16-byte granularity and
	  * uses a writemask instead of a subregister offset.
	  */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same stride-0 promotion as the direct-addressing case. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
145
146extern int reg_type_size[];
147
148static void
149validate_reg(struct brw_instruction *insn, struct brw_reg reg)
150{
151   int hstride_for_reg[] = {0, 1, 2, 4};
152   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
153   int width_for_reg[] = {1, 2, 4, 8, 16};
154   int execsize_for_reg[] = {1, 2, 4, 8, 16};
155   int width, hstride, vstride, execsize;
156
157   if (reg.file == BRW_IMMEDIATE_VALUE) {
158      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
159       * mean the destination has to be 128-bit aligned and the
160       * destination horiz stride has to be a word.
161       */
162      if (reg.type == BRW_REGISTER_TYPE_V) {
163	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
164		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
165      }
166
167      return;
168   }
169
170   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
171       reg.file == BRW_ARF_NULL)
172      return;
173
174   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
175   hstride = hstride_for_reg[reg.hstride];
176
177   if (reg.vstride == 0xf) {
178      vstride = -1;
179   } else {
180      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
181      vstride = vstride_for_reg[reg.vstride];
182   }
183
184   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
185   width = width_for_reg[reg.width];
186
187   assert(insn->header.execution_size >= 0 &&
188	  insn->header.execution_size < Elements(execsize_for_reg));
189   execsize = execsize_for_reg[insn->header.execution_size];
190
191   /* Restrictions from 3.3.10: Register Region Restrictions. */
192   /* 3. */
193   assert(execsize >= width);
194
195   /* 4. */
196   if (execsize == width && hstride != 0) {
197      assert(vstride == -1 || vstride == width * hstride);
198   }
199
200   /* 5. */
201   if (execsize == width && hstride == 0) {
202      /* no restriction on vstride. */
203   }
204
205   /* 6. */
206   if (width == 1) {
207      assert(hstride == 0);
208   }
209
210   /* 7. */
211   if (execsize == 1 && width == 1) {
212      assert(hstride == 0);
213      assert(vstride == 0);
214   }
215
216   /* 8. */
217   if (vstride == 0 && hstride == 0) {
218      assert(width == 1);
219   }
220
221   /* 10. Check destination issues. */
222}
223
224void
225brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
226	     struct brw_reg reg)
227{
228   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
229      assert(reg.nr < 128);
230
231   gen7_convert_mrf_to_grf(p, &reg);
232
233   validate_reg(insn, reg);
234
235   insn->bits1.da1.src0_reg_file = reg.file;
236   insn->bits1.da1.src0_reg_type = reg.type;
237   insn->bits2.da1.src0_abs = reg.abs;
238   insn->bits2.da1.src0_negate = reg.negate;
239   insn->bits2.da1.src0_address_mode = reg.address_mode;
240
241   if (reg.file == BRW_IMMEDIATE_VALUE) {
242      insn->bits3.ud = reg.dw1.ud;
243
244      /* Required to set some fields in src1 as well:
245       */
246      insn->bits1.da1.src1_reg_file = 0; /* arf */
247      insn->bits1.da1.src1_reg_type = reg.type;
248   }
249   else
250   {
251      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
252	 if (insn->header.access_mode == BRW_ALIGN_1) {
253	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
254	    insn->bits2.da1.src0_reg_nr = reg.nr;
255	 }
256	 else {
257	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
258	    insn->bits2.da16.src0_reg_nr = reg.nr;
259	 }
260      }
261      else {
262	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
263
264	 if (insn->header.access_mode == BRW_ALIGN_1) {
265	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
266	 }
267	 else {
268	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
269	 }
270      }
271
272      if (insn->header.access_mode == BRW_ALIGN_1) {
273	 if (reg.width == BRW_WIDTH_1 &&
274	     insn->header.execution_size == BRW_EXECUTE_1) {
275	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
276	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
277	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
278	 }
279	 else {
280	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
281	    insn->bits2.da1.src0_width = reg.width;
282	    insn->bits2.da1.src0_vert_stride = reg.vstride;
283	 }
284      }
285      else {
286	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
287	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
288	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
289	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
290
291	 /* This is an oddity of the fact we're using the same
292	  * descriptions for registers in align_16 as align_1:
293	  */
294	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
295	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
296	 else
297	    insn->bits2.da16.src0_vert_stride = reg.vstride;
298      }
299   }
300}
301
302
/**
 * Encode source operand 1 of an instruction (bits1/bits3).
 *
 * src1 may be an immediate — which occupies all of bits3, so only one
 * immediate per instruction is possible — or a direct-addressed
 * register; indirect addressing is not supported for src1.
 */
void brw_set_src1(struct brw_compile *p,
		  struct brw_instruction *insn,
		  struct brw_reg reg)
{
   /* MRFs cannot be used as src1. */
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   /* Gen7 has no MRFs; remap an MRF source onto the high GRFs. */
   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
	 insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
	 /* Align16 addresses subregisters at 16-byte granularity. */
	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
	 insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 /* A scalar source (width 1 at SIMD1) gets the canonical
	  * <0;1,0> region; otherwise take the region from the brw_reg.
	  */
	 if (reg.width == BRW_WIDTH_1 &&
	     insn->header.execution_size == BRW_EXECUTE_1) {
	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
	 }
	 else {
	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
	    insn->bits3.da1.src1_width = reg.width;
	    insn->bits3.da1.src1_vert_stride = reg.vstride;
	 }
      }
      else {
	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

	 /* This is an oddity of the fact we're using the same
	  * descriptions for registers in align_16 as align_1:
	  */
	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
	 else
	    insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}
372
373
374
/**
 * Fill out the message descriptor for a SEND to the extended math unit.
 *
 * The message length (number of payload registers) and response length
 * are inferred from the math function: the two-operand functions (POW
 * and the INT DIV variants) take two payload registers, and the
 * two-result functions (SINCOS, INT DIV quotient-and-remainder) return
 * two registers.
 */
static void brw_set_math_message( struct brw_compile *p,
				  struct brw_instruction *insn,
				  GLuint function,
				  GLuint integer_type,
				  bool low_precision,
				  bool saturate,
				  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   /* The descriptor lives in src1's slot (bits3); zero it first. */
   brw_set_src1(p, insn, brw_imm_d(0));
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
      insn->bits3.math_gen5.header_present = 0;
      insn->bits3.math_gen5.response_length = response_length;
      insn->bits3.math_gen5.msg_length = msg_length;
      insn->bits3.math_gen5.end_of_thread = 0;
      /* Gen5 carries the SFID and EOT in bits2 of the SEND. */
      insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
      insn->bits2.send_gen5.end_of_thread = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = saturate;
      insn->bits3.math.data_type = dataType;
      insn->bits3.math.response_length = response_length;
      insn->bits3.math.msg_length = msg_length;
      insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
      insn->bits3.math.end_of_thread = 0;
   }
}
438
439
/**
 * Fill out the message descriptor for a URB FF_SYNC message
 * (urb opcode 1).  Most URB descriptor fields are unused by FF_SYNC
 * and are zeroed; the header is always present and the message payload
 * is a single register.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    GLuint response_length,
				    bool end_of_thread)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   /* The descriptor lives in src1's slot (bits3); zero it first. */
   brw_set_src1(p, insn, brw_imm_d(0));

   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.header_present = 1;
   insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
   insn->bits3.urb_gen5.msg_length = 1;
   insn->bits3.urb_gen5.end_of_thread = end_of_thread;
   if (intel->gen >= 6) {
      /* On gen6+ the SFID moved into the destreg/condmod bits. */
      insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
   } else {
      insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
      insn->bits2.send_gen5.end_of_thread = end_of_thread;
   }
}
467
/**
 * Fill out the message descriptor for a URB write message, selecting
 * the per-generation descriptor layout (gen7 / gen5-6 / gen4).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 bool allocate,
				 bool used,
				 GLuint msg_length,
				 GLuint response_length,
				 bool end_of_thread,
				 bool complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   /* The descriptor lives in src1's slot (bits3); zero it first. */
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7's descriptor has no transpose swizzle encoding. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
      insn->bits3.urb_gen7.header_present = 1;
      insn->bits3.urb_gen7.response_length = response_length;
      insn->bits3.urb_gen7.msg_length = msg_length;
      insn->bits3.urb_gen7.end_of_thread = end_of_thread;
      insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
      insn->bits3.urb_gen5.header_present = 1;
      insn->bits3.urb_gen5.response_length = response_length;
      insn->bits3.urb_gen5.msg_length = msg_length;
      insn->bits3.urb_gen5.end_of_thread = end_of_thread;
      if (intel->gen >= 6) {
	 /* For SNB, the SFID bits moved to the condmod bits, and
	  * EOT stayed in bits3 above.  Does the EOT bit setting
	  * below on Ironlake even do anything?
	  */
	 insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
      } else {
	 insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
	 insn->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
      insn->bits3.urb.response_length = response_length;
      insn->bits3.urb.msg_length = msg_length;
      insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
      insn->bits3.urb.end_of_thread = end_of_thread;
   }
}
530
/**
 * Fill out the message descriptor for a data port write message,
 * selecting the per-generation layout (gen7 / gen6 / gen5 / gen4).
 *
 * On gen7 the target cache (SFID) depends on the message type: render
 * target writes go to the render cache, everything else to the data
 * cache.  On gen6 all writes go through the render cache.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint pixel_scoreboard_clear,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   /* The descriptor lives in src1's slot (bits3); zero it first. */
   brw_set_src1(p, insn, brw_imm_ud(0));

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      unsigned sfid = GEN7_MESSAGE_TARGET_DP_DATA_CACHE;
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;

      insn->header.destreg__conditionalmod = sfid;

      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.pixel_scoreboard_clear = pixel_scoreboard_clear;
      insn->bits3.gen7_dp.msg_type = msg_type;
      insn->bits3.gen7_dp.header_present = header_present;
      insn->bits3.gen7_dp.response_length = response_length;
      insn->bits3.gen7_dp.msg_length = msg_length;
      insn->bits3.gen7_dp.end_of_thread = end_of_thread;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.pixel_scoreboard_clear = pixel_scoreboard_clear;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
      insn->bits3.gen6_dp.header_present = header_present;
      insn->bits3.gen6_dp.response_length = response_length;
      insn->bits3.gen6_dp.msg_length = msg_length;
      insn->bits3.gen6_dp.end_of_thread = end_of_thread;

      /* We always use the render cache for write messages */
      insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
      insn->bits3.dp_write_gen5.header_present = header_present;
      insn->bits3.dp_write_gen5.response_length = response_length;
      insn->bits3.dp_write_gen5.msg_length = msg_length;
      insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
      /* Gen5 carries the SFID and EOT in bits2 of the SEND. */
      insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
      insn->bits2.send_gen5.end_of_thread = end_of_thread;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
      insn->bits3.dp_write.response_length = response_length;
      insn->bits3.dp_write.msg_length = msg_length;
      insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
      insn->bits3.dp_write.end_of_thread = end_of_thread;
   }
}
601
/**
 * Fill out the message descriptor for a data port read message,
 * selecting the per-generation layout (gen7 / gen6 / gen5 / g4x /
 * gen4).  Reads always carry a header and never terminate the thread.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   /* The descriptor lives in src1's slot (bits3); zero it first. */
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.pixel_scoreboard_clear = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
      insn->bits3.gen7_dp.header_present = 1;
      insn->bits3.gen7_dp.response_length = response_length;
      insn->bits3.gen7_dp.msg_length = msg_length;
      insn->bits3.gen7_dp.end_of_thread = 0;
      insn->header.destreg__conditionalmod = GEN7_MESSAGE_TARGET_DP_DATA_CACHE;
   } else if (intel->gen == 6) {
      uint32_t target_function;

      /* Gen6 splits the data port into render-cache and sampler-cache
       * shared functions; pick the SFID from the requested cache.
       */
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 target_function = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
      else
	 target_function = GEN6_MESSAGE_TARGET_DP_SAMPLER_CACHE;

      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.pixel_scoreboard_clear = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
      insn->bits3.gen6_dp.header_present = 1;
      insn->bits3.gen6_dp.response_length = response_length;
      insn->bits3.gen6_dp.msg_length = msg_length;
      insn->bits3.gen6_dp.end_of_thread = 0;
      insn->header.destreg__conditionalmod = target_function;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
      insn->bits3.dp_read_gen5.header_present = 1;
      insn->bits3.dp_read_gen5.response_length = response_length;
      insn->bits3.dp_read_gen5.msg_length = msg_length;
      insn->bits3.dp_read_gen5.pad1 = 0;
      insn->bits3.dp_read_gen5.end_of_thread = 0;
      /* Gen5 carries the SFID and EOT in bits2 of the SEND. */
      insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
      insn->bits2.send_gen5.end_of_thread = 0;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
      insn->bits3.dp_read_g4x.response_length = response_length;  /*16:19*/
      insn->bits3.dp_read_g4x.msg_length = msg_length;  /*20:23*/
      insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
      insn->bits3.dp_read_g4x.pad1 = 0;
      insn->bits3.dp_read_g4x.end_of_thread = 0;
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
      insn->bits3.dp_read.response_length = response_length;  /*16:19*/
      insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
      insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
      insn->bits3.dp_read.pad1 = 0;  /*28:30*/
      insn->bits3.dp_read.end_of_thread = 0;  /*31*/
   }
}
678
/**
 * Fill out the message descriptor for a SEND to the sampler, selecting
 * the per-generation layout (gen7 / gen5-6 / g4x / gen4).
 *
 * eot must be false: sampler messages never terminate the thread.
 */
static void brw_set_sampler_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    GLuint binding_table_index,
                                    GLuint sampler,
                                    GLuint msg_type,
                                    GLuint response_length,
                                    GLuint msg_length,
                                    bool eot,
                                    GLuint header_present,
                                    GLuint simd_mode)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   assert(eot == 0);
   /* The descriptor lives in src1's slot (bits3); zero it first. */
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
      insn->bits3.sampler_gen7.header_present = header_present;
      insn->bits3.sampler_gen7.response_length = response_length;
      insn->bits3.sampler_gen7.msg_length = msg_length;
      insn->bits3.sampler_gen7.end_of_thread = eot;
      insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
      insn->bits3.sampler_gen5.header_present = header_present;
      insn->bits3.sampler_gen5.response_length = response_length;
      insn->bits3.sampler_gen5.msg_length = msg_length;
      insn->bits3.sampler_gen5.end_of_thread = eot;
      if (intel->gen >= 6)
	  insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
      else {
	  /* Gen5 carries the SFID and EOT in bits2 of the SEND. */
	  insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
	  insn->bits2.send_gen5.end_of_thread = eot;
      }
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
      insn->bits3.sampler_g4x.response_length = response_length;
      insn->bits3.sampler_g4x.msg_length = msg_length;
      insn->bits3.sampler_g4x.end_of_thread = eot;
      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      /* Gen4 also encodes a return format; always FLOAT32 here. */
      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      insn->bits3.sampler.response_length = response_length;
      insn->bits3.sampler.msg_length = msg_length;
      insn->bits3.sampler.end_of_thread = eot;
      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
   }
}
739
740
#define next_insn brw_next_insn
/**
 * Allocate the next instruction slot in the program store, initialize
 * it from the current default instruction state (p->current), and set
 * its opcode.  Returns the new instruction for further field setup.
 */
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);

   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   /* destreg__conditionalmod in the default state applies only to the
    * instruction just emitted; consume it and restore normal
    * predication for subsequent instructions.
    */
   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
763
764static struct brw_instruction *brw_alu1( struct brw_compile *p,
765					 GLuint opcode,
766					 struct brw_reg dest,
767					 struct brw_reg src )
768{
769   struct brw_instruction *insn = next_insn(p, opcode);
770   brw_set_dest(p, insn, dest);
771   brw_set_src0(p, insn, src);
772   return insn;
773}
774
775static struct brw_instruction *brw_alu2(struct brw_compile *p,
776					GLuint opcode,
777					struct brw_reg dest,
778					struct brw_reg src0,
779					struct brw_reg src1 )
780{
781   struct brw_instruction *insn = next_insn(p, opcode);
782   brw_set_dest(p, insn, dest);
783   brw_set_src0(p, insn, src0);
784   brw_set_src1(p, insn, src1);
785   return insn;
786}
787
788
789/***********************************************************************
790 * Convenience routines.
791 */
/* ALU1(OP): define brw_##OP(), emitting a one-source instruction with
 * opcode BRW_OPCODE_##OP.
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* ALU2(OP): define brw_##OP(), emitting a two-source instruction with
 * opcode BRW_OPCODE_##OP.
 */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
808
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 *
 * ROUND(OP): define brw_##OP() emitting the rounding instruction and,
 * pre-gen6, the predicated fix-up ADD described above.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->intel.gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
833
834
/* Instantiate the simple ALU emitters (brw_MOV(), brw_SEL(), ...).
 * ADD and MUL are defined by hand below because they need extra
 * operand-type checking.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)


/* Two-instruction rounding emitters (see ROUND above). */
ROUND(RNDZ)
ROUND(RNDE)
861
862
863struct brw_instruction *brw_ADD(struct brw_compile *p,
864				struct brw_reg dest,
865				struct brw_reg src0,
866				struct brw_reg src1)
867{
868   /* 6.2.2: add */
869   if (src0.type == BRW_REGISTER_TYPE_F ||
870       (src0.file == BRW_IMMEDIATE_VALUE &&
871	src0.type == BRW_REGISTER_TYPE_VF)) {
872      assert(src1.type != BRW_REGISTER_TYPE_UD);
873      assert(src1.type != BRW_REGISTER_TYPE_D);
874   }
875
876   if (src1.type == BRW_REGISTER_TYPE_F ||
877       (src1.file == BRW_IMMEDIATE_VALUE &&
878	src1.type == BRW_REGISTER_TYPE_VF)) {
879      assert(src0.type != BRW_REGISTER_TYPE_UD);
880      assert(src0.type != BRW_REGISTER_TYPE_D);
881   }
882
883   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
884}
885
886struct brw_instruction *brw_MUL(struct brw_compile *p,
887				struct brw_reg dest,
888				struct brw_reg src0,
889				struct brw_reg src1)
890{
891   /* 6.32.38: mul */
892   if (src0.type == BRW_REGISTER_TYPE_D ||
893       src0.type == BRW_REGISTER_TYPE_UD ||
894       src1.type == BRW_REGISTER_TYPE_D ||
895       src1.type == BRW_REGISTER_TYPE_UD) {
896      assert(dest.type != BRW_REGISTER_TYPE_F);
897   }
898
899   if (src0.type == BRW_REGISTER_TYPE_F ||
900       (src0.file == BRW_IMMEDIATE_VALUE &&
901	src0.type == BRW_REGISTER_TYPE_VF)) {
902      assert(src1.type != BRW_REGISTER_TYPE_UD);
903      assert(src1.type != BRW_REGISTER_TYPE_D);
904   }
905
906   if (src1.type == BRW_REGISTER_TYPE_F ||
907       (src1.file == BRW_IMMEDIATE_VALUE &&
908	src1.type == BRW_REGISTER_TYPE_VF)) {
909      assert(src0.type != BRW_REGISTER_TYPE_UD);
910      assert(src0.type != BRW_REGISTER_TYPE_D);
911   }
912
913   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
914	  src0.nr != BRW_ARF_ACCUMULATOR);
915   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
916	  src1.nr != BRW_ARF_ACCUMULATOR);
917
918   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
919}
920
921
922void brw_NOP(struct brw_compile *p)
923{
924   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
925   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
926   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
927   brw_set_src1(p, insn, brw_imm_ud(0x0));
928}
929
930
931
932
933
934/***********************************************************************
935 * Comparisons, if/else/endif
936 */
937
/* Emit a JMPI (jump-indexed) instruction.  JMPI is a scalar, unmasked,
 * uncompressed jump; src1 typically holds the (possibly later-patched)
 * relative jump distance -- see brw_land_fwd_jump().
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   /* NOTE(review): this is the raw field value 1, not a BRW_EXECUTE_*
    * enum name (BRW_EXECUTE_1 encodes as 0) -- confirm the intended
    * execution size for JMPI.
    */
   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* Clear the default predication so subsequently emitted instructions
    * are not predicated.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
953
/* Push an IF (or ELSE) instruction onto the if-stack so the matching
 * brw_ENDIF() can later find and patch it, growing the stack on demand.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst;

   p->if_stack_depth++;
   /* Grow after the store: the invariant is that capacity strictly
    * exceeds depth on entry, so the store above was in bounds and the
    * next call's store will be too.
    */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, struct brw_instruction *,
			     p->if_stack_array_size);
   }
}
966
967/* EU takes the value from the flag register and pushes it onto some
968 * sort of a stack (presumably merging with any flag value already on
969 * the stack).  Within an if block, the flags at the top of the stack
970 * control execution on each channel of the unit, eg. on each of the
971 * 16 pixel values in our wm programs.
972 *
973 * When the matching 'else' instruction is reached (presumably by
974 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
976 *
977 * When the matching 'endif' instruction is reached, the flags are
978 * popped off.  If the stack is now empty, normal execution resumes.
979 */
/* Emit an IF instruction executing over execute_size channels and push
 * it on the if-stack.  The jump targets are left as zero here and are
 * patched in later by brw_ENDIF() via patch_IF_ELSE().
 *
 * Operand conventions differ per generation:
 *  - gen < 6:  IF is IP-relative, so dest/src0 are the IP register and
 *              src1 is an immediate that will receive the jump count.
 *  - gen == 6: the jump count lives in bits1.branch_gen6; sources are null.
 *  - gen >= 7: JIP/UIP live in bits3.break_cont; all operands are null.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   /* The IF itself is predicated on the current flag value; channels
    * that fail the predicate are disabled inside the block.
    */
   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let the IF's predication leak into subsequent instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   return insn;
}
1019
1020/* This function is only used for gen6-style IF instructions with an
1021 * embedded comparison (conditional modifier).  It is not used on gen7.
1022 */
/* Emit a gen6 IF with an embedded comparison: instead of predicating on
 * a previously computed flag, the IF itself compares src0 and src1 using
 * `conditional`.  The jump count is patched later by brw_ENDIF().
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* dest is the immediate word that will hold the gen6 jump count. */
   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* For IF, the conditional-modifier field holds the comparison. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1051
1052/**
1053 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1054 */
/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 *
 * if_inst must be a previously emitted IF with execution size 1;
 * else_inst is the matching ELSE or NULL if the block has none.
 * The instructions are rewritten in place; no ENDIF is emitted.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* IP offsets are in bytes; each instruction is 16 bytes. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1091
1092/**
1093 * Patch IF and ELSE instructions with appropriate jump targets.
1094 */
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * Called from brw_ENDIF() once the position of the ENDIF is known.
 * The target encodings differ per generation: pre-gen6 uses
 * bits3.if_else jump/pop counts (turning a bare IF into IFF), gen6 uses
 * bits1.branch_gen6.jump_count, and gen7 uses bits3.break_cont JIP/UIP.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   assert(!p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1167
/* Emit an ELSE instruction and push it on the if-stack so that
 * brw_ENDIF() can pair it with its IF and patch both.  Like brw_IF(),
 * its jump targets start at zero and are filled in by patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Per-gen operand conventions mirror brw_IF(). */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1200
1201void
1202brw_ENDIF(struct brw_compile *p)
1203{
1204   struct intel_context *intel = &p->brw->intel;
1205   struct brw_instruction *insn;
1206   struct brw_instruction *else_inst = NULL;
1207   struct brw_instruction *if_inst = NULL;
1208
1209   /* Pop the IF and (optional) ELSE instructions from the stack */
1210   p->if_stack_depth--;
1211   if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
1212      else_inst = p->if_stack[p->if_stack_depth];
1213      p->if_stack_depth--;
1214   }
1215   if_inst = p->if_stack[p->if_stack_depth];
1216
1217   if (p->single_program_flow) {
1218      /* ENDIF is useless; don't bother emitting it. */
1219      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1220      return;
1221   }
1222
1223   insn = next_insn(p, BRW_OPCODE_ENDIF);
1224
1225   if (intel->gen < 6) {
1226      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1227      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1228      brw_set_src1(p, insn, brw_imm_d(0x0));
1229   } else if (intel->gen == 6) {
1230      brw_set_dest(p, insn, brw_imm_w(0));
1231      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1232      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1233   } else {
1234      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1235      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1236      brw_set_src1(p, insn, brw_imm_ud(0));
1237   }
1238
1239   insn->header.compression_control = BRW_COMPRESSION_NONE;
1240   insn->header.mask_control = BRW_MASK_ENABLE;
1241   insn->header.thread_control = BRW_THREAD_SWITCH;
1242
1243   /* Also pop item off the stack in the endif instruction: */
1244   if (intel->gen < 6) {
1245      insn->bits3.if_else.jump_count = 0;
1246      insn->bits3.if_else.pop_count = 1;
1247      insn->bits3.if_else.pad0 = 0;
1248   } else if (intel->gen == 6) {
1249      insn->bits1.branch_gen6.jump_count = 2;
1250   } else {
1251      insn->bits3.break_cont.jip = 2;
1252   }
1253   patch_IF_ELSE(p, if_inst, else_inst, insn);
1254}
1255
/* Emit a BREAK instruction to exit the innermost loop.  pop_count is the
 * number of mask-stack entries to pop on pre-gen6 hardware (one per
 * enclosing IF/ELSE level being exited); it is unused on gen6+, which
 * has no mask stack.  The jump targets are patched by the loop emitter.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = pop_count;
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1278
1279struct brw_instruction *gen6_CONT(struct brw_compile *p,
1280				  struct brw_instruction *do_insn)
1281{
1282   struct brw_instruction *insn;
1283
1284   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1285   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1286   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1287   brw_set_dest(p, insn, brw_ip_reg());
1288   brw_set_src0(p, insn, brw_ip_reg());
1289   brw_set_src1(p, insn, brw_imm_d(0x0));
1290
1291   insn->header.compression_control = BRW_COMPRESSION_NONE;
1292   insn->header.execution_size = BRW_EXECUTE_8;
1293   return insn;
1294}
1295
/* Emit a pre-gen6 CONTINUE instruction jumping back to the top of the
 * loop.  pop_count is the number of mask-stack entries to pop (one per
 * IF/ELSE level between the CONTINUE and the loop).  The jump target is
 * patched later by the loop emitter.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = pop_count;
   return insn;
}
1310
1311/* DO/WHILE loop:
1312 *
1313 * The DO/WHILE is just an unterminated loop -- break or continue are
1314 * used for control within the loop.  We have a few ways they can be
1315 * done.
1316 *
1317 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1318 * jip and no DO instruction.
1319 *
1320 * For non-uniform control flow pre-gen6, there's a DO instruction to
1321 * push the mask, and a WHILE to jump back, and BREAK to get out and
1322 * pop the mask.
1323 *
1324 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1325 * just points back to the first instruction of the loop.
1326 */
/* Begin a DO/WHILE loop.  On gen6+ (no mask stack) and in
 * single-program-flow mode no DO instruction is needed, so just return
 * a pointer to the loop's first instruction slot for brw_WHILE() to
 * jump back to.  Pre-gen6, emit an actual DO to push the channel mask.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      /* Not a real instruction -- a marker for brw_WHILE(). */
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1351
1352
1353
/* Close a DO/WHILE loop by emitting the backwards jump to do_insn (the
 * instruction returned by brw_DO()).  The encoding of the backward
 * distance differs per generation: gen7 uses JIP, gen6 uses the gen6
 * jump count, and pre-gen6 uses either a plain IP ADD (SPF mode) or a
 * WHILE with an if_else jump count.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p,
                                  struct brw_instruction *do_insn)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint br = 1;

   /* Jump counts are in 64-bit chunks from gen5 on; one 128-bit
    * instruction is 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* Negative: jump backwards to the top of the loop. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* Uniform control flow: the WHILE is just an IP-relative ADD
	  * (offsets in bytes, 16 per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   /* Clear any default predication for subsequently emitted code. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1410
1411
1412/* FORWARD JUMPS:
1413 */
/* FORWARD JUMPS:
 *
 * Patch a previously emitted JMPI (with an immediate src1) so that it
 * lands on the next instruction to be emitted.  Must be called at the
 * jump's landing point, before emitting that instruction.
 */
void brw_land_fwd_jump(struct brw_compile *p,
		       struct brw_instruction *jmp_insn)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *landing = &p->store[p->nr_insn];
   GLuint jmpi = 1;

   /* Distances are in 64-bit chunks from gen5 on (2 per instruction). */
   if (intel->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   /* -1 because the IP has already advanced past the JMPI itself. */
   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
}
1429
1430
1431
1432/* To integrate with the above, it makes sense that the comparison
1433 * instruction should populate the flag register.  It might be simpler
1434 * just to use the flag reg for most WM tasks?
1435 */
/* Emit a CMP of src0 against src1 using `conditional`
 * (BRW_CONDITIONAL_*), writing per-channel results to dest and updating
 * the flag register.  If dest is the null register, later instructions
 * are set up to be predicated on the freshly computed flag.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}
1462
/* Issue a 'wait' instruction on notification register n1; the host can
   program MMIO to wake up the thread. */
/* Emit a WAIT on notification register n1; the host can program MMIO to
 * wake the thread up.  WAIT requires scalar, unpredicated, uncompressed
 * execution, and reads and writes the same notification register.
 */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1477
1478
1479/***********************************************************************
1480 * Helpers for the various SEND message types:
1481 */
1482
1483/** Extended math function, float[8].
1484 */
/** Extended math function, float[8].
 *
 * On gen6+ this is a native MATH instruction (msg_reg_nr, data_type and
 * precision are ignored); on earlier hardware it is a SEND to the
 * shared math unit, with `function` selecting the operation
 * (BRW_MATH_FUNCTION_*) and `saturate` applied to the result.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint saturate,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      /* Gen6 MATH only operates on packed GRF operands. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      /* Integer division expects integer sources; everything else
       * operates on floats.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   saturate,
			   data_type);
   }
}
1546
1547/** Extended math function, float[8].
1548 */
/** Extended math function, float[8].
 *
 * Two-source variant (e.g. POW, integer divide) of the gen6+ native
 * MATH instruction; not available as a SEND, hence the gen >= 6 assert.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);
   (void) intel;   /* only used by the assert in release builds */


   /* Gen6 MATH only operates on packed GRF operands. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);

   /* Integer division expects integer sources; everything else
    * operates on floats.
    */
   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions. */
   assert(!src0.negate);
   assert(!src0.abs);
   assert(!src1.negate);
   assert(!src1.abs);

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1595
1596/**
1597 * Extended math function, float[16].
1598 * Use 2 send instructions.
1599 */
/**
 * Extended math function, float[16].
 * Use 2 send instructions.
 *
 * On gen6+ a single native MATH handles 16 channels; pre-gen6 the math
 * unit is driven by two SENDs, one per SIMD8 half, with the second half
 * marked BRW_COMPRESSION_2NDHALF and using the next message register.
 */
void brw_math_16( struct brw_compile *p,
		  struct brw_reg dest,
		  GLuint function,
		  GLuint saturate,
		  GLuint msg_reg_nr,
		  struct brw_reg src,
		  GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_MATH);

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
      return;
   }

   /* First instruction:
    */
   brw_push_insn_state(p);
   brw_set_predicate_control_flag_value(p, 0xff);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   /* Second instruction:
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
   insn->header.destreg__conditionalmod = msg_reg_nr+1;

   /* NOTE(review): the second half writes offset(dest,1) but reads the
    * same src register as the first half -- confirm this is intended
    * (the payload presumably lives in the message registers).
    */
   brw_set_dest(p, insn, offset(dest,1));
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   brw_pop_insn_state(p);
}
1667
1668
1669/**
1670 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1671 * using a constant offset per channel.
1672 *
1673 * The offset must be aligned to oword size (16 bytes).  Used for
1674 * register spilling.
1675 */
/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 *
 * mrf is the first message register; the payload is the header in mrf
 * plus the num_regs (1 or 2) data registers that follow it.
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   /* Gen6 takes the offset in owords rather than bytes. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* mlen = 1 header register + num_regs data registers. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* The SEND itself is never compressed; widen the header source to
       * cover 16 channels instead if we were in a compressed section.
       */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (intel->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* pixel scoreboard */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1777
1778
1779/**
1780 * Read a block of owords (half a GRF each) from the scratch buffer
1781 * using a constant index per channel.
1782 *
1783 * Offset must be aligned to oword size (16 bytes).  Used for register
1784 * spilling.
1785 */
1786void
1787brw_oword_block_read_scratch(struct brw_compile *p,
1788			     struct brw_reg dest,
1789			     struct brw_reg mrf,
1790			     int num_regs,
1791			     GLuint offset)
1792{
1793   struct intel_context *intel = &p->brw->intel;
1794   uint32_t msg_control;
1795   int rlen;
1796
1797   if (intel->gen >= 6)
1798      offset /= 16;
1799
1800   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1801   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1802
1803   if (num_regs == 1) {
1804      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1805      rlen = 1;
1806   } else {
1807      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1808      rlen = 2;
1809   }
1810
1811   {
1812      brw_push_insn_state(p);
1813      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1814      brw_set_mask_control(p, BRW_MASK_DISABLE);
1815
1816      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1817
1818      /* set message header global offset field (reg 0, element 2) */
1819      brw_MOV(p,
1820	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1821				  mrf.nr,
1822				  2), BRW_REGISTER_TYPE_UD),
1823	      brw_imm_ud(offset));
1824
1825      brw_pop_insn_state(p);
1826   }
1827
1828   {
1829      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1830
1831      assert(insn->header.predicate_control == 0);
1832      insn->header.compression_control = BRW_COMPRESSION_NONE;
1833      insn->header.destreg__conditionalmod = mrf.nr;
1834
1835      brw_set_dest(p, insn, dest);	/* UW? */
1836      if (intel->gen >= 6) {
1837	 brw_set_src0(p, insn, mrf);
1838      } else {
1839	 brw_set_src0(p, insn, brw_null_reg());
1840      }
1841
1842      brw_set_dp_read_message(p,
1843			      insn,
1844			      255, /* binding table index (255=stateless) */
1845			      msg_control,
1846			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1847			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1848			      1, /* msg_length */
1849			      rlen);
1850   }
1851}
1852
1853/**
1854 * Read a float[4] vector from the data port Data Cache (const buffer).
1855 * Location (in buffer) should be a multiple of 16.
1856 * Used for fetching shader constants.
1857 */
1858void brw_oword_block_read(struct brw_compile *p,
1859			  struct brw_reg dest,
1860			  struct brw_reg mrf,
1861			  uint32_t offset,
1862			  uint32_t bind_table_index)
1863{
1864   struct intel_context *intel = &p->brw->intel;
1865
1866   /* On newer hardware, offset is in units of owords. */
1867   if (intel->gen >= 6)
1868      offset /= 16;
1869
1870   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1871
1872   brw_push_insn_state(p);
1873   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1874   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1875   brw_set_mask_control(p, BRW_MASK_DISABLE);
1876
1877   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1878
1879   /* set message header global offset field (reg 0, element 2) */
1880   brw_MOV(p,
1881	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1882			       mrf.nr,
1883			       2), BRW_REGISTER_TYPE_UD),
1884	   brw_imm_ud(offset));
1885
1886   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1887   insn->header.destreg__conditionalmod = mrf.nr;
1888
1889   /* cast dest to a uword[8] vector */
1890   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1891
1892   brw_set_dest(p, insn, dest);
1893   if (intel->gen >= 6) {
1894      brw_set_src0(p, insn, mrf);
1895   } else {
1896      brw_set_src0(p, insn, brw_null_reg());
1897   }
1898
1899   brw_set_dp_read_message(p,
1900			   insn,
1901			   bind_table_index,
1902			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1903			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1904			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1905			   1, /* msg_length */
1906			   1); /* response_length (1 reg, 2 owords!) */
1907
1908   brw_pop_insn_state(p);
1909}
1910
1911/**
1912 * Read a set of dwords from the data port Data Cache (const buffer).
1913 *
1914 * Location (in buffer) appears as UD offsets in the register after
1915 * the provided mrf header reg.
1916 */
1917void brw_dword_scattered_read(struct brw_compile *p,
1918			      struct brw_reg dest,
1919			      struct brw_reg mrf,
1920			      uint32_t bind_table_index)
1921{
1922   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1923
1924   brw_push_insn_state(p);
1925   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1926   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1927   brw_set_mask_control(p, BRW_MASK_DISABLE);
1928   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1929   brw_pop_insn_state(p);
1930
1931   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1932   insn->header.destreg__conditionalmod = mrf.nr;
1933
1934   /* cast dest to a uword[8] vector */
1935   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1936
1937   brw_set_dest(p, insn, dest);
1938   brw_set_src0(p, insn, brw_null_reg());
1939
1940   brw_set_dp_read_message(p,
1941			   insn,
1942			   bind_table_index,
1943			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1944			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1945			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1946			   2, /* msg_length */
1947			   1); /* response_length */
1948}
1949
1950
1951
1952/**
1953 * Read float[4] constant(s) from VS constant buffer.
1954 * For relative addressing, two float[4] constants will be read into 'dest'.
1955 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1956 */
1957void brw_dp_READ_4_vs(struct brw_compile *p,
1958                      struct brw_reg dest,
1959                      GLuint location,
1960                      GLuint bind_table_index)
1961{
1962   struct intel_context *intel = &p->brw->intel;
1963   struct brw_instruction *insn;
1964   GLuint msg_reg_nr = 1;
1965
1966   if (intel->gen >= 6)
1967      location /= 16;
1968
1969   /* Setup MRF[1] with location/offset into const buffer */
1970   brw_push_insn_state(p);
1971   brw_set_access_mode(p, BRW_ALIGN_1);
1972   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1973   brw_set_mask_control(p, BRW_MASK_DISABLE);
1974   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1975   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1976		     BRW_REGISTER_TYPE_UD),
1977	   brw_imm_ud(location));
1978   brw_pop_insn_state(p);
1979
1980   insn = next_insn(p, BRW_OPCODE_SEND);
1981
1982   insn->header.predicate_control = BRW_PREDICATE_NONE;
1983   insn->header.compression_control = BRW_COMPRESSION_NONE;
1984   insn->header.destreg__conditionalmod = msg_reg_nr;
1985   insn->header.mask_control = BRW_MASK_DISABLE;
1986
1987   brw_set_dest(p, insn, dest);
1988   if (intel->gen >= 6) {
1989      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
1990   } else {
1991      brw_set_src0(p, insn, brw_null_reg());
1992   }
1993
1994   brw_set_dp_read_message(p,
1995			   insn,
1996			   bind_table_index,
1997			   0,
1998			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1999			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2000			   1, /* msg_length */
2001			   1); /* response_length (1 Oword) */
2002}
2003
2004/**
2005 * Read a float[4] constant per vertex from VS constant buffer, with
2006 * relative addressing.
2007 */
2008void brw_dp_READ_4_vs_relative(struct brw_compile *p,
2009			       struct brw_reg dest,
2010			       struct brw_reg addr_reg,
2011			       GLuint offset,
2012			       GLuint bind_table_index)
2013{
2014   struct intel_context *intel = &p->brw->intel;
2015   struct brw_reg src = brw_vec8_grf(0, 0);
2016   int msg_type;
2017
2018   /* Setup MRF[1] with offset into const buffer */
2019   brw_push_insn_state(p);
2020   brw_set_access_mode(p, BRW_ALIGN_1);
2021   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2022   brw_set_mask_control(p, BRW_MASK_DISABLE);
2023   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2024
2025   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
2026    * fields ignored.
2027    */
2028   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
2029	   addr_reg, brw_imm_d(offset));
2030   brw_pop_insn_state(p);
2031
2032   gen6_resolve_implied_move(p, &src, 0);
2033   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2034
2035   insn->header.predicate_control = BRW_PREDICATE_NONE;
2036   insn->header.compression_control = BRW_COMPRESSION_NONE;
2037   insn->header.destreg__conditionalmod = 0;
2038   insn->header.mask_control = BRW_MASK_DISABLE;
2039
2040   brw_set_dest(p, insn, dest);
2041   brw_set_src0(p, insn, src);
2042
2043   if (intel->gen >= 6)
2044      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2045   else if (intel->gen == 5 || intel->is_g4x)
2046      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2047   else
2048      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
2049
2050   brw_set_dp_read_message(p,
2051			   insn,
2052			   bind_table_index,
2053			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2054			   msg_type,
2055			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2056			   2, /* msg_length */
2057			   1); /* response_length */
2058}
2059
2060
2061
2062void brw_fb_WRITE(struct brw_compile *p,
2063		  int dispatch_width,
2064                  GLuint msg_reg_nr,
2065                  struct brw_reg src0,
2066                  GLuint binding_table_index,
2067                  GLuint msg_length,
2068                  GLuint response_length,
2069                  bool eot,
2070                  bool header_present)
2071{
2072   struct intel_context *intel = &p->brw->intel;
2073   struct brw_instruction *insn;
2074   GLuint msg_control, msg_type;
2075   struct brw_reg dest;
2076
2077   if (dispatch_width == 16)
2078      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2079   else
2080      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2081
2082   if (intel->gen >= 6 && binding_table_index == 0) {
2083      insn = next_insn(p, BRW_OPCODE_SENDC);
2084   } else {
2085      insn = next_insn(p, BRW_OPCODE_SEND);
2086   }
2087   /* The execution mask is ignored for render target writes. */
2088   insn->header.predicate_control = 0;
2089   insn->header.compression_control = BRW_COMPRESSION_NONE;
2090
2091   if (intel->gen >= 6) {
2092      /* headerless version, just submit color payload */
2093      src0 = brw_message_reg(msg_reg_nr);
2094
2095      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2096   } else {
2097      insn->header.destreg__conditionalmod = msg_reg_nr;
2098
2099      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2100   }
2101
2102   if (dispatch_width == 16)
2103      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2104   else
2105      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2106
2107   brw_set_dest(p, insn, dest);
2108   brw_set_src0(p, insn, src0);
2109   brw_set_dp_write_message(p,
2110			    insn,
2111			    binding_table_index,
2112			    msg_control,
2113			    msg_type,
2114			    msg_length,
2115			    header_present,
2116			    1,	/* pixel scoreboard */
2117			    response_length,
2118			    eot,
2119			    0 /* send_commit_msg */);
2120}
2121
2122
2123/**
2124 * Texture sample instruction.
2125 * Note: the msg_type plus msg_length values determine exactly what kind
2126 * of sampling operation is performed.  See volume 4, page 161 of docs.
2127 */
2128void brw_SAMPLE(struct brw_compile *p,
2129		struct brw_reg dest,
2130		GLuint msg_reg_nr,
2131		struct brw_reg src0,
2132		GLuint binding_table_index,
2133		GLuint sampler,
2134		GLuint writemask,
2135		GLuint msg_type,
2136		GLuint response_length,
2137		GLuint msg_length,
2138		bool eot,
2139		GLuint header_present,
2140		GLuint simd_mode)
2141{
2142   struct intel_context *intel = &p->brw->intel;
2143   bool need_stall = 0;
2144
2145   if (writemask == 0) {
2146      /*printf("%s: zero writemask??\n", __FUNCTION__); */
2147      return;
2148   }
2149
2150   /* Hardware doesn't do destination dependency checking on send
2151    * instructions properly.  Add a workaround which generates the
2152    * dependency by other means.  In practice it seems like this bug
2153    * only crops up for texture samples, and only where registers are
2154    * written by the send and then written again later without being
2155    * read in between.  Luckily for us, we already track that
2156    * information and use it to modify the writemask for the
2157    * instruction, so that is a guide for whether a workaround is
2158    * needed.
2159    */
2160   if (writemask != WRITEMASK_XYZW) {
2161      GLuint dst_offset = 0;
2162      GLuint i, newmask = 0, len = 0;
2163
2164      for (i = 0; i < 4; i++) {
2165	 if (writemask & (1<<i))
2166	    break;
2167	 dst_offset += 2;
2168      }
2169      for (; i < 4; i++) {
2170	 if (!(writemask & (1<<i)))
2171	    break;
2172	 newmask |= 1<<i;
2173	 len++;
2174      }
2175
2176      if (newmask != writemask) {
2177	 need_stall = 1;
2178         /* printf("need stall %x %x\n", newmask , writemask); */
2179      }
2180      else {
2181	 bool dispatch_16 = false;
2182
2183	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2184
2185	 guess_execution_size(p, p->current, dest);
2186	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2187	    dispatch_16 = true;
2188
2189	 newmask = ~newmask & WRITEMASK_XYZW;
2190
2191	 brw_push_insn_state(p);
2192
2193	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2194	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2195
2196	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2197		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2198  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2199
2200	 brw_pop_insn_state(p);
2201
2202  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2203	 dest = offset(dest, dst_offset);
2204
2205	 /* For 16-wide dispatch, masked channels are skipped in the
2206	  * response.  For 8-wide, masked channels still take up slots,
2207	  * and are just not written to.
2208	  */
2209	 if (dispatch_16)
2210	    response_length = len * 2;
2211      }
2212   }
2213
2214   {
2215      struct brw_instruction *insn;
2216
2217      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2218
2219      insn = next_insn(p, BRW_OPCODE_SEND);
2220      insn->header.predicate_control = 0; /* XXX */
2221      insn->header.compression_control = BRW_COMPRESSION_NONE;
2222      if (intel->gen < 6)
2223	  insn->header.destreg__conditionalmod = msg_reg_nr;
2224
2225      brw_set_dest(p, insn, dest);
2226      brw_set_src0(p, insn, src0);
2227      brw_set_sampler_message(p, insn,
2228			      binding_table_index,
2229			      sampler,
2230			      msg_type,
2231			      response_length,
2232			      msg_length,
2233			      eot,
2234			      header_present,
2235			      simd_mode);
2236   }
2237
2238   if (need_stall) {
2239      struct brw_reg reg = vec8(offset(dest, response_length-1));
2240
2241      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2242       */
2243      brw_push_insn_state(p);
2244      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2245      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2246	      retype(reg, BRW_REGISTER_TYPE_UD));
2247      brw_pop_insn_state(p);
2248   }
2249
2250}
2251
2252/* All these variables are pretty confusing - we might be better off
2253 * using bitmasks and macros for this, in the old style.  Or perhaps
2254 * just having the caller instantiate the fields in dword3 itself.
2255 */
2256void brw_urb_WRITE(struct brw_compile *p,
2257		   struct brw_reg dest,
2258		   GLuint msg_reg_nr,
2259		   struct brw_reg src0,
2260		   bool allocate,
2261		   bool used,
2262		   GLuint msg_length,
2263		   GLuint response_length,
2264		   bool eot,
2265		   bool writes_complete,
2266		   GLuint offset,
2267		   GLuint swizzle)
2268{
2269   struct intel_context *intel = &p->brw->intel;
2270   struct brw_instruction *insn;
2271
2272   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2273
2274   if (intel->gen == 7) {
2275      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2276      brw_push_insn_state(p);
2277      brw_set_access_mode(p, BRW_ALIGN_1);
2278      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2279		       BRW_REGISTER_TYPE_UD),
2280	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2281		brw_imm_ud(0xff00));
2282      brw_pop_insn_state(p);
2283   }
2284
2285   insn = next_insn(p, BRW_OPCODE_SEND);
2286
2287   assert(msg_length < BRW_MAX_MRF);
2288
2289   brw_set_dest(p, insn, dest);
2290   brw_set_src0(p, insn, src0);
2291   brw_set_src1(p, insn, brw_imm_d(0));
2292
2293   if (intel->gen < 6)
2294      insn->header.destreg__conditionalmod = msg_reg_nr;
2295
2296   brw_set_urb_message(p,
2297		       insn,
2298		       allocate,
2299		       used,
2300		       msg_length,
2301		       response_length,
2302		       eot,
2303		       writes_complete,
2304		       offset,
2305		       swizzle);
2306}
2307
2308static int
2309brw_find_next_block_end(struct brw_compile *p, int start)
2310{
2311   int ip;
2312
2313   for (ip = start + 1; ip < p->nr_insn; ip++) {
2314      struct brw_instruction *insn = &p->store[ip];
2315
2316      switch (insn->header.opcode) {
2317      case BRW_OPCODE_ENDIF:
2318      case BRW_OPCODE_ELSE:
2319      case BRW_OPCODE_WHILE:
2320	 return ip;
2321      }
2322   }
2323   assert(!"not reached");
2324   return start + 1;
2325}
2326
2327/* There is no DO instruction on gen6, so to find the end of the loop
2328 * we have to see if the loop is jumping back before our start
2329 * instruction.
2330 */
2331static int
2332brw_find_loop_end(struct brw_compile *p, int start)
2333{
2334   struct intel_context *intel = &p->brw->intel;
2335   int ip;
2336   int br = 2;
2337
2338   for (ip = start + 1; ip < p->nr_insn; ip++) {
2339      struct brw_instruction *insn = &p->store[ip];
2340
2341      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2342	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2343				   : insn->bits3.break_cont.jip;
2344	 if (ip + jip / br <= start)
2345	    return ip;
2346      }
2347   }
2348   assert(!"not reached");
2349   return start + 1;
2350}
2351
2352/* After program generation, go back and update the UIP and JIP of
2353 * BREAK and CONT instructions to their correct locations.
2354 */
2355void
2356brw_set_uip_jip(struct brw_compile *p)
2357{
2358   struct intel_context *intel = &p->brw->intel;
2359   int ip;
2360   int br = 2;
2361
2362   if (intel->gen < 6)
2363      return;
2364
2365   for (ip = 0; ip < p->nr_insn; ip++) {
2366      struct brw_instruction *insn = &p->store[ip];
2367
2368      switch (insn->header.opcode) {
2369      case BRW_OPCODE_BREAK:
2370	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2371	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2372	 insn->bits3.break_cont.uip =
2373	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
2374	 break;
2375      case BRW_OPCODE_CONTINUE:
2376	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2377	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
2378
2379	 assert(insn->bits3.break_cont.uip != 0);
2380	 assert(insn->bits3.break_cont.jip != 0);
2381	 break;
2382      }
2383   }
2384}
2385
2386void brw_ff_sync(struct brw_compile *p,
2387		   struct brw_reg dest,
2388		   GLuint msg_reg_nr,
2389		   struct brw_reg src0,
2390		   bool allocate,
2391		   GLuint response_length,
2392		   bool eot)
2393{
2394   struct intel_context *intel = &p->brw->intel;
2395   struct brw_instruction *insn;
2396
2397   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2398
2399   insn = next_insn(p, BRW_OPCODE_SEND);
2400   brw_set_dest(p, insn, dest);
2401   brw_set_src0(p, insn, src0);
2402   brw_set_src1(p, insn, brw_imm_d(0));
2403
2404   if (intel->gen < 6)
2405      insn->header.destreg__conditionalmod = msg_reg_nr;
2406
2407   brw_set_ff_sync_message(p,
2408			   insn,
2409			   allocate,
2410			   response_length,
2411			   eot);
2412}
2413