brw_eu_emit.c revision 973b4ddd0e2f25cfd72cb945fbd38aed629a6fed
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  GLuint msg_reg_nr)
65{
66   struct intel_context *intel = &p->brw->intel;
67   if (intel->gen < 6)
68      return;
69
70   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
71      brw_push_insn_state(p);
72      brw_set_mask_control(p, BRW_MASK_DISABLE);
73      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
74      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
75	      retype(*src, BRW_REGISTER_TYPE_UD));
76      brw_pop_insn_state(p);
77   }
78   *src = brw_message_reg(msg_reg_nr);
79}
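
/* Illustrative usage (a sketch, not taken from this file's callers; "p",
 * "payload" and the message register number 2 are placeholders):
 *
 *    struct brw_reg payload = some_grf_value;
 *    gen6_resolve_implied_move(p, &payload, 2);
 *
 * After the call, "payload" refers to m2 and is the register to name as the
 * SEND source.
 */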
80
81static void
82gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
83{
84   struct intel_context *intel = &p->brw->intel;
85   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
86      reg->file = BRW_GENERAL_REGISTER_FILE;
87      reg->nr += 111;
88   }
89}
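
/* For illustration only (derived from the +111 offset above, not from a spec
 * quote): a hypothetical message register m4 on gen7 would be re-encoded as
 * the general register g115 (4 + 111).
 */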
90
91
92void
93brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
94	     struct brw_reg dest)
95{
96   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
97       dest.file != BRW_MESSAGE_REGISTER_FILE)
98      assert(dest.nr < 128);
99
100   gen7_convert_mrf_to_grf(p, &dest);
101
102   insn->bits1.da1.dest_reg_file = dest.file;
103   insn->bits1.da1.dest_reg_type = dest.type;
104   insn->bits1.da1.dest_address_mode = dest.address_mode;
105
106   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
107      insn->bits1.da1.dest_reg_nr = dest.nr;
108
109      if (insn->header.access_mode == BRW_ALIGN_1) {
110	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
111	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
112	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
113	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
114      }
115      else {
116	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
117	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
118	 /* even though it's ignored in da16, it still needs to be set to '01' */
119	 insn->bits1.da16.dest_horiz_stride = 1;
120      }
121   }
122   else {
123      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
124
125      /* These are different sizes in align1 vs align16:
126       */
127      if (insn->header.access_mode == BRW_ALIGN_1) {
128	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
129	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
130	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
131	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
132      }
133      else {
134	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
135	 /* even though it's ignored in ia16, it still needs to be set to '01' */
136	 insn->bits1.ia16.dest_horiz_stride = 1;
137      }
138   }
139
140   /* NEW: Set the execution size based on dest.width and
141    * insn->compression_control:
142    */
143   guess_execution_size(p, insn, dest);
144}
145
146extern int reg_type_size[];
147
148static void
149validate_reg(struct brw_instruction *insn, struct brw_reg reg)
150{
151   int hstride_for_reg[] = {0, 1, 2, 4};
152   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
153   int width_for_reg[] = {1, 2, 4, 8, 16};
154   int execsize_for_reg[] = {1, 2, 4, 8, 16};
155   int width, hstride, vstride, execsize;
156
157   if (reg.file == BRW_IMMEDIATE_VALUE) {
158      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
159       * mean the destination has to be 128-bit aligned and the
160       * destination horiz stride has to be a word.
161       */
162      if (reg.type == BRW_REGISTER_TYPE_V) {
163	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
164		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
165      }
166
167      return;
168   }
169
170   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
171       reg.nr == BRW_ARF_NULL)
172      return;
173
174   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
175   hstride = hstride_for_reg[reg.hstride];
176
177   if (reg.vstride == 0xf) {
178      vstride = -1;
179   } else {
180      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
181      vstride = vstride_for_reg[reg.vstride];
182   }
183
184   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
185   width = width_for_reg[reg.width];
186
187   assert(insn->header.execution_size >= 0 &&
188	  insn->header.execution_size < Elements(execsize_for_reg));
189   execsize = execsize_for_reg[insn->header.execution_size];
190
191   /* Restrictions from 3.3.10: Register Region Restrictions. */
192   /* 3. */
193   assert(execsize >= width);
194
195   /* 4. */
196   if (execsize == width && hstride != 0) {
197      assert(vstride == -1 || vstride == width * hstride);
198   }
199
200   /* 5. */
201   if (execsize == width && hstride == 0) {
202      /* no restriction on vstride. */
203   }
204
205   /* 6. */
206   if (width == 1) {
207      assert(hstride == 0);
208   }
209
210   /* 7. */
211   if (execsize == 1 && width == 1) {
212      assert(hstride == 0);
213      assert(vstride == 0);
214   }
215
216   /* 8. */
217   if (vstride == 0 && hstride == 0) {
218      assert(width == 1);
219   }
220
221   /* 10. Check destination issues. */
222}
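
/* Worked example of the region rules checked above (a sketch using the usual
 * <vstride; width, hstride> notation, not driver code): a row-contiguous
 * source such as <8;8,1> at execsize 8 satisfies restriction 4 because
 * vstride (8) == width (8) * hstride (1), while a replicated scalar is
 * expressed as <0;1,0>, matching restrictions 6-8.
 */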
223
224void
225brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
226	     struct brw_reg reg)
227{
228   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
229      assert(reg.nr < 128);
230
231   gen7_convert_mrf_to_grf(p, &reg);
232
233   validate_reg(insn, reg);
234
235   insn->bits1.da1.src0_reg_file = reg.file;
236   insn->bits1.da1.src0_reg_type = reg.type;
237   insn->bits2.da1.src0_abs = reg.abs;
238   insn->bits2.da1.src0_negate = reg.negate;
239   insn->bits2.da1.src0_address_mode = reg.address_mode;
240
241   if (reg.file == BRW_IMMEDIATE_VALUE) {
242      insn->bits3.ud = reg.dw1.ud;
243
244      /* Required to set some fields in src1 as well:
245       */
246      insn->bits1.da1.src1_reg_file = 0; /* arf */
247      insn->bits1.da1.src1_reg_type = reg.type;
248   }
249   else
250   {
251      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
252	 if (insn->header.access_mode == BRW_ALIGN_1) {
253	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
254	    insn->bits2.da1.src0_reg_nr = reg.nr;
255	 }
256	 else {
257	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
258	    insn->bits2.da16.src0_reg_nr = reg.nr;
259	 }
260      }
261      else {
262	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
263
264	 if (insn->header.access_mode == BRW_ALIGN_1) {
265	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
266	 }
267	 else {
268	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
269	 }
270      }
271
272      if (insn->header.access_mode == BRW_ALIGN_1) {
273	 if (reg.width == BRW_WIDTH_1 &&
274	     insn->header.execution_size == BRW_EXECUTE_1) {
275	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
276	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
277	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
278	 }
279	 else {
280	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
281	    insn->bits2.da1.src0_width = reg.width;
282	    insn->bits2.da1.src0_vert_stride = reg.vstride;
283	 }
284      }
285      else {
286	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
287	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
288	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
289	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
290
291	 /* This is an oddity of the fact we're using the same
292	  * descriptions for registers in align_16 as align_1:
293	  */
294	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
295	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
296	 else
297	    insn->bits2.da16.src0_vert_stride = reg.vstride;
298      }
299   }
300}
301
302
303void brw_set_src1(struct brw_compile *p,
304		  struct brw_instruction *insn,
305		  struct brw_reg reg)
306{
307   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
308
309   assert(reg.nr < 128);
310
311   gen7_convert_mrf_to_grf(p, &reg);
312
313   validate_reg(insn, reg);
314
315   insn->bits1.da1.src1_reg_file = reg.file;
316   insn->bits1.da1.src1_reg_type = reg.type;
317   insn->bits3.da1.src1_abs = reg.abs;
318   insn->bits3.da1.src1_negate = reg.negate;
319
320   /* Only src1 can be immediate in two-argument instructions.
321    */
322   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
323
324   if (reg.file == BRW_IMMEDIATE_VALUE) {
325      insn->bits3.ud = reg.dw1.ud;
326   }
327   else {
328      /* This is a hardware restriction, which may or may not be lifted
329       * in the future:
330       */
331      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
332      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
333
334      if (insn->header.access_mode == BRW_ALIGN_1) {
335	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
336	 insn->bits3.da1.src1_reg_nr = reg.nr;
337      }
338      else {
339	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
340	 insn->bits3.da16.src1_reg_nr = reg.nr;
341      }
342
343      if (insn->header.access_mode == BRW_ALIGN_1) {
344	 if (reg.width == BRW_WIDTH_1 &&
345	     insn->header.execution_size == BRW_EXECUTE_1) {
346	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
347	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
348	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
349	 }
350	 else {
351	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
352	    insn->bits3.da1.src1_width = reg.width;
353	    insn->bits3.da1.src1_vert_stride = reg.vstride;
354	 }
355      }
356      else {
357	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
358	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
359	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
360	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
361
362	 /* This is an oddity of the fact we're using the same
363	  * descriptions for registers in align_16 as align_1:
364	  */
365	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
366	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
367	 else
368	    insn->bits3.da16.src1_vert_stride = reg.vstride;
369      }
370   }
371}
372
373/**
374 * Set the Message Descriptor and Extended Message Descriptor fields
375 * for SEND messages.
376 *
377 * \note This zeroes out the Function Control bits, so it must be called
378 *       \b before filling out any message-specific data.  Callers can
379 *       choose not to fill in irrelevant bits; they will be zero.
380 */
381static void
382brw_set_message_descriptor(struct brw_compile *p,
383			   struct brw_instruction *inst,
384			   enum brw_message_target sfid,
385			   unsigned msg_length,
386			   unsigned response_length,
387			   bool header_present,
388			   bool end_of_thread)
389{
390   struct intel_context *intel = &p->brw->intel;
391
392   brw_set_src1(p, inst, brw_imm_d(0));
393
394   if (intel->gen >= 5) {
395      inst->bits3.generic_gen5.header_present = header_present;
396      inst->bits3.generic_gen5.response_length = response_length;
397      inst->bits3.generic_gen5.msg_length = msg_length;
398      inst->bits3.generic_gen5.end_of_thread = end_of_thread;
399
400      if (intel->gen >= 6) {
401	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
402	 inst->header.destreg__conditionalmod = sfid;
403      } else {
404	 /* Set Extended Message Descriptor (ex_desc) */
405	 inst->bits2.send_gen5.sfid = sfid;
406	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
407      }
408   } else {
409      inst->bits3.generic.response_length = response_length;
410      inst->bits3.generic.msg_length = msg_length;
411      inst->bits3.generic.msg_target = sfid;
412      inst->bits3.generic.end_of_thread = end_of_thread;
413   }
414}
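
/* Usage sketch (mirrors brw_set_ff_sync_message() below; "mlen" and "rlen"
 * are placeholders): the descriptor helper has to come first because it
 * rewrites src1/bits3, and only then are the message-specific function
 * control bits filled in:
 *
 *    brw_set_message_descriptor(p, insn, BRW_SFID_URB, mlen, rlen,
 *                               true, false);
 *    insn->bits3.urb_gen5.opcode = 1;
 */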
415
416static void brw_set_math_message( struct brw_compile *p,
417				  struct brw_instruction *insn,
418				  GLuint function,
419				  GLuint integer_type,
420				  bool low_precision,
421				  bool saturate,
422				  GLuint dataType )
423{
424   struct brw_context *brw = p->brw;
425   struct intel_context *intel = &brw->intel;
426   unsigned msg_length;
427   unsigned response_length;
428
429   /* Infer message length from the function */
430   switch (function) {
431   case BRW_MATH_FUNCTION_POW:
432   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
433   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
434   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
435      msg_length = 2;
436      break;
437   default:
438      msg_length = 1;
439      break;
440   }
441
442   /* Infer response length from the function */
443   switch (function) {
444   case BRW_MATH_FUNCTION_SINCOS:
445   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
446      response_length = 2;
447      break;
448   default:
449      response_length = 1;
450      break;
451   }
452
453   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
454			      msg_length, response_length, false, false);
455   if (intel->gen == 5) {
456      insn->bits3.math_gen5.function = function;
457      insn->bits3.math_gen5.int_type = integer_type;
458      insn->bits3.math_gen5.precision = low_precision;
459      insn->bits3.math_gen5.saturate = saturate;
460      insn->bits3.math_gen5.data_type = dataType;
461      insn->bits3.math_gen5.snapshot = 0;
462   } else {
463      insn->bits3.math.function = function;
464      insn->bits3.math.int_type = integer_type;
465      insn->bits3.math.precision = low_precision;
466      insn->bits3.math.saturate = saturate;
467      insn->bits3.math.data_type = dataType;
468   }
469}
470
471
472static void brw_set_ff_sync_message(struct brw_compile *p,
473				    struct brw_instruction *insn,
474				    bool allocate,
475				    GLuint response_length,
476				    bool end_of_thread)
477{
478   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
479			      1, response_length, true, end_of_thread);
480   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
481   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
482   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
483   insn->bits3.urb_gen5.allocate = allocate;
484   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
485   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
486}
487
488static void brw_set_urb_message( struct brw_compile *p,
489				 struct brw_instruction *insn,
490				 bool allocate,
491				 bool used,
492				 GLuint msg_length,
493				 GLuint response_length,
494				 bool end_of_thread,
495				 bool complete,
496				 GLuint offset,
497				 GLuint swizzle_control )
498{
499   struct brw_context *brw = p->brw;
500   struct intel_context *intel = &brw->intel;
501
502   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
503			      msg_length, response_length, true, end_of_thread);
504   if (intel->gen == 7) {
505      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
506      insn->bits3.urb_gen7.offset = offset;
507      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
508      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
509      /* per_slot_offset = 0 makes it ignore offsets in message header */
510      insn->bits3.urb_gen7.per_slot_offset = 0;
511      insn->bits3.urb_gen7.complete = complete;
512   } else if (intel->gen >= 5) {
513      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
514      insn->bits3.urb_gen5.offset = offset;
515      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
516      insn->bits3.urb_gen5.allocate = allocate;
517      insn->bits3.urb_gen5.used = used;	/* ? */
518      insn->bits3.urb_gen5.complete = complete;
519   } else {
520      insn->bits3.urb.opcode = 0;	/* ? */
521      insn->bits3.urb.offset = offset;
522      insn->bits3.urb.swizzle_control = swizzle_control;
523      insn->bits3.urb.allocate = allocate;
524      insn->bits3.urb.used = used;	/* ? */
525      insn->bits3.urb.complete = complete;
526   }
527}
528
529void
530brw_set_dp_write_message(struct brw_compile *p,
531			 struct brw_instruction *insn,
532			 GLuint binding_table_index,
533			 GLuint msg_control,
534			 GLuint msg_type,
535			 GLuint msg_length,
536			 bool header_present,
537			 GLuint last_render_target,
538			 GLuint response_length,
539			 GLuint end_of_thread,
540			 GLuint send_commit_msg)
541{
542   struct brw_context *brw = p->brw;
543   struct intel_context *intel = &brw->intel;
544   unsigned sfid;
545
546   if (intel->gen >= 7) {
547      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
548      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
549	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
550      else
551	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
552   } else if (intel->gen == 6) {
553      /* Use the render cache for all write messages. */
554      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
555   } else {
556      sfid = BRW_SFID_DATAPORT_WRITE;
557   }
558
559   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
560			      header_present, end_of_thread);
561
562   if (intel->gen >= 7) {
563      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
564      insn->bits3.gen7_dp.msg_control = msg_control;
565      insn->bits3.gen7_dp.last_render_target = last_render_target;
566      insn->bits3.gen7_dp.msg_type = msg_type;
567   } else if (intel->gen == 6) {
568      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
569      insn->bits3.gen6_dp.msg_control = msg_control;
570      insn->bits3.gen6_dp.last_render_target = last_render_target;
571      insn->bits3.gen6_dp.msg_type = msg_type;
572      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
573   } else if (intel->gen == 5) {
574      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
575      insn->bits3.dp_write_gen5.msg_control = msg_control;
576      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
577      insn->bits3.dp_write_gen5.msg_type = msg_type;
578      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
579   } else {
580      insn->bits3.dp_write.binding_table_index = binding_table_index;
581      insn->bits3.dp_write.msg_control = msg_control;
582      insn->bits3.dp_write.last_render_target = last_render_target;
583      insn->bits3.dp_write.msg_type = msg_type;
584      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
585   }
586}
587
588void
589brw_set_dp_read_message(struct brw_compile *p,
590			struct brw_instruction *insn,
591			GLuint binding_table_index,
592			GLuint msg_control,
593			GLuint msg_type,
594			GLuint target_cache,
595			GLuint msg_length,
596			GLuint response_length)
597{
598   struct brw_context *brw = p->brw;
599   struct intel_context *intel = &brw->intel;
600   unsigned sfid;
601
602   if (intel->gen >= 7) {
603      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
604   } else if (intel->gen == 6) {
605      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
606	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
607      else
608	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
609   } else {
610      sfid = BRW_SFID_DATAPORT_READ;
611   }
612
613   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
614			      true, false);
615
616   if (intel->gen >= 7) {
617      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
618      insn->bits3.gen7_dp.msg_control = msg_control;
619      insn->bits3.gen7_dp.last_render_target = 0;
620      insn->bits3.gen7_dp.msg_type = msg_type;
621   } else if (intel->gen == 6) {
622      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
623      insn->bits3.gen6_dp.msg_control = msg_control;
624      insn->bits3.gen6_dp.last_render_target = 0;
625      insn->bits3.gen6_dp.msg_type = msg_type;
626      insn->bits3.gen6_dp.send_commit_msg = 0;
627   } else if (intel->gen == 5) {
628      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
629      insn->bits3.dp_read_gen5.msg_control = msg_control;
630      insn->bits3.dp_read_gen5.msg_type = msg_type;
631      insn->bits3.dp_read_gen5.target_cache = target_cache;
632   } else if (intel->is_g4x) {
633      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
634      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
635      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
636      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
637   } else {
638      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
639      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
640      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
641      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
642   }
643}
644
645static void brw_set_sampler_message(struct brw_compile *p,
646                                    struct brw_instruction *insn,
647                                    GLuint binding_table_index,
648                                    GLuint sampler,
649                                    GLuint msg_type,
650                                    GLuint response_length,
651                                    GLuint msg_length,
652                                    GLuint header_present,
653                                    GLuint simd_mode)
654{
655   struct brw_context *brw = p->brw;
656   struct intel_context *intel = &brw->intel;
657
658   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
659			      response_length, header_present, false);
660
661   if (intel->gen >= 7) {
662      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
663      insn->bits3.sampler_gen7.sampler = sampler;
664      insn->bits3.sampler_gen7.msg_type = msg_type;
665      insn->bits3.sampler_gen7.simd_mode = simd_mode;
666   } else if (intel->gen >= 5) {
667      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
668      insn->bits3.sampler_gen5.sampler = sampler;
669      insn->bits3.sampler_gen5.msg_type = msg_type;
670      insn->bits3.sampler_gen5.simd_mode = simd_mode;
671   } else if (intel->is_g4x) {
672      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
673      insn->bits3.sampler_g4x.sampler = sampler;
674      insn->bits3.sampler_g4x.msg_type = msg_type;
675   } else {
676      insn->bits3.sampler.binding_table_index = binding_table_index;
677      insn->bits3.sampler.sampler = sampler;
678      insn->bits3.sampler.msg_type = msg_type;
679      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
680   }
681}
682
683
684#define next_insn brw_next_insn
685struct brw_instruction *
686brw_next_insn(struct brw_compile *p, GLuint opcode)
687{
688   struct brw_instruction *insn;
689
690   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
691
692   insn = &p->store[p->nr_insn++];
693   memcpy(insn, p->current, sizeof(*insn));
694
695   /* Reset this one-shot flag:
696    */
697
698   if (p->current->header.destreg__conditionalmod) {
699      p->current->header.destreg__conditionalmod = 0;
700      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
701   }
702
703   insn->header.opcode = opcode;
704   return insn;
705}
706
707static struct brw_instruction *brw_alu1( struct brw_compile *p,
708					 GLuint opcode,
709					 struct brw_reg dest,
710					 struct brw_reg src )
711{
712   struct brw_instruction *insn = next_insn(p, opcode);
713   brw_set_dest(p, insn, dest);
714   brw_set_src0(p, insn, src);
715   return insn;
716}
717
718static struct brw_instruction *brw_alu2(struct brw_compile *p,
719					GLuint opcode,
720					struct brw_reg dest,
721					struct brw_reg src0,
722					struct brw_reg src1 )
723{
724   struct brw_instruction *insn = next_insn(p, opcode);
725   brw_set_dest(p, insn, dest);
726   brw_set_src0(p, insn, src0);
727   brw_set_src1(p, insn, src1);
728   return insn;
729}
730
731
732/***********************************************************************
733 * Convenience routines.
734 */
735#define ALU1(OP)					\
736struct brw_instruction *brw_##OP(struct brw_compile *p,	\
737	      struct brw_reg dest,			\
738	      struct brw_reg src0)   			\
739{							\
740   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
741}
742
743#define ALU2(OP)					\
744struct brw_instruction *brw_##OP(struct brw_compile *p,	\
745	      struct brw_reg dest,			\
746	      struct brw_reg src0,			\
747	      struct brw_reg src1)   			\
748{							\
749   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
750}
751
752/* Rounding operations (other than RNDD) require two instructions - the first
753 * stores a rounded value (possibly the wrong way) in the dest register, but
754 * also sets a per-channel "increment bit" in the flag register.  A predicated
755 * add of 1.0 fixes dest to contain the desired result.
756 *
757 * Sandybridge and later appear to round correctly without an ADD.
758 */
759#define ROUND(OP)							      \
760void brw_##OP(struct brw_compile *p,					      \
761	      struct brw_reg dest,					      \
762	      struct brw_reg src)					      \
763{									      \
764   struct brw_instruction *rnd, *add;					      \
765   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
766   brw_set_dest(p, rnd, dest);						      \
767   brw_set_src0(p, rnd, src);						      \
768									      \
769   if (p->brw->intel.gen < 6) {						      \
770      /* turn on round-increments */					      \
771      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
772      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
773      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
774   }									      \
775}
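
/* Pseudo-assembly illustration of the pre-gen6 expansion described above
 * (operand names are placeholders):
 *
 *    rndz.r  dst, src            (round, set per-channel increment flag)
 *    (+f0) add  dst, dst, 1.0F   (predicated fixup)
 *
 * On gen6 and later only the first instruction is emitted.
 */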
776
777
778ALU1(MOV)
779ALU2(SEL)
780ALU1(NOT)
781ALU2(AND)
782ALU2(OR)
783ALU2(XOR)
784ALU2(SHR)
785ALU2(SHL)
786ALU2(RSR)
787ALU2(RSL)
788ALU2(ASR)
789ALU1(FRC)
790ALU1(RNDD)
791ALU2(MAC)
792ALU2(MACH)
793ALU1(LZD)
794ALU2(DP4)
795ALU2(DPH)
796ALU2(DP3)
797ALU2(DP2)
798ALU2(LINE)
799ALU2(PLN)
800
801
802ROUND(RNDZ)
803ROUND(RNDE)
804
805
806struct brw_instruction *brw_ADD(struct brw_compile *p,
807				struct brw_reg dest,
808				struct brw_reg src0,
809				struct brw_reg src1)
810{
811   /* 6.2.2: add */
812   if (src0.type == BRW_REGISTER_TYPE_F ||
813       (src0.file == BRW_IMMEDIATE_VALUE &&
814	src0.type == BRW_REGISTER_TYPE_VF)) {
815      assert(src1.type != BRW_REGISTER_TYPE_UD);
816      assert(src1.type != BRW_REGISTER_TYPE_D);
817   }
818
819   if (src1.type == BRW_REGISTER_TYPE_F ||
820       (src1.file == BRW_IMMEDIATE_VALUE &&
821	src1.type == BRW_REGISTER_TYPE_VF)) {
822      assert(src0.type != BRW_REGISTER_TYPE_UD);
823      assert(src0.type != BRW_REGISTER_TYPE_D);
824   }
825
826   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
827}
828
829struct brw_instruction *brw_MUL(struct brw_compile *p,
830				struct brw_reg dest,
831				struct brw_reg src0,
832				struct brw_reg src1)
833{
834   /* 6.32.38: mul */
835   if (src0.type == BRW_REGISTER_TYPE_D ||
836       src0.type == BRW_REGISTER_TYPE_UD ||
837       src1.type == BRW_REGISTER_TYPE_D ||
838       src1.type == BRW_REGISTER_TYPE_UD) {
839      assert(dest.type != BRW_REGISTER_TYPE_F);
840   }
841
842   if (src0.type == BRW_REGISTER_TYPE_F ||
843       (src0.file == BRW_IMMEDIATE_VALUE &&
844	src0.type == BRW_REGISTER_TYPE_VF)) {
845      assert(src1.type != BRW_REGISTER_TYPE_UD);
846      assert(src1.type != BRW_REGISTER_TYPE_D);
847   }
848
849   if (src1.type == BRW_REGISTER_TYPE_F ||
850       (src1.file == BRW_IMMEDIATE_VALUE &&
851	src1.type == BRW_REGISTER_TYPE_VF)) {
852      assert(src0.type != BRW_REGISTER_TYPE_UD);
853      assert(src0.type != BRW_REGISTER_TYPE_D);
854   }
855
856   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
857	  src0.nr != BRW_ARF_ACCUMULATOR);
858   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
859	  src1.nr != BRW_ARF_ACCUMULATOR);
860
861   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
862}
863
864
865void brw_NOP(struct brw_compile *p)
866{
867   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
868   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
869   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
870   brw_set_src1(p, insn, brw_imm_ud(0x0));
871}
872
873
874
875
876
877/***********************************************************************
878 * Comparisons, if/else/endif
879 */
880
881struct brw_instruction *brw_JMPI(struct brw_compile *p,
882                                 struct brw_reg dest,
883                                 struct brw_reg src0,
884                                 struct brw_reg src1)
885{
886   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
887
888   insn->header.execution_size = 1;
889   insn->header.compression_control = BRW_COMPRESSION_NONE;
890   insn->header.mask_control = BRW_MASK_DISABLE;
891
892   p->current->header.predicate_control = BRW_PREDICATE_NONE;
893
894   return insn;
895}
896
897static void
898push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
899{
900   p->if_stack[p->if_stack_depth] = inst;
901
902   p->if_stack_depth++;
903   if (p->if_stack_array_size <= p->if_stack_depth) {
904      p->if_stack_array_size *= 2;
905      p->if_stack = reralloc(p->mem_ctx, p->if_stack, struct brw_instruction *,
906			     p->if_stack_array_size);
907   }
908}
909
910/* EU takes the value from the flag register and pushes it onto some
911 * sort of a stack (presumably merging with any flag value already on
912 * the stack).  Within an if block, the flags at the top of the stack
913 * control execution on each channel of the unit, e.g. on each of the
914 * 16 pixel values in our wm programs.
915 *
916 * When the matching 'else' instruction is reached (presumably by
917 * countdown of the instruction count patched in by our ELSE/ENDIF
918 * functions), the relevant flags are inverted.
919 *
920 * When the matching 'endif' instruction is reached, the flags are
921 * popped off.  If the stack is now empty, normal execution resumes.
922 */
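/* Emission sketch for the helpers below (illustrative; "x" and the block
 * bodies are placeholders, and gen6 typically uses gen6_IF() with an
 * embedded compare instead):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, brw_imm_f(0.0f));
 *    brw_IF(p, BRW_EXECUTE_8);
 *       ... then-block instructions ...
 *    brw_ELSE(p);
 *       ... else-block instructions ...
 *    brw_ENDIF(p);
 */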
923struct brw_instruction *
924brw_IF(struct brw_compile *p, GLuint execute_size)
925{
926   struct intel_context *intel = &p->brw->intel;
927   struct brw_instruction *insn;
928
929   insn = next_insn(p, BRW_OPCODE_IF);
930
931   /* Override the defaults for this instruction:
932    */
933   if (intel->gen < 6) {
934      brw_set_dest(p, insn, brw_ip_reg());
935      brw_set_src0(p, insn, brw_ip_reg());
936      brw_set_src1(p, insn, brw_imm_d(0x0));
937   } else if (intel->gen == 6) {
938      brw_set_dest(p, insn, brw_imm_w(0));
939      insn->bits1.branch_gen6.jump_count = 0;
940      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
941      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
942   } else {
943      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
944      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
945      brw_set_src1(p, insn, brw_imm_ud(0));
946      insn->bits3.break_cont.jip = 0;
947      insn->bits3.break_cont.uip = 0;
948   }
949
950   insn->header.execution_size = execute_size;
951   insn->header.compression_control = BRW_COMPRESSION_NONE;
952   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
953   insn->header.mask_control = BRW_MASK_ENABLE;
954   if (!p->single_program_flow)
955      insn->header.thread_control = BRW_THREAD_SWITCH;
956
957   p->current->header.predicate_control = BRW_PREDICATE_NONE;
958
959   push_if_stack(p, insn);
960   return insn;
961}
962
963/* This function is only used for gen6-style IF instructions with an
964 * embedded comparison (conditional modifier).  It is not used on gen7.
965 */
966struct brw_instruction *
967gen6_IF(struct brw_compile *p, uint32_t conditional,
968	struct brw_reg src0, struct brw_reg src1)
969{
970   struct brw_instruction *insn;
971
972   insn = next_insn(p, BRW_OPCODE_IF);
973
974   brw_set_dest(p, insn, brw_imm_w(0));
975   if (p->compressed) {
976      insn->header.execution_size = BRW_EXECUTE_16;
977   } else {
978      insn->header.execution_size = BRW_EXECUTE_8;
979   }
980   insn->bits1.branch_gen6.jump_count = 0;
981   brw_set_src0(p, insn, src0);
982   brw_set_src1(p, insn, src1);
983
984   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
985   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
986   insn->header.destreg__conditionalmod = conditional;
987
988   if (!p->single_program_flow)
989      insn->header.thread_control = BRW_THREAD_SWITCH;
990
991   push_if_stack(p, insn);
992   return insn;
993}
994
995/**
996 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
997 */
998static void
999convert_IF_ELSE_to_ADD(struct brw_compile *p,
1000		       struct brw_instruction *if_inst,
1001		       struct brw_instruction *else_inst)
1002{
1003   /* The next instruction (where the ENDIF would be, if it existed) */
1004   struct brw_instruction *next_inst = &p->store[p->nr_insn];
1005
1006   assert(p->single_program_flow);
1007   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1008   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1009   assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1010
1011   /* Convert IF to an ADD instruction that moves the instruction pointer
1012    * to the first instruction of the ELSE block.  If there is no ELSE
1013    * block, point to where ENDIF would be.  Reverse the predicate.
1014    *
1015    * There's no need to execute an ENDIF since we don't need to do any
1016    * stack operations, and if we're currently executing, we just want to
1017    * continue normally.
1018    */
1019   if_inst->header.opcode = BRW_OPCODE_ADD;
1020   if_inst->header.predicate_inverse = 1;
1021
1022   if (else_inst != NULL) {
1023      /* Convert ELSE to an ADD instruction that points where the ENDIF
1024       * would be.
1025       */
1026      else_inst->header.opcode = BRW_OPCODE_ADD;
1027
1028      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1029      else_inst->bits3.ud = (next_inst - else_inst) * 16;
1030   } else {
1031      if_inst->bits3.ud = (next_inst - if_inst) * 16;
1032   }
1033}
1034
1035/**
1036 * Patch IF and ELSE instructions with appropriate jump targets.
1037 */
1038static void
1039patch_IF_ELSE(struct brw_compile *p,
1040	      struct brw_instruction *if_inst,
1041	      struct brw_instruction *else_inst,
1042	      struct brw_instruction *endif_inst)
1043{
1044   struct intel_context *intel = &p->brw->intel;
1045
1046   assert(!p->single_program_flow);
1047   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1048   assert(endif_inst != NULL);
1049   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1050
1051   unsigned br = 1;
1052   /* The jump count is measured in 64-bit chunks, so one 128-bit
1053    * instruction requires 2 chunks.
1054    */
1055   if (intel->gen >= 5)
1056      br = 2;
1057
1058   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1059   endif_inst->header.execution_size = if_inst->header.execution_size;
1060
1061   if (else_inst == NULL) {
1062      /* Patch IF -> ENDIF */
1063      if (intel->gen < 6) {
1064	 /* Turn it into an IFF, which means no mask stack operations for
1065	  * all-false and jumping past the ENDIF.
1066	  */
1067	 if_inst->header.opcode = BRW_OPCODE_IFF;
1068	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1069	 if_inst->bits3.if_else.pop_count = 0;
1070	 if_inst->bits3.if_else.pad0 = 0;
1071      } else if (intel->gen == 6) {
1072	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1073	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1074      } else {
1075	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1076	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1077      }
1078   } else {
1079      else_inst->header.execution_size = if_inst->header.execution_size;
1080
1081      /* Patch IF -> ELSE */
1082      if (intel->gen < 6) {
1083	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1084	 if_inst->bits3.if_else.pop_count = 0;
1085	 if_inst->bits3.if_else.pad0 = 0;
1086      } else if (intel->gen == 6) {
1087	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1088      }
1089
1090      /* Patch ELSE -> ENDIF */
1091      if (intel->gen < 6) {
1092	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1093	  * matching ENDIF.
1094	  */
1095	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1096	 else_inst->bits3.if_else.pop_count = 1;
1097	 else_inst->bits3.if_else.pad0 = 0;
1098      } else if (intel->gen == 6) {
1099	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1100	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1101      } else {
1102	 /* The IF instruction's JIP should point just past the ELSE */
1103	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1104	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1105	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1106	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1107      }
1108   }
1109}
1110
1111void
1112brw_ELSE(struct brw_compile *p)
1113{
1114   struct intel_context *intel = &p->brw->intel;
1115   struct brw_instruction *insn;
1116
1117   insn = next_insn(p, BRW_OPCODE_ELSE);
1118
1119   if (intel->gen < 6) {
1120      brw_set_dest(p, insn, brw_ip_reg());
1121      brw_set_src0(p, insn, brw_ip_reg());
1122      brw_set_src1(p, insn, brw_imm_d(0x0));
1123   } else if (intel->gen == 6) {
1124      brw_set_dest(p, insn, brw_imm_w(0));
1125      insn->bits1.branch_gen6.jump_count = 0;
1126      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1127      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1128   } else {
1129      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1130      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1131      brw_set_src1(p, insn, brw_imm_ud(0));
1132      insn->bits3.break_cont.jip = 0;
1133      insn->bits3.break_cont.uip = 0;
1134   }
1135
1136   insn->header.compression_control = BRW_COMPRESSION_NONE;
1137   insn->header.mask_control = BRW_MASK_ENABLE;
1138   if (!p->single_program_flow)
1139      insn->header.thread_control = BRW_THREAD_SWITCH;
1140
1141   push_if_stack(p, insn);
1142}
1143
1144void
1145brw_ENDIF(struct brw_compile *p)
1146{
1147   struct intel_context *intel = &p->brw->intel;
1148   struct brw_instruction *insn;
1149   struct brw_instruction *else_inst = NULL;
1150   struct brw_instruction *if_inst = NULL;
1151
1152   /* Pop the IF and (optional) ELSE instructions from the stack */
1153   p->if_stack_depth--;
1154   if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
1155      else_inst = p->if_stack[p->if_stack_depth];
1156      p->if_stack_depth--;
1157   }
1158   if_inst = p->if_stack[p->if_stack_depth];
1159
1160   if (p->single_program_flow) {
1161      /* ENDIF is useless; don't bother emitting it. */
1162      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1163      return;
1164   }
1165
1166   insn = next_insn(p, BRW_OPCODE_ENDIF);
1167
1168   if (intel->gen < 6) {
1169      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1170      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1171      brw_set_src1(p, insn, brw_imm_d(0x0));
1172   } else if (intel->gen == 6) {
1173      brw_set_dest(p, insn, brw_imm_w(0));
1174      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1175      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1176   } else {
1177      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1178      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1179      brw_set_src1(p, insn, brw_imm_ud(0));
1180   }
1181
1182   insn->header.compression_control = BRW_COMPRESSION_NONE;
1183   insn->header.mask_control = BRW_MASK_ENABLE;
1184   insn->header.thread_control = BRW_THREAD_SWITCH;
1185
1186   /* Also pop item off the stack in the endif instruction: */
1187   if (intel->gen < 6) {
1188      insn->bits3.if_else.jump_count = 0;
1189      insn->bits3.if_else.pop_count = 1;
1190      insn->bits3.if_else.pad0 = 0;
1191   } else if (intel->gen == 6) {
1192      insn->bits1.branch_gen6.jump_count = 2;
1193   } else {
1194      insn->bits3.break_cont.jip = 2;
1195   }
1196   patch_IF_ELSE(p, if_inst, else_inst, insn);
1197}
1198
1199struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1200{
1201   struct intel_context *intel = &p->brw->intel;
1202   struct brw_instruction *insn;
1203
1204   insn = next_insn(p, BRW_OPCODE_BREAK);
1205   if (intel->gen >= 6) {
1206      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1207      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1208      brw_set_src1(p, insn, brw_imm_d(0x0));
1209   } else {
1210      brw_set_dest(p, insn, brw_ip_reg());
1211      brw_set_src0(p, insn, brw_ip_reg());
1212      brw_set_src1(p, insn, brw_imm_d(0x0));
1213      insn->bits3.if_else.pad0 = 0;
1214      insn->bits3.if_else.pop_count = pop_count;
1215   }
1216   insn->header.compression_control = BRW_COMPRESSION_NONE;
1217   insn->header.execution_size = BRW_EXECUTE_8;
1218
1219   return insn;
1220}
1221
1222struct brw_instruction *gen6_CONT(struct brw_compile *p,
1223				  struct brw_instruction *do_insn)
1224{
1225   struct brw_instruction *insn;
1226
1227   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1230   brw_set_dest(p, insn, brw_ip_reg());
1231   brw_set_src0(p, insn, brw_ip_reg());
1232   brw_set_src1(p, insn, brw_imm_d(0x0));
1233
1234   insn->header.compression_control = BRW_COMPRESSION_NONE;
1235   insn->header.execution_size = BRW_EXECUTE_8;
1236   return insn;
1237}
1238
1239struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1240{
1241   struct brw_instruction *insn;
1242   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1243   brw_set_dest(p, insn, brw_ip_reg());
1244   brw_set_src0(p, insn, brw_ip_reg());
1245   brw_set_src1(p, insn, brw_imm_d(0x0));
1246   insn->header.compression_control = BRW_COMPRESSION_NONE;
1247   insn->header.execution_size = BRW_EXECUTE_8;
1248   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1249   insn->bits3.if_else.pad0 = 0;
1250   insn->bits3.if_else.pop_count = pop_count;
1251   return insn;
1252}
1253
1254/* DO/WHILE loop:
1255 *
1256 * The DO/WHILE is just an unterminated loop -- break or continue are
1257 * used for control within the loop.  We have a few ways they can be
1258 * done.
1259 *
1260 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1261 * jip and no DO instruction.
1262 *
1263 * For non-uniform control flow pre-gen6, there's a DO instruction to
1264 * push the mask, and a WHILE to jump back, and BREAK to get out and
1265 * pop the mask.
1266 *
1267 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1268 * just points back to the first instruction of the loop.
1269 */
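/* Emission sketch (illustrative; the loop body and break condition are
 * placeholders):
 *
 *    struct brw_instruction *do_insn = brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body, usually containing a predicated brw_BREAK(p, pop_count) ...
 *    brw_WHILE(p, do_insn);
 *
 * Pre-gen6 the BREAK's pop_count unwinds any IF nesting inside the loop.
 */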
1270struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1271{
1272   struct intel_context *intel = &p->brw->intel;
1273
1274   if (intel->gen >= 6 || p->single_program_flow) {
1275      return &p->store[p->nr_insn];
1276   } else {
1277      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1278
1279      /* Override the defaults for this instruction:
1280       */
1281      brw_set_dest(p, insn, brw_null_reg());
1282      brw_set_src0(p, insn, brw_null_reg());
1283      brw_set_src1(p, insn, brw_null_reg());
1284
1285      insn->header.compression_control = BRW_COMPRESSION_NONE;
1286      insn->header.execution_size = execute_size;
1287      insn->header.predicate_control = BRW_PREDICATE_NONE;
1288      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1289      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1290
1291      return insn;
1292   }
1293}
1294
1295
1296
1297struct brw_instruction *brw_WHILE(struct brw_compile *p,
1298                                  struct brw_instruction *do_insn)
1299{
1300   struct intel_context *intel = &p->brw->intel;
1301   struct brw_instruction *insn;
1302   GLuint br = 1;
1303
1304   if (intel->gen >= 5)
1305      br = 2;
1306
1307   if (intel->gen >= 7) {
1308      insn = next_insn(p, BRW_OPCODE_WHILE);
1309
1310      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1311      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1312      brw_set_src1(p, insn, brw_imm_ud(0));
1313      insn->bits3.break_cont.jip = br * (do_insn - insn);
1314
1315      insn->header.execution_size = BRW_EXECUTE_8;
1316   } else if (intel->gen == 6) {
1317      insn = next_insn(p, BRW_OPCODE_WHILE);
1318
1319      brw_set_dest(p, insn, brw_imm_w(0));
1320      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1321      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1322      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1323
1324      insn->header.execution_size = BRW_EXECUTE_8;
1325   } else {
1326      if (p->single_program_flow) {
1327	 insn = next_insn(p, BRW_OPCODE_ADD);
1328
1329	 brw_set_dest(p, insn, brw_ip_reg());
1330	 brw_set_src0(p, insn, brw_ip_reg());
1331	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1332	 insn->header.execution_size = BRW_EXECUTE_1;
1333      } else {
1334	 insn = next_insn(p, BRW_OPCODE_WHILE);
1335
1336	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1337
1338	 brw_set_dest(p, insn, brw_ip_reg());
1339	 brw_set_src0(p, insn, brw_ip_reg());
1340	 brw_set_src1(p, insn, brw_imm_d(0));
1341
1342	 insn->header.execution_size = do_insn->header.execution_size;
1343	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1344	 insn->bits3.if_else.pop_count = 0;
1345	 insn->bits3.if_else.pad0 = 0;
1346      }
1347   }
1348   insn->header.compression_control = BRW_COMPRESSION_NONE;
1349   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1350
1351   return insn;
1352}
1353
1354
1355/* FORWARD JUMPS:
1356 */
1357void brw_land_fwd_jump(struct brw_compile *p,
1358		       struct brw_instruction *jmp_insn)
1359{
1360   struct intel_context *intel = &p->brw->intel;
1361   struct brw_instruction *landing = &p->store[p->nr_insn];
1362   GLuint jmpi = 1;
1363
1364   if (intel->gen >= 5)
1365      jmpi = 2;
1366
1367   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1368   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1369
1370   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1371}
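
/* Usage sketch (illustrative): emit the JMPI with a zero offset, emit the
 * instructions that may be skipped, then patch the jump once the landing
 * point is known:
 *
 *    struct brw_instruction *jmp =
 *       brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *       ... conditionally skipped instructions ...
 *    brw_land_fwd_jump(p, jmp);
 */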
1372
1373
1374
1375/* To integrate with the above, it makes sense that the comparison
1376 * instruction should populate the flag register.  It might be simpler
1377 * just to use the flag reg for most WM tasks?
1378 */
1379void brw_CMP(struct brw_compile *p,
1380	     struct brw_reg dest,
1381	     GLuint conditional,
1382	     struct brw_reg src0,
1383	     struct brw_reg src1)
1384{
1385   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1386
1387   insn->header.destreg__conditionalmod = conditional;
1388   brw_set_dest(p, insn, dest);
1389   brw_set_src0(p, insn, src0);
1390   brw_set_src1(p, insn, src1);
1391
1392/*    guess_execution_size(insn, src0); */
1393
1394
1395   /* Make it so that future instructions will use the computed flag
1396    * value until brw_set_predicate_control_flag_value() is called
1397    * again.
1398    */
1399   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1400       dest.nr == 0) {
1401      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1402      p->flag_value = 0xff;
1403   }
1404}
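
/* Illustrative follow-up (a sketch; "dst", "a" and "b" are placeholders):
 * comparing into the null register leaves the flag driving predication, so a
 * SEL emitted immediately afterwards picks between two sources per channel:
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, a, b);
 *    brw_SEL(p, dst, a, b);
 */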
1405
1406/* Issue a 'wait' instruction on notification register n1; the host can
1407   program MMIO to wake the thread back up. */
1408void brw_WAIT (struct brw_compile *p)
1409{
1410   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1411   struct brw_reg src = brw_notification_1_reg();
1412
1413   brw_set_dest(p, insn, src);
1414   brw_set_src0(p, insn, src);
1415   brw_set_src1(p, insn, brw_null_reg());
1416   insn->header.execution_size = 0; /* must be BRW_EXECUTE_1 */
1417   insn->header.predicate_control = 0;
1418   insn->header.compression_control = 0;
1419}
1420
1421
1422/***********************************************************************
1423 * Helpers for the various SEND message types:
1424 */
1425
1426/** Extended math function, float[8].
1427 */
1428void brw_math( struct brw_compile *p,
1429	       struct brw_reg dest,
1430	       GLuint function,
1431	       GLuint saturate,
1432	       GLuint msg_reg_nr,
1433	       struct brw_reg src,
1434	       GLuint data_type,
1435	       GLuint precision )
1436{
1437   struct intel_context *intel = &p->brw->intel;
1438
1439   if (intel->gen >= 6) {
1440      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1441
1442      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1443      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1444
1445      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1446      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1447
1448      /* Source modifiers are ignored for extended math instructions. */
1449      assert(!src.negate);
1450      assert(!src.abs);
1451
1452      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1453	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1454	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1455	 assert(src.type != BRW_REGISTER_TYPE_F);
1456      } else {
1457	 assert(src.type == BRW_REGISTER_TYPE_F);
1458      }
1459
1460      /* Math is the same ISA format as other opcodes, except that CondModifier
1461       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1462       */
1463      insn->header.destreg__conditionalmod = function;
1464      insn->header.saturate = saturate;
1465
1466      brw_set_dest(p, insn, dest);
1467      brw_set_src0(p, insn, src);
1468      brw_set_src1(p, insn, brw_null_reg());
1469   } else {
1470      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1471
1472      /* Example code doesn't set predicate_control for send
1473       * instructions.
1474       */
1475      insn->header.predicate_control = 0;
1476      insn->header.destreg__conditionalmod = msg_reg_nr;
1477
1478      brw_set_dest(p, insn, dest);
1479      brw_set_src0(p, insn, src);
1480      brw_set_math_message(p,
1481			   insn,
1482			   function,
1483			   src.type == BRW_REGISTER_TYPE_D,
1484			   precision,
1485			   saturate,
1486			   data_type);
1487   }
1488}
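
/* Usage sketch for the gen6+ path (illustrative; "dst" and "src" are
 * placeholders, and the message register / data type arguments are unused on
 * gen6+):
 *
 *    brw_math(p, dst, BRW_MATH_FUNCTION_EXP, BRW_MATH_SATURATE_NONE, 0,
 *             src, BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
 */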
1489
1490/** Extended math function, float[8].
1491 */
1492void brw_math2(struct brw_compile *p,
1493	       struct brw_reg dest,
1494	       GLuint function,
1495	       struct brw_reg src0,
1496	       struct brw_reg src1)
1497{
1498   struct intel_context *intel = &p->brw->intel;
1499   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1500
1501   assert(intel->gen >= 6);
1502   (void) intel;
1503
1504
1505   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1506   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1507   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1508
1509   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1510   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1511   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1512
1513   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1514       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1515       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1516      assert(src0.type != BRW_REGISTER_TYPE_F);
1517      assert(src1.type != BRW_REGISTER_TYPE_F);
1518   } else {
1519      assert(src0.type == BRW_REGISTER_TYPE_F);
1520      assert(src1.type == BRW_REGISTER_TYPE_F);
1521   }
1522
1523   /* Source modifiers are ignored for extended math instructions. */
1524   assert(!src0.negate);
1525   assert(!src0.abs);
1526   assert(!src1.negate);
1527   assert(!src1.abs);
1528
1529   /* Math is the same ISA format as other opcodes, except that CondModifier
1530    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1531    */
1532   insn->header.destreg__conditionalmod = function;
1533
1534   brw_set_dest(p, insn, dest);
1535   brw_set_src0(p, insn, src0);
1536   brw_set_src1(p, insn, src1);
1537}
1538
1539/**
1540 * Extended math function, float[16].
1541 * Use 2 send instructions.
1542 */
1543void brw_math_16( struct brw_compile *p,
1544		  struct brw_reg dest,
1545		  GLuint function,
1546		  GLuint saturate,
1547		  GLuint msg_reg_nr,
1548		  struct brw_reg src,
1549		  GLuint precision )
1550{
1551   struct intel_context *intel = &p->brw->intel;
1552   struct brw_instruction *insn;
1553
1554   if (intel->gen >= 6) {
1555      insn = next_insn(p, BRW_OPCODE_MATH);
1556
1557      /* Math is the same ISA format as other opcodes, except that CondModifier
1558       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1559       */
1560      insn->header.destreg__conditionalmod = function;
1561      insn->header.saturate = saturate;
1562
1563      /* Source modifiers are ignored for extended math instructions. */
1564      assert(!src.negate);
1565      assert(!src.abs);
1566
1567      brw_set_dest(p, insn, dest);
1568      brw_set_src0(p, insn, src);
1569      brw_set_src1(p, insn, brw_null_reg());
1570      return;
1571   }
1572
1573   /* First instruction:
1574    */
1575   brw_push_insn_state(p);
1576   brw_set_predicate_control_flag_value(p, 0xff);
1577   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1578
1579   insn = next_insn(p, BRW_OPCODE_SEND);
1580   insn->header.destreg__conditionalmod = msg_reg_nr;
1581
1582   brw_set_dest(p, insn, dest);
1583   brw_set_src0(p, insn, src);
1584   brw_set_math_message(p,
1585			insn,
1586			function,
1587			BRW_MATH_INTEGER_UNSIGNED,
1588			precision,
1589			saturate,
1590			BRW_MATH_DATA_VECTOR);
1591
1592   /* Second instruction:
1593    */
1594   insn = next_insn(p, BRW_OPCODE_SEND);
1595   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1596   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1597
1598   brw_set_dest(p, insn, offset(dest,1));
1599   brw_set_src0(p, insn, src);
1600   brw_set_math_message(p,
1601			insn,
1602			function,
1603			BRW_MATH_INTEGER_UNSIGNED,
1604			precision,
1605			saturate,
1606			BRW_MATH_DATA_VECTOR);
1607
1608   brw_pop_insn_state(p);
1609}
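
/* Usage sketch (illustrative only): a SIMD16 RSQ.  On gen6+ this becomes a
 * single MATH instruction and msg_reg_nr is unused; before gen6 it becomes
 * the two SENDs above, consuming message registers msg_reg_nr and
 * msg_reg_nr+1.  Register and message-register numbers are placeholders.
 *
 *    brw_math_16(p, brw_vec8_grf(8, 0), BRW_MATH_FUNCTION_RSQ,
 *                BRW_MATH_SATURATE_NONE, 2, brw_vec8_grf(4, 0),
 *                BRW_MATH_PRECISION_FULL);
 */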
1610
1611
1612/**
1613 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1614 * using a constant offset per channel.
1615 *
1616 * The offset must be aligned to oword size (16 bytes).  Used for
1617 * register spilling.
1618 */
1619void brw_oword_block_write_scratch(struct brw_compile *p,
1620				   struct brw_reg mrf,
1621				   int num_regs,
1622				   GLuint offset)
1623{
1624   struct intel_context *intel = &p->brw->intel;
1625   uint32_t msg_control, msg_type;
1626   int mlen;
1627
1628   if (intel->gen >= 6)
1629      offset /= 16;
1630
1631   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1632
1633   if (num_regs == 1) {
1634      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1635      mlen = 2;
1636   } else {
1637      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1638      mlen = 3;
1639   }
1640
1641   /* Set up the message header.  This is g0, with g0.2 filled with
1642    * the offset.  We don't want to leave our offset around in g0 or
1643    * it'll screw up texture samples, so set it up inside the message
1644    * reg.
1645    */
1646   {
1647      brw_push_insn_state(p);
1648      brw_set_mask_control(p, BRW_MASK_DISABLE);
1649      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1650
1651      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1652
1653      /* set message header global offset field (reg 0, element 2) */
1654      brw_MOV(p,
1655	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1656				  mrf.nr,
1657				  2), BRW_REGISTER_TYPE_UD),
1658	      brw_imm_ud(offset));
1659
1660      brw_pop_insn_state(p);
1661   }
1662
1663   {
1664      struct brw_reg dest;
1665      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1666      int send_commit_msg;
1667      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1668					 BRW_REGISTER_TYPE_UW);
1669
1670      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1671	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1672	 src_header = vec16(src_header);
1673      }
1674      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1675      insn->header.destreg__conditionalmod = mrf.nr;
1676
1677      /* Until gen6, writes followed by reads from the same location
1678       * are not guaranteed to be ordered unless write_commit is set.
1679       * If set, then a no-op write is issued to the destination
1680       * register to set a dependency, and a read from the destination
1681       * can be used to ensure the ordering.
1682       *
1683       * For gen6, only writes between different threads need ordering
1684       * protection.  Our use of DP writes is all about register
1685       * spilling within a thread.
1686       */
1687      if (intel->gen >= 6) {
1688	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1689	 send_commit_msg = 0;
1690      } else {
1691	 dest = src_header;
1692	 send_commit_msg = 1;
1693      }
1694
1695      brw_set_dest(p, insn, dest);
1696      if (intel->gen >= 6) {
1697	 brw_set_src0(p, insn, mrf);
1698      } else {
1699	 brw_set_src0(p, insn, brw_null_reg());
1700      }
1701
1702      if (intel->gen >= 6)
1703	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1704      else
1705	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1706
1707      brw_set_dp_write_message(p,
1708			       insn,
1709			       255, /* binding table index (255=stateless) */
1710			       msg_control,
1711			       msg_type,
1712			       mlen,
1713			       true, /* header_present */
1714			       0, /* not a render target */
1715			       send_commit_msg, /* response_length */
1716			       0, /* eot */
1717			       send_commit_msg);
1718   }
1719}
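
/* Usage sketch (illustrative only): spill one GRF.  With num_regs == 1 the
 * message is two MRFs long (header plus data), so the value to spill is
 * expected in the MRF directly after the header MRF.  g12, m1/m2 and the
 * 16-byte-aligned offset are placeholders.
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD),
 *            retype(brw_vec8_grf(12, 0), BRW_REGISTER_TYPE_UD));
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, 64);
 */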
1720
1721
1722/**
1723 * Read a block of owords (half a GRF each) from the scratch buffer
1724 * using a constant index per channel.
1725 *
1726 * Offset must be aligned to oword size (16 bytes).  Used for register
1727 * spilling.
1728 */
1729void
1730brw_oword_block_read_scratch(struct brw_compile *p,
1731			     struct brw_reg dest,
1732			     struct brw_reg mrf,
1733			     int num_regs,
1734			     GLuint offset)
1735{
1736   struct intel_context *intel = &p->brw->intel;
1737   uint32_t msg_control;
1738   int rlen;
1739
1740   if (intel->gen >= 6)
1741      offset /= 16;
1742
1743   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1744   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1745
1746   if (num_regs == 1) {
1747      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1748      rlen = 1;
1749   } else {
1750      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1751      rlen = 2;
1752   }
1753
1754   {
1755      brw_push_insn_state(p);
1756      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1757      brw_set_mask_control(p, BRW_MASK_DISABLE);
1758
1759      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1760
1761      /* set message header global offset field (reg 0, element 2) */
1762      brw_MOV(p,
1763	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1764				  mrf.nr,
1765				  2), BRW_REGISTER_TYPE_UD),
1766	      brw_imm_ud(offset));
1767
1768      brw_pop_insn_state(p);
1769   }
1770
1771   {
1772      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1773
1774      assert(insn->header.predicate_control == 0);
1775      insn->header.compression_control = BRW_COMPRESSION_NONE;
1776      insn->header.destreg__conditionalmod = mrf.nr;
1777
1778      brw_set_dest(p, insn, dest);	/* dest was retyped to UW above */
1779      if (intel->gen >= 6) {
1780	 brw_set_src0(p, insn, mrf);
1781      } else {
1782	 brw_set_src0(p, insn, brw_null_reg());
1783      }
1784
1785      brw_set_dp_read_message(p,
1786			      insn,
1787			      255, /* binding table index (255=stateless) */
1788			      msg_control,
1789			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1790			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1791			      1, /* msg_length */
1792			      rlen);
1793   }
1794}
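
/* Usage sketch (illustrative only): re-load (unspill) the register written in
 * the example above.  dest receives num_regs GRFs; mrf is only used to build
 * the message header.  Register numbers and the offset are placeholders.
 *
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(12, 0),
 *                                 brw_message_reg(1), 1, 64);
 */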
1795
1796/**
1797 * Read a float[4] vector from the data port Data Cache (const buffer).
1798 * Location (in buffer) should be a multiple of 16.
1799 * Used for fetching shader constants.
1800 */
1801void brw_oword_block_read(struct brw_compile *p,
1802			  struct brw_reg dest,
1803			  struct brw_reg mrf,
1804			  uint32_t offset,
1805			  uint32_t bind_table_index)
1806{
1807   struct intel_context *intel = &p->brw->intel;
1808
1809   /* On newer hardware, offset is in units of owords. */
1810   if (intel->gen >= 6)
1811      offset /= 16;
1812
1813   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1814
1815   brw_push_insn_state(p);
1816   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1817   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1818   brw_set_mask_control(p, BRW_MASK_DISABLE);
1819
1820   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1821
1822   /* set message header global offset field (reg 0, element 2) */
1823   brw_MOV(p,
1824	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1825			       mrf.nr,
1826			       2), BRW_REGISTER_TYPE_UD),
1827	   brw_imm_ud(offset));
1828
1829   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1830   insn->header.destreg__conditionalmod = mrf.nr;
1831
1832   /* cast dest to a uword[8] vector */
1833   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1834
1835   brw_set_dest(p, insn, dest);
1836   if (intel->gen >= 6) {
1837      brw_set_src0(p, insn, mrf);
1838   } else {
1839      brw_set_src0(p, insn, brw_null_reg());
1840   }
1841
1842   brw_set_dp_read_message(p,
1843			   insn,
1844			   bind_table_index,
1845			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1846			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1847			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1848			   1, /* msg_length */
1849			   1); /* response_length (1 reg, 2 owords!) */
1850
1851   brw_pop_insn_state(p);
1852}
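
/* Usage sketch (illustrative only): fetch one float[4] from offset 32 (a
 * multiple of 16, as required) of the surface at binding-table slot 0 into
 * the low half of g4.  The surface index, offset and registers are
 * placeholders chosen for the example.
 *
 *    brw_oword_block_read(p, brw_vec8_grf(4, 0), brw_message_reg(1), 32, 0);
 */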
1853
1854/**
1855 * Read a set of dwords from the data port Data Cache (const buffer).
1856 *
1857 * Location (in buffer) appears as UD offsets in the register after
1858 * the provided mrf header reg.
1859 */
1860void brw_dword_scattered_read(struct brw_compile *p,
1861			      struct brw_reg dest,
1862			      struct brw_reg mrf,
1863			      uint32_t bind_table_index)
1864{
1865   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1866
1867   brw_push_insn_state(p);
1868   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1869   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1870   brw_set_mask_control(p, BRW_MASK_DISABLE);
1871   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1872   brw_pop_insn_state(p);
1873
1874   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1875   insn->header.destreg__conditionalmod = mrf.nr;
1876
1877   /* cast dest to a uword[8] vector */
1878   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1879
1880   brw_set_dest(p, insn, dest);
1881   brw_set_src0(p, insn, brw_null_reg());
1882
1883   brw_set_dp_read_message(p,
1884			   insn,
1885			   bind_table_index,
1886			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1887			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1888			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1889			   2, /* msg_length */
1890			   1); /* response_length */
1891}
1892
1893
1894
1895/**
1896 * Read float[4] constant(s) from VS constant buffer.
1897 * For relative addressing, two float[4] constants will be read into 'dest'.
1898 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1899 */
1900void brw_dp_READ_4_vs(struct brw_compile *p,
1901                      struct brw_reg dest,
1902                      GLuint location,
1903                      GLuint bind_table_index)
1904{
1905   struct intel_context *intel = &p->brw->intel;
1906   struct brw_instruction *insn;
1907   GLuint msg_reg_nr = 1;
1908
1909   if (intel->gen >= 6)
1910      location /= 16;
1911
1912   /* Setup MRF[1] with location/offset into const buffer */
1913   brw_push_insn_state(p);
1914   brw_set_access_mode(p, BRW_ALIGN_1);
1915   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1916   brw_set_mask_control(p, BRW_MASK_DISABLE);
1917   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1918   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1919		     BRW_REGISTER_TYPE_UD),
1920	   brw_imm_ud(location));
1921   brw_pop_insn_state(p);
1922
1923   insn = next_insn(p, BRW_OPCODE_SEND);
1924
1925   insn->header.predicate_control = BRW_PREDICATE_NONE;
1926   insn->header.compression_control = BRW_COMPRESSION_NONE;
1927   insn->header.destreg__conditionalmod = msg_reg_nr;
1928   insn->header.mask_control = BRW_MASK_DISABLE;
1929
1930   brw_set_dest(p, insn, dest);
1931   if (intel->gen >= 6) {
1932      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
1933   } else {
1934      brw_set_src0(p, insn, brw_null_reg());
1935   }
1936
1937   brw_set_dp_read_message(p,
1938			   insn,
1939			   bind_table_index,
1940			   0,
1941			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1942			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1943			   1, /* msg_length */
1944			   1); /* response_length (1 Oword) */
1945}
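
/* Usage sketch (illustrative only): load one float[4] VS constant into the
 * low half of g4 from binding-table slot 0.  The location and surface index
 * are placeholders; MRF 1 is used internally for the message header.
 *
 *    brw_dp_READ_4_vs(p, brw_vec8_grf(4, 0), 48, 0);
 */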
1946
1947/**
1948 * Read a float[4] constant per vertex from VS constant buffer, with
1949 * relative addressing.
1950 */
1951void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1952			       struct brw_reg dest,
1953			       struct brw_reg addr_reg,
1954			       GLuint offset,
1955			       GLuint bind_table_index)
1956{
1957   struct intel_context *intel = &p->brw->intel;
1958   struct brw_reg src = brw_vec8_grf(0, 0);
1959   int msg_type;
1960
1961   /* Setup MRF[1] with offset into const buffer */
1962   brw_push_insn_state(p);
1963   brw_set_access_mode(p, BRW_ALIGN_1);
1964   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1965   brw_set_mask_control(p, BRW_MASK_DISABLE);
1966   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1967
1968   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1969    * fields ignored.
1970    */
1971   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
1972	   addr_reg, brw_imm_d(offset));
1973   brw_pop_insn_state(p);
1974
1975   gen6_resolve_implied_move(p, &src, 0);
1976   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1977
1978   insn->header.predicate_control = BRW_PREDICATE_NONE;
1979   insn->header.compression_control = BRW_COMPRESSION_NONE;
1980   insn->header.destreg__conditionalmod = 0;
1981   insn->header.mask_control = BRW_MASK_DISABLE;
1982
1983   brw_set_dest(p, insn, dest);
1984   brw_set_src0(p, insn, src);
1985
1986   if (intel->gen >= 6)
1987      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1988   else if (intel->gen == 5 || intel->is_g4x)
1989      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1990   else
1991      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1992
1993   brw_set_dp_read_message(p,
1994			   insn,
1995			   bind_table_index,
1996			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1997			   msg_type,
1998			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1999			   2, /* msg_length */
2000			   1); /* response_length */
2001}
2002
2003
2004
2005void brw_fb_WRITE(struct brw_compile *p,
2006		  int dispatch_width,
2007                  GLuint msg_reg_nr,
2008                  struct brw_reg src0,
2009                  GLuint binding_table_index,
2010                  GLuint msg_length,
2011                  GLuint response_length,
2012                  bool eot,
2013                  bool header_present)
2014{
2015   struct intel_context *intel = &p->brw->intel;
2016   struct brw_instruction *insn;
2017   GLuint msg_control, msg_type;
2018   struct brw_reg dest;
2019
2020   if (dispatch_width == 16)
2021      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2022   else
2023      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2024
2025   if (intel->gen >= 6 && binding_table_index == 0) {
2026      insn = next_insn(p, BRW_OPCODE_SENDC);
2027   } else {
2028      insn = next_insn(p, BRW_OPCODE_SEND);
2029   }
2030   /* The execution mask is ignored for render target writes. */
2031   insn->header.predicate_control = 0;
2032   insn->header.compression_control = BRW_COMPRESSION_NONE;
2033
2034   if (intel->gen >= 6) {
2035      /* headerless version, just submit color payload */
2036      src0 = brw_message_reg(msg_reg_nr);
2037
2038      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2039   } else {
2040      insn->header.destreg__conditionalmod = msg_reg_nr;
2041
2042      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2043   }
2044
2045   if (dispatch_width == 16)
2046      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2047   else
2048      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2049
2050   brw_set_dest(p, insn, dest);
2051   brw_set_src0(p, insn, src0);
2052   brw_set_dp_write_message(p,
2053			    insn,
2054			    binding_table_index,
2055			    msg_control,
2056			    msg_type,
2057			    msg_length,
2058			    header_present,
2059			    1, /* last render target write */
2060			    response_length,
2061			    eot,
2062			    0 /* send_commit_msg */);
2063}
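
/* Usage sketch (illustrative only): a SIMD16 render-target write that
 * terminates the thread.  The color payload is assumed to start at m2 and to
 * occupy 8 MRFs (four channels, two registers each), writing render target 0
 * with no header and no response.
 *
 *    brw_fb_WRITE(p, 16, 2, brw_message_reg(2), 0, 8, 0, true, false);
 */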
2064
2065
2066/**
2067 * Texture sample instruction.
2068 * Note: the msg_type plus msg_length values determine exactly what kind
2069 * of sampling operation is performed.  See volume 4, page 161 of docs.
2070 */
2071void brw_SAMPLE(struct brw_compile *p,
2072		struct brw_reg dest,
2073		GLuint msg_reg_nr,
2074		struct brw_reg src0,
2075		GLuint binding_table_index,
2076		GLuint sampler,
2077		GLuint writemask,
2078		GLuint msg_type,
2079		GLuint response_length,
2080		GLuint msg_length,
2081		GLuint header_present,
2082		GLuint simd_mode)
2083{
2084   struct intel_context *intel = &p->brw->intel;
2085   bool need_stall = false;
2086
2087   if (writemask == 0) {
2088      /*printf("%s: zero writemask??\n", __FUNCTION__); */
2089      return;
2090   }
2091
2092   /* Hardware doesn't do destination dependency checking on send
2093    * instructions properly.  Add a workaround which generates the
2094    * dependency by other means.  In practice it seems like this bug
2095    * only crops up for texture samples, and only where registers are
2096    * written by the send and then written again later without being
2097    * read in between.  Luckily for us, we already track that
2098    * information and use it to modify the writemask for the
2099    * instruction, so that is a guide for whether a workaround is
2100    * needed.
2101    */
2102   if (writemask != WRITEMASK_XYZW) {
2103      GLuint dst_offset = 0;
2104      GLuint i, newmask = 0, len = 0;
2105
2106      for (i = 0; i < 4; i++) {
2107	 if (writemask & (1<<i))
2108	    break;
2109	 dst_offset += 2;
2110      }
2111      for (; i < 4; i++) {
2112	 if (!(writemask & (1<<i)))
2113	    break;
2114	 newmask |= 1<<i;
2115	 len++;
2116      }
2117
2118      if (newmask != writemask) {
2119	 need_stall = true;
2120         /* printf("need stall %x %x\n", newmask , writemask); */
2121      }
2122      else {
2123	 bool dispatch_16 = false;
2124
2125	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2126
2127	 guess_execution_size(p, p->current, dest);
2128	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2129	    dispatch_16 = true;
2130
2131	 newmask = ~newmask & WRITEMASK_XYZW;
2132
2133	 brw_push_insn_state(p);
2134
2135	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2136	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2137
2138	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2139		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2140  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2141
2142	 brw_pop_insn_state(p);
2143
2144  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2145	 dest = offset(dest, dst_offset);
2146
2147	 /* For 16-wide dispatch, masked channels are skipped in the
2148	  * response.  For 8-wide, masked channels still take up slots,
2149	  * and are just not written to.
2150	  */
2151	 if (dispatch_16)
2152	    response_length = len * 2;
2153      }
2154   }
2155
2156   {
2157      struct brw_instruction *insn;
2158
2159      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2160
2161      insn = next_insn(p, BRW_OPCODE_SEND);
2162      insn->header.predicate_control = 0; /* XXX */
2163      insn->header.compression_control = BRW_COMPRESSION_NONE;
2164      if (intel->gen < 6)
2165	  insn->header.destreg__conditionalmod = msg_reg_nr;
2166
2167      brw_set_dest(p, insn, dest);
2168      brw_set_src0(p, insn, src0);
2169      brw_set_sampler_message(p, insn,
2170			      binding_table_index,
2171			      sampler,
2172			      msg_type,
2173			      response_length,
2174			      msg_length,
2175			      header_present,
2176			      simd_mode);
2177   }
2178
2179   if (need_stall) {
2180      struct brw_reg reg = vec8(offset(dest, response_length-1));
2181
2182      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2183       */
2184      brw_push_insn_state(p);
2185      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2186      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2187	      retype(reg, BRW_REGISTER_TYPE_UD));
2188      brw_pop_insn_state(p);
2189   }
2190
2191}
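
/* Usage sketch (illustrative only): a full-writemask SIMD8 sample, which
 * avoids the partial-writemask workaround above.  surf, sampler, msg_type and
 * simd_mode stand in for the caller's binding-table index, sampler index and
 * the gen-appropriate sampler message defines; a response_length of 4 covers
 * one destination GRF per channel in SIMD8, and a msg_length of 3 assumes a
 * header plus two coordinate registers.
 *
 *    brw_SAMPLE(p, retype(brw_vec8_grf(8, 0), BRW_REGISTER_TYPE_UW), 2,
 *               brw_message_reg(2), surf, sampler, WRITEMASK_XYZW,
 *               msg_type, 4, 3, true, simd_mode);
 */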
2192
2193/* All these variables are pretty confusing - we might be better off
2194 * using bitmasks and macros for this, in the old style.  Or perhaps
2195 * just having the caller instantiate the fields in dword3 itself.
2196 */
2197void brw_urb_WRITE(struct brw_compile *p,
2198		   struct brw_reg dest,
2199		   GLuint msg_reg_nr,
2200		   struct brw_reg src0,
2201		   bool allocate,
2202		   bool used,
2203		   GLuint msg_length,
2204		   GLuint response_length,
2205		   bool eot,
2206		   bool writes_complete,
2207		   GLuint offset,
2208		   GLuint swizzle)
2209{
2210   struct intel_context *intel = &p->brw->intel;
2211   struct brw_instruction *insn;
2212
2213   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2214
2215   if (intel->gen == 7) {
2216      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2217      brw_push_insn_state(p);
2218      brw_set_access_mode(p, BRW_ALIGN_1);
2219      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2220		       BRW_REGISTER_TYPE_UD),
2221	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2222		brw_imm_ud(0xff00));
2223      brw_pop_insn_state(p);
2224   }
2225
2226   insn = next_insn(p, BRW_OPCODE_SEND);
2227
2228   assert(msg_length < BRW_MAX_MRF);
2229
2230   brw_set_dest(p, insn, dest);
2231   brw_set_src0(p, insn, src0);
2232   brw_set_src1(p, insn, brw_imm_d(0));
2233
2234   if (intel->gen < 6)
2235      insn->header.destreg__conditionalmod = msg_reg_nr;
2236
2237   brw_set_urb_message(p,
2238		       insn,
2239		       allocate,
2240		       used,
2241		       msg_length,
2242		       response_length,
2243		       eot,
2244		       writes_complete,
2245		       offset,
2246		       swizzle);
2247}
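
/* Usage sketch (illustrative only): the final URB write of a thread, sending
 * msg_len MRFs starting at m0, with no handle allocation, no response, and
 * EOT set.  msg_len, the offset and the swizzle mode are placeholders the
 * caller picks to match its payload layout.
 *
 *    brw_urb_WRITE(p, brw_null_reg(), 0, brw_vec8_grf(0, 0),
 *                  false, true, msg_len, 0, true, true, 0,
 *                  BRW_URB_SWIZZLE_INTERLEAVE);
 */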
2248
2249static int
2250brw_find_next_block_end(struct brw_compile *p, int start)
2251{
2252   int ip;
2253
2254   for (ip = start + 1; ip < p->nr_insn; ip++) {
2255      struct brw_instruction *insn = &p->store[ip];
2256
2257      switch (insn->header.opcode) {
2258      case BRW_OPCODE_ENDIF:
2259      case BRW_OPCODE_ELSE:
2260      case BRW_OPCODE_WHILE:
2261	 return ip;
2262      }
2263   }
2264   assert(!"not reached");
2265   return start + 1;
2266}
2267
2268/* There is no DO instruction on gen6, so to find the end of the loop
2269 * we have to see if the loop is jumping back before our start
2270 * instruction.
2271 */
2272static int
2273brw_find_loop_end(struct brw_compile *p, int start)
2274{
2275   struct intel_context *intel = &p->brw->intel;
2276   int ip;
2277   int br = 2;   /* JIP/UIP offsets count 8-byte units; instructions are 16 bytes */
2278
2279   for (ip = start + 1; ip < p->nr_insn; ip++) {
2280      struct brw_instruction *insn = &p->store[ip];
2281
2282      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2283	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2284				   : insn->bits3.break_cont.jip;
2285	 if (ip + jip / br <= start)
2286	    return ip;
2287      }
2288   }
2289   assert(!"not reached");
2290   return start + 1;
2291}
2292
2293/* After program generation, go back and update the UIP and JIP of
2294 * BREAK and CONT instructions to their correct locations.
2295 */
2296void
2297brw_set_uip_jip(struct brw_compile *p)
2298{
2299   struct intel_context *intel = &p->brw->intel;
2300   int ip;
2301   int br = 2;   /* JIP/UIP offsets count 8-byte units; instructions are 16 bytes */
2302
2303   if (intel->gen < 6)
2304      return;
2305
2306   for (ip = 0; ip < p->nr_insn; ip++) {
2307      struct brw_instruction *insn = &p->store[ip];
2308
2309      switch (insn->header.opcode) {
2310      case BRW_OPCODE_BREAK:
2311	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2312	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2313	 insn->bits3.break_cont.uip =
2314	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
2315	 break;
2316      case BRW_OPCODE_CONTINUE:
2317	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2318	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
2319
2320	 assert(insn->bits3.break_cont.uip != 0);
2321	 assert(insn->bits3.break_cont.jip != 0);
2322	 break;
2323      }
2324   }
2325}
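
/* Worked example (hypothetical instruction offsets): suppose a BREAK sits at
 * ip 12, the next ENDIF/ELSE/WHILE after it is at ip 15, and the loop's WHILE
 * is at ip 20.  With br = 2 this gives jip = 2 * (15 - 12) = 6 on both gens,
 * uip = 2 * (20 - 12 + 1) = 18 on gen6 (just past the WHILE), and
 * uip = 2 * (20 - 12) = 16 on gen7 (the WHILE itself).
 */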
2326
2327void brw_ff_sync(struct brw_compile *p,
2328		   struct brw_reg dest,
2329		   GLuint msg_reg_nr,
2330		   struct brw_reg src0,
2331		   bool allocate,
2332		   GLuint response_length,
2333		   bool eot)
2334{
2335   struct intel_context *intel = &p->brw->intel;
2336   struct brw_instruction *insn;
2337
2338   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2339
2340   insn = next_insn(p, BRW_OPCODE_SEND);
2341   brw_set_dest(p, insn, dest);
2342   brw_set_src0(p, insn, src0);
2343   brw_set_src1(p, insn, brw_imm_d(0));
2344
2345   if (intel->gen < 6)
2346      insn->header.destreg__conditionalmod = msg_reg_nr;
2347
2348   brw_set_ff_sync_message(p,
2349			   insn,
2350			   allocate,
2351			   response_length,
2352			   eot);
2353}
2354