brw_eu_emit.c revision a73c65c5342bf41fa0dfefe7daa9197ce6a11db4
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  GLuint msg_reg_nr)
65{
66   struct intel_context *intel = &p->brw->intel;
67   if (intel->gen < 6)
68      return;
69
70   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
71      brw_push_insn_state(p);
72      brw_set_mask_control(p, BRW_MASK_DISABLE);
73      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
74      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
75	      retype(*src, BRW_REGISTER_TYPE_UD));
76      brw_pop_insn_state(p);
77   }
78   *src = brw_message_reg(msg_reg_nr);
79}
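
/* Usage sketch (illustrative only, not lifted from this file): the SEND-based
 * helpers below follow this pattern, resolving the implied move before the
 * payload register is referenced.  "payload", "dest" and "msg_reg_nr" are
 * hypothetical names.
 *
 *    struct brw_instruction *insn;
 *
 *    gen6_resolve_implied_move(p, &payload, msg_reg_nr);
 *    insn = brw_next_insn(p, BRW_OPCODE_SEND);
 *    insn->header.destreg__conditionalmod = msg_reg_nr;
 *    brw_set_dest(p, insn, dest);
 *    brw_set_src0(p, insn, payload);
 *    (then fill in the message descriptor for the target shared function)
 *
 * After the call, "payload" refers to the message register (or its GRF alias
 * on gen7), regardless of what it pointed at before.
 */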
80
81static void
82gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
83{
84   struct intel_context *intel = &p->brw->intel;
85   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
86      reg->file = BRW_GENERAL_REGISTER_FILE;
87      reg->nr += 111;
88   }
89}
90
91
92void
93brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
94	     struct brw_reg dest)
95{
96   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
97       dest.file != BRW_MESSAGE_REGISTER_FILE)
98      assert(dest.nr < 128);
99
100   gen7_convert_mrf_to_grf(p, &dest);
101
102   insn->bits1.da1.dest_reg_file = dest.file;
103   insn->bits1.da1.dest_reg_type = dest.type;
104   insn->bits1.da1.dest_address_mode = dest.address_mode;
105
106   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
107      insn->bits1.da1.dest_reg_nr = dest.nr;
108
109      if (insn->header.access_mode == BRW_ALIGN_1) {
110	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
111	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
112	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
113	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
114      }
115      else {
116	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
117	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
118	 /* Even though this is ignored in da16, it still needs to be set to '01'. */

119	 insn->bits1.da16.dest_horiz_stride = 1;
120      }
121   }
122   else {
123      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
124
125      /* These are different sizes in align1 vs align16:
126       */
127      if (insn->header.access_mode == BRW_ALIGN_1) {
128	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
129	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
130	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
131	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
132      }
133      else {
134	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
135	 /* Likewise ignored in ia16, but it still needs to be set to '01'. */
136	 insn->bits1.ia16.dest_horiz_stride = 1;
137      }
138   }
139
140   /* Set the execution size based on dest.width and the current
141    * compression state:
142    */
143   guess_execution_size(p, insn, dest);
144}
145
146extern int reg_type_size[];
147
148static void
149validate_reg(struct brw_instruction *insn, struct brw_reg reg)
150{
151   int hstride_for_reg[] = {0, 1, 2, 4};
152   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
153   int width_for_reg[] = {1, 2, 4, 8, 16};
154   int execsize_for_reg[] = {1, 2, 4, 8, 16};
155   int width, hstride, vstride, execsize;
156
157   if (reg.file == BRW_IMMEDIATE_VALUE) {
158      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
159       * mean the destination has to be 128-bit aligned and the
160       * destination horiz stride has to be a word.
161       */
162      if (reg.type == BRW_REGISTER_TYPE_V) {
163	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
164		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
165      }
166
167      return;
168   }
169
170   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
171       reg.nr == BRW_ARF_NULL)
172      return;
173
174   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
175   hstride = hstride_for_reg[reg.hstride];
176
177   if (reg.vstride == 0xf) {
178      vstride = -1;
179   } else {
180      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
181      vstride = vstride_for_reg[reg.vstride];
182   }
183
184   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
185   width = width_for_reg[reg.width];
186
187   assert(insn->header.execution_size >= 0 &&
188	  insn->header.execution_size < Elements(execsize_for_reg));
189   execsize = execsize_for_reg[insn->header.execution_size];
190
191   /* Restrictions from 3.3.10: Register Region Restrictions. */
192   /* 3. */
193   assert(execsize >= width);
194
195   /* 4. */
196   if (execsize == width && hstride != 0) {
197      assert(vstride == -1 || vstride == width * hstride);
198   }
199
200   /* 5. */
201   if (execsize == width && hstride == 0) {
202      /* no restriction on vstride. */
203   }
204
205   /* 6. */
206   if (width == 1) {
207      assert(hstride == 0);
208   }
209
210   /* 7. */
211   if (execsize == 1 && width == 1) {
212      assert(hstride == 0);
213      assert(vstride == 0);
214   }
215
216   /* 8. */
217   if (vstride == 0 && hstride == 0) {
218      assert(width == 1);
219   }
220
221   /* 10. Check destination issues. */
222}
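
/* Worked example (informative): a typical SIMD8 float source with region
 * <8;8,1> decodes to vstride 8, width 8, hstride 1.  With an execution size
 * of 8, rule 3 holds (8 >= 8) and rule 4 holds (vstride 8 == width 8 *
 * hstride 1).  A scalar <0;1,0> source decodes to vstride 0, width 1,
 * hstride 0, which is exactly what rules 6-8 require.
 */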
223
224void
225brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
226	     struct brw_reg reg)
227{
228   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
229      assert(reg.nr < 128);
230
231   gen7_convert_mrf_to_grf(p, &reg);
232
233   validate_reg(insn, reg);
234
235   insn->bits1.da1.src0_reg_file = reg.file;
236   insn->bits1.da1.src0_reg_type = reg.type;
237   insn->bits2.da1.src0_abs = reg.abs;
238   insn->bits2.da1.src0_negate = reg.negate;
239   insn->bits2.da1.src0_address_mode = reg.address_mode;
240
241   if (reg.file == BRW_IMMEDIATE_VALUE) {
242      insn->bits3.ud = reg.dw1.ud;
243
244      /* Required to set some fields in src1 as well:
245       */
246      insn->bits1.da1.src1_reg_file = 0; /* arf */
247      insn->bits1.da1.src1_reg_type = reg.type;
248   }
249   else
250   {
251      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
252	 if (insn->header.access_mode == BRW_ALIGN_1) {
253	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
254	    insn->bits2.da1.src0_reg_nr = reg.nr;
255	 }
256	 else {
257	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
258	    insn->bits2.da16.src0_reg_nr = reg.nr;
259	 }
260      }
261      else {
262	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
263
264	 if (insn->header.access_mode == BRW_ALIGN_1) {
265	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
266	 }
267	 else {
268	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
269	 }
270      }
271
272      if (insn->header.access_mode == BRW_ALIGN_1) {
273	 if (reg.width == BRW_WIDTH_1 &&
274	     insn->header.execution_size == BRW_EXECUTE_1) {
275	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
276	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
277	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
278	 }
279	 else {
280	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
281	    insn->bits2.da1.src0_width = reg.width;
282	    insn->bits2.da1.src0_vert_stride = reg.vstride;
283	 }
284      }
285      else {
286	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
287	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
288	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
289	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
290
291	 /* This is a quirk of reusing the align_1 region descriptions for
292	  * registers in align_16 mode:
293	  */
294	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
295	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
296	 else
297	    insn->bits2.da16.src0_vert_stride = reg.vstride;
298      }
299   }
300}
301
302
303void brw_set_src1(struct brw_compile *p,
304		  struct brw_instruction *insn,
305		  struct brw_reg reg)
306{
307   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
308
309   assert(reg.nr < 128);
310
311   gen7_convert_mrf_to_grf(p, &reg);
312
313   validate_reg(insn, reg);
314
315   insn->bits1.da1.src1_reg_file = reg.file;
316   insn->bits1.da1.src1_reg_type = reg.type;
317   insn->bits3.da1.src1_abs = reg.abs;
318   insn->bits3.da1.src1_negate = reg.negate;
319
320   /* Only src1 can be immediate in two-argument instructions.
321    */
322   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
323
324   if (reg.file == BRW_IMMEDIATE_VALUE) {
325      insn->bits3.ud = reg.dw1.ud;
326   }
327   else {
328      /* This is a hardware restriction, which may or may not be lifted
329       * in the future:
330       */
331      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
332      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
333
334      if (insn->header.access_mode == BRW_ALIGN_1) {
335	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
336	 insn->bits3.da1.src1_reg_nr = reg.nr;
337      }
338      else {
339	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
340	 insn->bits3.da16.src1_reg_nr = reg.nr;
341      }
342
343      if (insn->header.access_mode == BRW_ALIGN_1) {
344	 if (reg.width == BRW_WIDTH_1 &&
345	     insn->header.execution_size == BRW_EXECUTE_1) {
346	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
347	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
348	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
349	 }
350	 else {
351	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
352	    insn->bits3.da1.src1_width = reg.width;
353	    insn->bits3.da1.src1_vert_stride = reg.vstride;
354	 }
355      }
356      else {
357	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
358	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
359	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
360	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
361
362	 /* This is a quirk of reusing the align_1 region descriptions for
363	  * registers in align_16 mode:
364	  */
365	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
366	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
367	 else
368	    insn->bits3.da16.src1_vert_stride = reg.vstride;
369      }
370   }
371}
372
373/**
374 * Set the Message Descriptor and Extended Message Descriptor fields
375 * for SEND messages.
376 *
377 * \note This zeroes out the Function Control bits, so it must be called
378 *       \b before filling out any message-specific data.  Callers can
379 *       choose not to fill in irrelevant bits; they will be zero.
380 */
381static void
382brw_set_message_descriptor(struct brw_compile *p,
383			   struct brw_instruction *inst,
384			   enum brw_message_target sfid,
385			   unsigned msg_length,
386			   unsigned response_length,
387			   bool header_present,
388			   bool end_of_thread)
389{
390   struct intel_context *intel = &p->brw->intel;
391
392   brw_set_src1(p, inst, brw_imm_d(0));
393
394   if (intel->gen >= 5) {
395      inst->bits3.generic_gen5.header_present = header_present;
396      inst->bits3.generic_gen5.response_length = response_length;
397      inst->bits3.generic_gen5.msg_length = msg_length;
398      inst->bits3.generic_gen5.end_of_thread = end_of_thread;
399
400      if (intel->gen >= 6) {
401	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
402	 inst->header.destreg__conditionalmod = sfid;
403      } else {
404	 /* Set Extended Message Descriptor (ex_desc) */
405	 inst->bits2.send_gen5.sfid = sfid;
406	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
407      }
408   } else {
409      inst->bits3.generic.response_length = response_length;
410      inst->bits3.generic.msg_length = msg_length;
411      inst->bits3.generic.msg_target = sfid;
412      inst->bits3.generic.end_of_thread = end_of_thread;
413   }
414}
415
416static void brw_set_math_message( struct brw_compile *p,
417				  struct brw_instruction *insn,
418				  GLuint function,
419				  GLuint integer_type,
420				  bool low_precision,
421				  bool saturate,
422				  GLuint dataType )
423{
424   struct brw_context *brw = p->brw;
425   struct intel_context *intel = &brw->intel;
426   unsigned msg_length;
427   unsigned response_length;
428
429   /* Infer message length from the function */
430   switch (function) {
431   case BRW_MATH_FUNCTION_POW:
432   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
433   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
434   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
435      msg_length = 2;
436      break;
437   default:
438      msg_length = 1;
439      break;
440   }
441
442   /* Infer response length from the function */
443   switch (function) {
444   case BRW_MATH_FUNCTION_SINCOS:
445   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
446      response_length = 2;
447      break;
448   default:
449      response_length = 1;
450      break;
451   }
452
453   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
454			      msg_length, response_length, false, false);
455   if (intel->gen == 5) {
456      insn->bits3.math_gen5.function = function;
457      insn->bits3.math_gen5.int_type = integer_type;
458      insn->bits3.math_gen5.precision = low_precision;
459      insn->bits3.math_gen5.saturate = saturate;
460      insn->bits3.math_gen5.data_type = dataType;
461      insn->bits3.math_gen5.snapshot = 0;
462   } else {
463      insn->bits3.math.function = function;
464      insn->bits3.math.int_type = integer_type;
465      insn->bits3.math.precision = low_precision;
466      insn->bits3.math.saturate = saturate;
467      insn->bits3.math.data_type = dataType;
468   }
469}
470
471
472static void brw_set_ff_sync_message(struct brw_compile *p,
473				    struct brw_instruction *insn,
474				    bool allocate,
475				    GLuint response_length,
476				    bool end_of_thread)
477{
478   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
479			      1, response_length, true, end_of_thread);
480   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
481   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
482   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
483   insn->bits3.urb_gen5.allocate = allocate;
484   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
485   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
486}
487
488static void brw_set_urb_message( struct brw_compile *p,
489				 struct brw_instruction *insn,
490				 bool allocate,
491				 bool used,
492				 GLuint msg_length,
493				 GLuint response_length,
494				 bool end_of_thread,
495				 bool complete,
496				 GLuint offset,
497				 GLuint swizzle_control )
498{
499   struct brw_context *brw = p->brw;
500   struct intel_context *intel = &brw->intel;
501
502   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
503			      msg_length, response_length, true, end_of_thread);
504   if (intel->gen == 7) {
505      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
506      insn->bits3.urb_gen7.offset = offset;
507      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
508      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
509      /* per_slot_offset = 0 makes it ignore offsets in message header */
510      insn->bits3.urb_gen7.per_slot_offset = 0;
511      insn->bits3.urb_gen7.complete = complete;
512   } else if (intel->gen >= 5) {
513      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
514      insn->bits3.urb_gen5.offset = offset;
515      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
516      insn->bits3.urb_gen5.allocate = allocate;
517      insn->bits3.urb_gen5.used = used;	/* ? */
518      insn->bits3.urb_gen5.complete = complete;
519   } else {
520      insn->bits3.urb.opcode = 0;	/* ? */
521      insn->bits3.urb.offset = offset;
522      insn->bits3.urb.swizzle_control = swizzle_control;
523      insn->bits3.urb.allocate = allocate;
524      insn->bits3.urb.used = used;	/* ? */
525      insn->bits3.urb.complete = complete;
526   }
527}
528
529void
530brw_set_dp_write_message(struct brw_compile *p,
531			 struct brw_instruction *insn,
532			 GLuint binding_table_index,
533			 GLuint msg_control,
534			 GLuint msg_type,
535			 GLuint msg_length,
536			 bool header_present,
537			 GLuint last_render_target,
538			 GLuint response_length,
539			 GLuint end_of_thread,
540			 GLuint send_commit_msg)
541{
542   struct brw_context *brw = p->brw;
543   struct intel_context *intel = &brw->intel;
544   unsigned sfid;
545
546   if (intel->gen >= 7) {
547      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
548      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
549	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
550      else
551	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
552   } else if (intel->gen == 6) {
553      /* Use the render cache for all write messages. */
554      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
555   } else {
556      sfid = BRW_SFID_DATAPORT_WRITE;
557   }
558
559   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
560			      header_present, end_of_thread);
561
562   if (intel->gen >= 7) {
563      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
564      insn->bits3.gen7_dp.msg_control = msg_control;
565      insn->bits3.gen7_dp.last_render_target = last_render_target;
566      insn->bits3.gen7_dp.msg_type = msg_type;
567   } else if (intel->gen == 6) {
568      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
569      insn->bits3.gen6_dp.msg_control = msg_control;
570      insn->bits3.gen6_dp.last_render_target = last_render_target;
571      insn->bits3.gen6_dp.msg_type = msg_type;
572      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
573   } else if (intel->gen == 5) {
574      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
575      insn->bits3.dp_write_gen5.msg_control = msg_control;
576      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
577      insn->bits3.dp_write_gen5.msg_type = msg_type;
578      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
579   } else {
580      insn->bits3.dp_write.binding_table_index = binding_table_index;
581      insn->bits3.dp_write.msg_control = msg_control;
582      insn->bits3.dp_write.last_render_target = last_render_target;
583      insn->bits3.dp_write.msg_type = msg_type;
584      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
585   }
586}
587
588void
589brw_set_dp_read_message(struct brw_compile *p,
590			struct brw_instruction *insn,
591			GLuint binding_table_index,
592			GLuint msg_control,
593			GLuint msg_type,
594			GLuint target_cache,
595			GLuint msg_length,
596			GLuint response_length)
597{
598   struct brw_context *brw = p->brw;
599   struct intel_context *intel = &brw->intel;
600   unsigned sfid;
601
602   if (intel->gen >= 7) {
603      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
604   } else if (intel->gen == 6) {
605      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
606	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
607      else
608	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
609   } else {
610      sfid = BRW_SFID_DATAPORT_READ;
611   }
612
613   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
614			      true, false);
615
616   if (intel->gen >= 7) {
617      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
618      insn->bits3.gen7_dp.msg_control = msg_control;
619      insn->bits3.gen7_dp.last_render_target = 0;
620      insn->bits3.gen7_dp.msg_type = msg_type;
621   } else if (intel->gen == 6) {
622      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
623      insn->bits3.gen6_dp.msg_control = msg_control;
624      insn->bits3.gen6_dp.last_render_target = 0;
625      insn->bits3.gen6_dp.msg_type = msg_type;
626      insn->bits3.gen6_dp.send_commit_msg = 0;
627   } else if (intel->gen == 5) {
628      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
629      insn->bits3.dp_read_gen5.msg_control = msg_control;
630      insn->bits3.dp_read_gen5.msg_type = msg_type;
631      insn->bits3.dp_read_gen5.target_cache = target_cache;
632   } else if (intel->is_g4x) {
633      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
634      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
635      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
636      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
637   } else {
638      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
639      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
640      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
641      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
642   }
643}
644
645static void brw_set_sampler_message(struct brw_compile *p,
646                                    struct brw_instruction *insn,
647                                    GLuint binding_table_index,
648                                    GLuint sampler,
649                                    GLuint msg_type,
650                                    GLuint response_length,
651                                    GLuint msg_length,
652                                    GLuint header_present,
653                                    GLuint simd_mode)
654{
655   struct brw_context *brw = p->brw;
656   struct intel_context *intel = &brw->intel;
657
658   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
659			      response_length, header_present, false);
660
661   if (intel->gen >= 7) {
662      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
663      insn->bits3.sampler_gen7.sampler = sampler;
664      insn->bits3.sampler_gen7.msg_type = msg_type;
665      insn->bits3.sampler_gen7.simd_mode = simd_mode;
666   } else if (intel->gen >= 5) {
667      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
668      insn->bits3.sampler_gen5.sampler = sampler;
669      insn->bits3.sampler_gen5.msg_type = msg_type;
670      insn->bits3.sampler_gen5.simd_mode = simd_mode;
671   } else if (intel->is_g4x) {
672      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
673      insn->bits3.sampler_g4x.sampler = sampler;
674      insn->bits3.sampler_g4x.msg_type = msg_type;
675   } else {
676      insn->bits3.sampler.binding_table_index = binding_table_index;
677      insn->bits3.sampler.sampler = sampler;
678      insn->bits3.sampler.msg_type = msg_type;
679      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
680   }
681}
682
683
684#define next_insn brw_next_insn
685struct brw_instruction *
686brw_next_insn(struct brw_compile *p, GLuint opcode)
687{
688   struct brw_instruction *insn;
689
690   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
691
692   insn = &p->store[p->nr_insn++];
693   memcpy(insn, p->current, sizeof(*insn));
694
695   /* Reset this one-shot flag:
696    */
697
698   if (p->current->header.destreg__conditionalmod) {
699      p->current->header.destreg__conditionalmod = 0;
700      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
701   }
702
703   insn->header.opcode = opcode;
704   return insn;
705}
706
707static struct brw_instruction *brw_alu1( struct brw_compile *p,
708					 GLuint opcode,
709					 struct brw_reg dest,
710					 struct brw_reg src )
711{
712   struct brw_instruction *insn = next_insn(p, opcode);
713   brw_set_dest(p, insn, dest);
714   brw_set_src0(p, insn, src);
715   return insn;
716}
717
718static struct brw_instruction *brw_alu2(struct brw_compile *p,
719					GLuint opcode,
720					struct brw_reg dest,
721					struct brw_reg src0,
722					struct brw_reg src1 )
723{
724   struct brw_instruction *insn = next_insn(p, opcode);
725   brw_set_dest(p, insn, dest);
726   brw_set_src0(p, insn, src0);
727   brw_set_src1(p, insn, src1);
728   return insn;
729}
730
731
732/***********************************************************************
733 * Convenience routines.
734 */
735#define ALU1(OP)					\
736struct brw_instruction *brw_##OP(struct brw_compile *p,	\
737	      struct brw_reg dest,			\
738	      struct brw_reg src0)   			\
739{							\
740   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
741}
742
743#define ALU2(OP)					\
744struct brw_instruction *brw_##OP(struct brw_compile *p,	\
745	      struct brw_reg dest,			\
746	      struct brw_reg src0,			\
747	      struct brw_reg src1)   			\
748{							\
749   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
750}
751
752/* Rounding operations (other than RNDD) require two instructions - the first
753 * stores a rounded value (possibly the wrong way) in the dest register, but
754 * also sets a per-channel "increment bit" in the flag register.  A predicated
755 * add of 1.0 fixes dest to contain the desired result.
756 *
757 * Sandybridge and later appear to round correctly without an ADD.
758 */
759#define ROUND(OP)							      \
760void brw_##OP(struct brw_compile *p,					      \
761	      struct brw_reg dest,					      \
762	      struct brw_reg src)					      \
763{									      \
764   struct brw_instruction *rnd, *add;					      \
765   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
766   brw_set_dest(p, rnd, dest);						      \
767   brw_set_src0(p, rnd, src);						      \
768									      \
769   if (p->brw->intel.gen < 6) {						      \
770      /* turn on round-increments */					      \
771      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
772      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
773      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
774   }									      \
775}
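
/* Informative sketch (pseudo-assembly, register names hypothetical): on
 * pre-gen6 hardware, brw_RNDE(p, dst, src) emits roughly
 *
 *    rnde.r (8)    dst  src         sets per-channel increment bits in f0
 *    (+f0) add (8) dst  dst  1.0F   predicated fix-up
 *
 * On gen6 and later only the first instruction is emitted, since those
 * parts round correctly on their own.
 */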
776
777
778ALU1(MOV)
779ALU2(SEL)
780ALU1(NOT)
781ALU2(AND)
782ALU2(OR)
783ALU2(XOR)
784ALU2(SHR)
785ALU2(SHL)
786ALU2(RSR)
787ALU2(RSL)
788ALU2(ASR)
789ALU1(FRC)
790ALU1(RNDD)
791ALU2(MAC)
792ALU2(MACH)
793ALU1(LZD)
794ALU2(DP4)
795ALU2(DPH)
796ALU2(DP3)
797ALU2(DP2)
798ALU2(LINE)
799ALU2(PLN)
800
801
802ROUND(RNDZ)
803ROUND(RNDE)
804
805
806struct brw_instruction *brw_ADD(struct brw_compile *p,
807				struct brw_reg dest,
808				struct brw_reg src0,
809				struct brw_reg src1)
810{
811   /* 6.2.2: add */
812   if (src0.type == BRW_REGISTER_TYPE_F ||
813       (src0.file == BRW_IMMEDIATE_VALUE &&
814	src0.type == BRW_REGISTER_TYPE_VF)) {
815      assert(src1.type != BRW_REGISTER_TYPE_UD);
816      assert(src1.type != BRW_REGISTER_TYPE_D);
817   }
818
819   if (src1.type == BRW_REGISTER_TYPE_F ||
820       (src1.file == BRW_IMMEDIATE_VALUE &&
821	src1.type == BRW_REGISTER_TYPE_VF)) {
822      assert(src0.type != BRW_REGISTER_TYPE_UD);
823      assert(src0.type != BRW_REGISTER_TYPE_D);
824   }
825
826   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
827}
828
829struct brw_instruction *brw_MUL(struct brw_compile *p,
830				struct brw_reg dest,
831				struct brw_reg src0,
832				struct brw_reg src1)
833{
834   /* 6.32.38: mul */
835   if (src0.type == BRW_REGISTER_TYPE_D ||
836       src0.type == BRW_REGISTER_TYPE_UD ||
837       src1.type == BRW_REGISTER_TYPE_D ||
838       src1.type == BRW_REGISTER_TYPE_UD) {
839      assert(dest.type != BRW_REGISTER_TYPE_F);
840   }
841
842   if (src0.type == BRW_REGISTER_TYPE_F ||
843       (src0.file == BRW_IMMEDIATE_VALUE &&
844	src0.type == BRW_REGISTER_TYPE_VF)) {
845      assert(src1.type != BRW_REGISTER_TYPE_UD);
846      assert(src1.type != BRW_REGISTER_TYPE_D);
847   }
848
849   if (src1.type == BRW_REGISTER_TYPE_F ||
850       (src1.file == BRW_IMMEDIATE_VALUE &&
851	src1.type == BRW_REGISTER_TYPE_VF)) {
852      assert(src0.type != BRW_REGISTER_TYPE_UD);
853      assert(src0.type != BRW_REGISTER_TYPE_D);
854   }
855
856   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
857	  src0.nr != BRW_ARF_ACCUMULATOR);
858   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
859	  src1.nr != BRW_ARF_ACCUMULATOR);
860
861   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
862}
863
864
865void brw_NOP(struct brw_compile *p)
866{
867   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
868   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
869   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
870   brw_set_src1(p, insn, brw_imm_ud(0x0));
871}
872
873
874
875
876
877/***********************************************************************
878 * Comparisons, if/else/endif
879 */
880
881struct brw_instruction *brw_JMPI(struct brw_compile *p,
882                                 struct brw_reg dest,
883                                 struct brw_reg src0,
884                                 struct brw_reg src1)
885{
886   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
887
888   insn->header.execution_size = 1;
889   insn->header.compression_control = BRW_COMPRESSION_NONE;
890   insn->header.mask_control = BRW_MASK_DISABLE;
891
892   p->current->header.predicate_control = BRW_PREDICATE_NONE;
893
894   return insn;
895}
896
897static void
898push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
899{
900   p->if_stack[p->if_stack_depth] = inst;
901
902   p->if_stack_depth++;
903   if (p->if_stack_array_size <= p->if_stack_depth) {
904      p->if_stack_array_size *= 2;
905      p->if_stack = reralloc(p->mem_ctx, p->if_stack, struct brw_instruction *,
906			     p->if_stack_array_size);
907   }
908}
909
910/* EU takes the value from the flag register and pushes it onto some
911 * sort of a stack (presumably merging with any flag value already on
912 * the stack).  Within an if block, the flags at the top of the stack
913  * control execution on each channel of the unit, e.g. on each of the
914 * 16 pixel values in our wm programs.
915 *
916 * When the matching 'else' instruction is reached (presumably by
917 * countdown of the instruction count patched in by our ELSE/ENDIF
918  * functions), the relevant flags are inverted.
919 *
920 * When the matching 'endif' instruction is reached, the flags are
921 * popped off.  If the stack is now empty, normal execution resumes.
922 */
923struct brw_instruction *
924brw_IF(struct brw_compile *p, GLuint execute_size)
925{
926   struct intel_context *intel = &p->brw->intel;
927   struct brw_instruction *insn;
928
929   insn = next_insn(p, BRW_OPCODE_IF);
930
931   /* Override the defaults for this instruction:
932    */
933   if (intel->gen < 6) {
934      brw_set_dest(p, insn, brw_ip_reg());
935      brw_set_src0(p, insn, brw_ip_reg());
936      brw_set_src1(p, insn, brw_imm_d(0x0));
937   } else if (intel->gen == 6) {
938      brw_set_dest(p, insn, brw_imm_w(0));
939      insn->bits1.branch_gen6.jump_count = 0;
940      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
941      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
942   } else {
943      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
944      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
945      brw_set_src1(p, insn, brw_imm_ud(0));
946      insn->bits3.break_cont.jip = 0;
947      insn->bits3.break_cont.uip = 0;
948   }
949
950   insn->header.execution_size = execute_size;
951   insn->header.compression_control = BRW_COMPRESSION_NONE;
952   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
953   insn->header.mask_control = BRW_MASK_ENABLE;
954   if (!p->single_program_flow)
955      insn->header.thread_control = BRW_THREAD_SWITCH;
956
957   p->current->header.predicate_control = BRW_PREDICATE_NONE;
958
959   push_if_stack(p, insn);
960   return insn;
961}
962
963/* This function is only used for gen6-style IF instructions with an
964 * embedded comparison (conditional modifier).  It is not used on gen7.
965 */
966struct brw_instruction *
967gen6_IF(struct brw_compile *p, uint32_t conditional,
968	struct brw_reg src0, struct brw_reg src1)
969{
970   struct brw_instruction *insn;
971
972   insn = next_insn(p, BRW_OPCODE_IF);
973
974   brw_set_dest(p, insn, brw_imm_w(0));
975   if (p->compressed) {
976      insn->header.execution_size = BRW_EXECUTE_16;
977   } else {
978      insn->header.execution_size = BRW_EXECUTE_8;
979   }
980   insn->bits1.branch_gen6.jump_count = 0;
981   brw_set_src0(p, insn, src0);
982   brw_set_src1(p, insn, src1);
983
984   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
985   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
986   insn->header.destreg__conditionalmod = conditional;
987
988   if (!p->single_program_flow)
989      insn->header.thread_control = BRW_THREAD_SWITCH;
990
991   push_if_stack(p, insn);
992   return insn;
993}
994
995/**
996 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
997 */
998static void
999convert_IF_ELSE_to_ADD(struct brw_compile *p,
1000		       struct brw_instruction *if_inst,
1001		       struct brw_instruction *else_inst)
1002{
1003   /* The next instruction (where the ENDIF would be, if it existed) */
1004   struct brw_instruction *next_inst = &p->store[p->nr_insn];
1005
1006   assert(p->single_program_flow);
1007   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1008   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1009   assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1010
1011   /* Convert IF to an ADD instruction that moves the instruction pointer
1012    * to the first instruction of the ELSE block.  If there is no ELSE
1013    * block, point to where ENDIF would be.  Reverse the predicate.
1014    *
1015    * There's no need to execute an ENDIF since we don't need to do any
1016    * stack operations, and if we're currently executing, we just want to
1017    * continue normally.
1018    */
1019   if_inst->header.opcode = BRW_OPCODE_ADD;
1020   if_inst->header.predicate_inverse = 1;
1021
1022   if (else_inst != NULL) {
1023      /* Convert ELSE to an ADD instruction that points where the ENDIF
1024       * would be.
1025       */
1026      else_inst->header.opcode = BRW_OPCODE_ADD;
1027
1028      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1029      else_inst->bits3.ud = (next_inst - else_inst) * 16;
1030   } else {
1031      if_inst->bits3.ud = (next_inst - if_inst) * 16;
1032   }
1033}
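
/* Worked example (informative): in SPF mode, with the IF at instruction
 * index 5, an ELSE at index 9 and the next instruction at index 15, the IF
 * becomes a predicate-inverted ADD of (9 - 5 + 1) * 16 = 80 bytes to the
 * instruction pointer and the ELSE an unconditional ADD of (15 - 9) * 16 =
 * 96 bytes, each instruction being 16 bytes long.
 */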
1034
1035/**
1036 * Patch IF and ELSE instructions with appropriate jump targets.
1037 */
1038static void
1039patch_IF_ELSE(struct brw_compile *p,
1040	      struct brw_instruction *if_inst,
1041	      struct brw_instruction *else_inst,
1042	      struct brw_instruction *endif_inst)
1043{
1044   struct intel_context *intel = &p->brw->intel;
1045
1046   assert(!p->single_program_flow);
1047   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1048   assert(endif_inst != NULL);
1049   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1050
1051   unsigned br = 1;
1052   /* The jump count is in units of 64-bit chunks, so one 128-bit
1053    * instruction counts as two.
1054    */
1055   if (intel->gen >= 5)
1056      br = 2;
1057
1058   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1059   endif_inst->header.execution_size = if_inst->header.execution_size;
1060
1061   if (else_inst == NULL) {
1062      /* Patch IF -> ENDIF */
1063      if (intel->gen < 6) {
1064	 /* Turn it into an IFF, which means no mask stack operations for
1065	  * all-false and jumping past the ENDIF.
1066	  */
1067	 if_inst->header.opcode = BRW_OPCODE_IFF;
1068	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1069	 if_inst->bits3.if_else.pop_count = 0;
1070	 if_inst->bits3.if_else.pad0 = 0;
1071      } else if (intel->gen == 6) {
1072	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1073	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1074      } else {
1075	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1076	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1077      }
1078   } else {
1079      else_inst->header.execution_size = if_inst->header.execution_size;
1080
1081      /* Patch IF -> ELSE */
1082      if (intel->gen < 6) {
1083	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1084	 if_inst->bits3.if_else.pop_count = 0;
1085	 if_inst->bits3.if_else.pad0 = 0;
1086      } else if (intel->gen == 6) {
1087	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1088      }
1089
1090      /* Patch ELSE -> ENDIF */
1091      if (intel->gen < 6) {
1092	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1093	  * matching ENDIF.
1094	  */
1095	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1096	 else_inst->bits3.if_else.pop_count = 1;
1097	 else_inst->bits3.if_else.pad0 = 0;
1098      } else if (intel->gen == 6) {
1099	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1100	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1101      } else {
1102	 /* The IF instruction's JIP should point just past the ELSE */
1103	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1104	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1105	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1106	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1107      }
1108   }
1109}
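
/* Worked example (informative): on gen6 (br == 2), with the IF at
 * instruction index 10, an ELSE at 20 and the ENDIF at 30, the IF gets
 * jump_count 2 * (20 - 10 + 1) = 22 and the ELSE gets jump_count
 * 2 * (30 - 20) = 20, both measured in 64-bit chunks of the 128-bit
 * instructions.
 */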
1110
1111void
1112brw_ELSE(struct brw_compile *p)
1113{
1114   struct intel_context *intel = &p->brw->intel;
1115   struct brw_instruction *insn;
1116
1117   insn = next_insn(p, BRW_OPCODE_ELSE);
1118
1119   if (intel->gen < 6) {
1120      brw_set_dest(p, insn, brw_ip_reg());
1121      brw_set_src0(p, insn, brw_ip_reg());
1122      brw_set_src1(p, insn, brw_imm_d(0x0));
1123   } else if (intel->gen == 6) {
1124      brw_set_dest(p, insn, brw_imm_w(0));
1125      insn->bits1.branch_gen6.jump_count = 0;
1126      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1127      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1128   } else {
1129      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1130      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1131      brw_set_src1(p, insn, brw_imm_ud(0));
1132      insn->bits3.break_cont.jip = 0;
1133      insn->bits3.break_cont.uip = 0;
1134   }
1135
1136   insn->header.compression_control = BRW_COMPRESSION_NONE;
1137   insn->header.mask_control = BRW_MASK_ENABLE;
1138   if (!p->single_program_flow)
1139      insn->header.thread_control = BRW_THREAD_SWITCH;
1140
1141   push_if_stack(p, insn);
1142}
1143
1144void
1145brw_ENDIF(struct brw_compile *p)
1146{
1147   struct intel_context *intel = &p->brw->intel;
1148   struct brw_instruction *insn;
1149   struct brw_instruction *else_inst = NULL;
1150   struct brw_instruction *if_inst = NULL;
1151
1152   /* Pop the IF and (optional) ELSE instructions from the stack */
1153   p->if_stack_depth--;
1154   if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
1155      else_inst = p->if_stack[p->if_stack_depth];
1156      p->if_stack_depth--;
1157   }
1158   if_inst = p->if_stack[p->if_stack_depth];
1159
1160   if (p->single_program_flow) {
1161      /* ENDIF is useless; don't bother emitting it. */
1162      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1163      return;
1164   }
1165
1166   insn = next_insn(p, BRW_OPCODE_ENDIF);
1167
1168   if (intel->gen < 6) {
1169      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1170      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1171      brw_set_src1(p, insn, brw_imm_d(0x0));
1172   } else if (intel->gen == 6) {
1173      brw_set_dest(p, insn, brw_imm_w(0));
1174      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1175      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1176   } else {
1177      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1178      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1179      brw_set_src1(p, insn, brw_imm_ud(0));
1180   }
1181
1182   insn->header.compression_control = BRW_COMPRESSION_NONE;
1183   insn->header.mask_control = BRW_MASK_ENABLE;
1184   insn->header.thread_control = BRW_THREAD_SWITCH;
1185
1186   /* Also pop item off the stack in the endif instruction: */
1187   if (intel->gen < 6) {
1188      insn->bits3.if_else.jump_count = 0;
1189      insn->bits3.if_else.pop_count = 1;
1190      insn->bits3.if_else.pad0 = 0;
1191   } else if (intel->gen == 6) {
1192      insn->bits1.branch_gen6.jump_count = 2;
1193   } else {
1194      insn->bits3.break_cont.jip = 2;
1195   }
1196   patch_IF_ELSE(p, if_inst, else_inst, insn);
1197}
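
/* Usage sketch (illustrative; "a", "b" and the block contents are
 * hypothetical): a typical "if (a < b) ... else ..." sequence is emitted as
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, a, b);
 *    brw_IF(p, BRW_EXECUTE_8);
 *       (emit the then-block)
 *    brw_ELSE(p);
 *       (emit the else-block)
 *    brw_ENDIF(p);
 *
 * brw_ENDIF() pops the IF (and optional ELSE) off p->if_stack and either
 * patches their jump targets or, in single-program-flow mode, rewrites them
 * as ADDs to the instruction pointer.
 */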
1198
1199struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1200{
1201   struct intel_context *intel = &p->brw->intel;
1202   struct brw_instruction *insn;
1203
1204   insn = next_insn(p, BRW_OPCODE_BREAK);
1205   if (intel->gen >= 6) {
1206      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1207      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1208      brw_set_src1(p, insn, brw_imm_d(0x0));
1209   } else {
1210      brw_set_dest(p, insn, brw_ip_reg());
1211      brw_set_src0(p, insn, brw_ip_reg());
1212      brw_set_src1(p, insn, brw_imm_d(0x0));
1213      insn->bits3.if_else.pad0 = 0;
1214      insn->bits3.if_else.pop_count = pop_count;
1215   }
1216   insn->header.compression_control = BRW_COMPRESSION_NONE;
1217   insn->header.execution_size = BRW_EXECUTE_8;
1218
1219   return insn;
1220}
1221
1222struct brw_instruction *gen6_CONT(struct brw_compile *p,
1223				  struct brw_instruction *do_insn)
1224{
1225   struct brw_instruction *insn;
1226
1227   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1228   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1229   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1230   brw_set_dest(p, insn, brw_ip_reg());
1231   brw_set_src0(p, insn, brw_ip_reg());
1232   brw_set_src1(p, insn, brw_imm_d(0x0));
1233
1234   insn->header.compression_control = BRW_COMPRESSION_NONE;
1235   insn->header.execution_size = BRW_EXECUTE_8;
1236   return insn;
1237}
1238
1239struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1240{
1241   struct brw_instruction *insn;
1242   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1243   brw_set_dest(p, insn, brw_ip_reg());
1244   brw_set_src0(p, insn, brw_ip_reg());
1245   brw_set_src1(p, insn, brw_imm_d(0x0));
1246   insn->header.compression_control = BRW_COMPRESSION_NONE;
1247   insn->header.execution_size = BRW_EXECUTE_8;
1248   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1249   insn->bits3.if_else.pad0 = 0;
1250   insn->bits3.if_else.pop_count = pop_count;
1251   return insn;
1252}
1253
1254/* DO/WHILE loop:
1255 *
1256 * The DO/WHILE is just an unterminated loop -- break or continue are
1257 * used for control within the loop.  We have a few ways they can be
1258 * done.
1259 *
1260 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1261 * jip and no DO instruction.
1262 *
1263 * For non-uniform control flow pre-gen6, there's a DO instruction to
1264 * push the mask, and a WHILE to jump back, and BREAK to get out and
1265 * pop the mask.
1266 *
1267 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1268 * just points back to the first instruction of the loop.
1269 */
1270struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1271{
1272   struct intel_context *intel = &p->brw->intel;
1273
1274   if (intel->gen >= 6 || p->single_program_flow) {
1275      return &p->store[p->nr_insn];
1276   } else {
1277      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1278
1279      /* Override the defaults for this instruction:
1280       */
1281      brw_set_dest(p, insn, brw_null_reg());
1282      brw_set_src0(p, insn, brw_null_reg());
1283      brw_set_src1(p, insn, brw_null_reg());
1284
1285      insn->header.compression_control = BRW_COMPRESSION_NONE;
1286      insn->header.execution_size = execute_size;
1287      insn->header.predicate_control = BRW_PREDICATE_NONE;
1288      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1289      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1290
1291      return insn;
1292   }
1293}
1294
1295
1296
1297struct brw_instruction *brw_WHILE(struct brw_compile *p,
1298                                  struct brw_instruction *do_insn)
1299{
1300   struct intel_context *intel = &p->brw->intel;
1301   struct brw_instruction *insn;
1302   GLuint br = 1;
1303
1304   if (intel->gen >= 5)
1305      br = 2;
1306
1307   if (intel->gen >= 7) {
1308      insn = next_insn(p, BRW_OPCODE_WHILE);
1309
1310      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1311      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1312      brw_set_src1(p, insn, brw_imm_ud(0));
1313      insn->bits3.break_cont.jip = br * (do_insn - insn);
1314
1315      insn->header.execution_size = BRW_EXECUTE_8;
1316   } else if (intel->gen == 6) {
1317      insn = next_insn(p, BRW_OPCODE_WHILE);
1318
1319      brw_set_dest(p, insn, brw_imm_w(0));
1320      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1321      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1322      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1323
1324      insn->header.execution_size = BRW_EXECUTE_8;
1325   } else {
1326      if (p->single_program_flow) {
1327	 insn = next_insn(p, BRW_OPCODE_ADD);
1328
1329	 brw_set_dest(p, insn, brw_ip_reg());
1330	 brw_set_src0(p, insn, brw_ip_reg());
1331	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1332	 insn->header.execution_size = BRW_EXECUTE_1;
1333      } else {
1334	 insn = next_insn(p, BRW_OPCODE_WHILE);
1335
1336	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1337
1338	 brw_set_dest(p, insn, brw_ip_reg());
1339	 brw_set_src0(p, insn, brw_ip_reg());
1340	 brw_set_src1(p, insn, brw_imm_d(0));
1341
1342	 insn->header.execution_size = do_insn->header.execution_size;
1343	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1344	 insn->bits3.if_else.pop_count = 0;
1345	 insn->bits3.if_else.pad0 = 0;
1346      }
1347   }
1348   insn->header.compression_control = BRW_COMPRESSION_NONE;
1349   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1350
1351   return insn;
1352}
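
/* Usage sketch (illustrative): a non-uniform loop is emitted as
 *
 *    struct brw_instruction *do_insn = brw_DO(p, BRW_EXECUTE_8);
 *       (loop body, using brw_BREAK()/brw_CONT() for loop control)
 *    brw_WHILE(p, do_insn);
 *
 * On gen6+ (and in single-program-flow mode) brw_DO() emits nothing and
 * just returns the address of the next instruction slot, which brw_WHILE()
 * then uses to compute the backward jump distance.
 */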
1353
1354
1355/* FORWARD JUMPS:
1356 */
1357void brw_land_fwd_jump(struct brw_compile *p,
1358		       struct brw_instruction *jmp_insn)
1359{
1360   struct intel_context *intel = &p->brw->intel;
1361   struct brw_instruction *landing = &p->store[p->nr_insn];
1362   GLuint jmpi = 1;
1363
1364   if (intel->gen >= 5)
1365      jmpi = 2;
1366
1367   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1368   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1369
1370   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1371}
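
/* Usage sketch (illustrative; "x" is a hypothetical register): skipping a
 * block of instructions when a condition holds looks like
 *
 *    struct brw_instruction *jmp;
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_Z, x, brw_imm_f(0));
 *    jmp = brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *       (instructions to skip)
 *    brw_land_fwd_jump(p, jmp);
 *
 * The JMPI picks up the predicate the CMP just enabled, and
 * brw_land_fwd_jump() back-patches its immediate with the distance to the
 * current end of the program.
 */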
1372
1373
1374
1375/* To integrate with the above, it makes sense that the comparison
1376 * instruction should populate the flag register.  It might be simpler
1377 * just to use the flag reg for most WM tasks?
1378 */
1379void brw_CMP(struct brw_compile *p,
1380	     struct brw_reg dest,
1381	     GLuint conditional,
1382	     struct brw_reg src0,
1383	     struct brw_reg src1)
1384{
1385   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1386
1387   insn->header.destreg__conditionalmod = conditional;
1388   brw_set_dest(p, insn, dest);
1389   brw_set_src0(p, insn, src0);
1390   brw_set_src1(p, insn, src1);
1391
1392/*    guess_execution_size(insn, src0); */
1393
1394
1395   /* Make it so that future instructions will use the computed flag
1396    * value until brw_set_predicate_control_flag_value() is called
1397    * again.
1398    */
1399   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1400       dest.nr == 0) {
1401      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1402      p->flag_value = 0xff;
1403   }
1404}
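
/* Usage sketch (illustrative; "dst", "a", "b" are hypothetical): one
 * pre-gen6 way to compute min(a, b) relies on exactly this side effect:
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, a, b);
 *    brw_SEL(p, dst, a, b);
 *    brw_set_predicate_control_flag_value(p, 0xff);
 *
 * The SEL picks "a" in channels where a < b and "b" elsewhere, because the
 * CMP left BRW_PREDICATE_NORMAL enabled; the final call stops subsequent
 * instructions from being predicated.
 */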
1405
1406/* Issue a 'wait' instruction on notification register n1; the host can
1407   program MMIO to wake the thread back up. */
1408void brw_WAIT (struct brw_compile *p)
1409{
1410   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1411   struct brw_reg src = brw_notification_1_reg();
1412
1413   brw_set_dest(p, insn, src);
1414   brw_set_src0(p, insn, src);
1415   brw_set_src1(p, insn, brw_null_reg());
1416   insn->header.execution_size = 0; /* must be BRW_EXECUTE_1 */
1417   insn->header.predicate_control = 0;
1418   insn->header.compression_control = 0;
1419}
1420
1421
1422/***********************************************************************
1423 * Helpers for the various SEND message types:
1424 */
1425
1426/** Extended math function, float[8].
1427 */
1428void brw_math( struct brw_compile *p,
1429	       struct brw_reg dest,
1430	       GLuint function,
1431	       GLuint saturate,
1432	       GLuint msg_reg_nr,
1433	       struct brw_reg src,
1434	       GLuint data_type,
1435	       GLuint precision )
1436{
1437   struct intel_context *intel = &p->brw->intel;
1438
1439   if (intel->gen >= 6) {
1440      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1441
1442      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1443      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1444
1445      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1446      if (intel->gen == 6)
1447	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1448
1449      /* Source modifiers are ignored for extended math instructions on Gen6. */
1450      if (intel->gen == 6) {
1451	 assert(!src.negate);
1452	 assert(!src.abs);
1453      }
1454
1455      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1456	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1457	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1458	 assert(src.type != BRW_REGISTER_TYPE_F);
1459      } else {
1460	 assert(src.type == BRW_REGISTER_TYPE_F);
1461      }
1462
1463      /* Math is the same ISA format as other opcodes, except that CondModifier
1464       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1465       */
1466      insn->header.destreg__conditionalmod = function;
1467      insn->header.saturate = saturate;
1468
1469      brw_set_dest(p, insn, dest);
1470      brw_set_src0(p, insn, src);
1471      brw_set_src1(p, insn, brw_null_reg());
1472   } else {
1473      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1474
1475      /* Example code doesn't set predicate_control for send
1476       * instructions.
1477       */
1478      insn->header.predicate_control = 0;
1479      insn->header.destreg__conditionalmod = msg_reg_nr;
1480
1481      brw_set_dest(p, insn, dest);
1482      brw_set_src0(p, insn, src);
1483      brw_set_math_message(p,
1484			   insn,
1485			   function,
1486			   src.type == BRW_REGISTER_TYPE_D,
1487			   precision,
1488			   saturate,
1489			   data_type);
1490   }
1491}
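
/* Usage sketch (illustrative; "dst" and "src" are hypothetical float GRFs):
 * a gen6+ reciprocal is a single ALU-style math instruction,
 *
 *    brw_math(p, dst, BRW_MATH_FUNCTION_INV, BRW_MATH_SATURATE_NONE,
 *             0, src, BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
 *
 * where msg_reg_nr (the 0 here) and data_type only matter on the pre-gen6
 * SEND path.
 */
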
1492
1493/** Extended math function, float[8].
1494 */
1495void brw_math2(struct brw_compile *p,
1496	       struct brw_reg dest,
1497	       GLuint function,
1498	       struct brw_reg src0,
1499	       struct brw_reg src1)
1500{
1501   struct intel_context *intel = &p->brw->intel;
1502   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1503
1504   assert(intel->gen >= 6);
1505   (void) intel;
1506
1507
1508   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1509   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1510   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1511
1512   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1513   if (intel->gen == 6) {
1514      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1515      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1516   }
1517
1518   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1519       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1520       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1521      assert(src0.type != BRW_REGISTER_TYPE_F);
1522      assert(src1.type != BRW_REGISTER_TYPE_F);
1523   } else {
1524      assert(src0.type == BRW_REGISTER_TYPE_F);
1525      assert(src1.type == BRW_REGISTER_TYPE_F);
1526   }
1527
1528   /* Source modifiers are ignored for extended math instructions on Gen6. */
1529   if (intel->gen == 6) {
1530      assert(!src0.negate);
1531      assert(!src0.abs);
1532      assert(!src1.negate);
1533      assert(!src1.abs);
1534   }
1535
1536   /* Math is the same ISA format as other opcodes, except that CondModifier
1537    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1538    */
1539   insn->header.destreg__conditionalmod = function;
1540
1541   brw_set_dest(p, insn, dest);
1542   brw_set_src0(p, insn, src0);
1543   brw_set_src1(p, insn, src1);
1544}
1545
1546/**
1547 * Extended math function, float[16].
1548 * Uses 2 SEND instructions pre-gen6; gen6+ uses a single MATH instruction.
1549 */
1550void brw_math_16( struct brw_compile *p,
1551		  struct brw_reg dest,
1552		  GLuint function,
1553		  GLuint saturate,
1554		  GLuint msg_reg_nr,
1555		  struct brw_reg src,
1556		  GLuint precision )
1557{
1558   struct intel_context *intel = &p->brw->intel;
1559   struct brw_instruction *insn;
1560
1561   if (intel->gen >= 6) {
1562      insn = next_insn(p, BRW_OPCODE_MATH);
1563
1564      /* Math is the same ISA format as other opcodes, except that CondModifier
1565       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1566       */
1567      insn->header.destreg__conditionalmod = function;
1568      insn->header.saturate = saturate;
1569
1570      /* Source modifiers are ignored for extended math instructions. */
1571      assert(!src.negate);
1572      assert(!src.abs);
1573
1574      brw_set_dest(p, insn, dest);
1575      brw_set_src0(p, insn, src);
1576      brw_set_src1(p, insn, brw_null_reg());
1577      return;
1578   }
1579
1580   /* First instruction:
1581    */
1582   brw_push_insn_state(p);
1583   brw_set_predicate_control_flag_value(p, 0xff);
1584   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1585
1586   insn = next_insn(p, BRW_OPCODE_SEND);
1587   insn->header.destreg__conditionalmod = msg_reg_nr;
1588
1589   brw_set_dest(p, insn, dest);
1590   brw_set_src0(p, insn, src);
1591   brw_set_math_message(p,
1592			insn,
1593			function,
1594			BRW_MATH_INTEGER_UNSIGNED,
1595			precision,
1596			saturate,
1597			BRW_MATH_DATA_VECTOR);
1598
1599   /* Second instruction:
1600    */
1601   insn = next_insn(p, BRW_OPCODE_SEND);
1602   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1603   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1604
1605   brw_set_dest(p, insn, offset(dest,1));
1606   brw_set_src0(p, insn, src);
1607   brw_set_math_message(p,
1608			insn,
1609			function,
1610			BRW_MATH_INTEGER_UNSIGNED,
1611			precision,
1612			saturate,
1613			BRW_MATH_DATA_VECTOR);
1614
1615   brw_pop_insn_state(p);
1616}
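/* Note on the pre-Gen6 path above: the math function is issued as two SEND
 * messages, one per SIMD8 half.  The second is marked
 * BRW_COMPRESSION_2NDHALF and uses the next message register
 * (msg_reg_nr + 1) and the next destination register (offset(dest, 1)).
 */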
1617
1618
1619/**
1620 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1621 * using a single constant offset placed in the message header.
1622 *
1623 * The offset must be aligned to oword size (16 bytes).  Used for
1624 * register spilling.
1625 */
1626void brw_oword_block_write_scratch(struct brw_compile *p,
1627				   struct brw_reg mrf,
1628				   int num_regs,
1629				   GLuint offset)
1630{
1631   struct intel_context *intel = &p->brw->intel;
1632   uint32_t msg_control, msg_type;
1633   int mlen;
1634
1635   if (intel->gen >= 6)
1636      offset /= 16;
1637
1638   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1639
1640   if (num_regs == 1) {
1641      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1642      mlen = 2;
1643   } else {
1644      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1645      mlen = 3;
1646   }
1647
1648   /* Set up the message header.  This is g0, with g0.2 filled with
1649    * the offset.  We don't want to leave our offset around in g0 or
1650    * it'll screw up texture samples, so set it up inside the message
1651    * reg.
1652    */
1653   {
1654      brw_push_insn_state(p);
1655      brw_set_mask_control(p, BRW_MASK_DISABLE);
1656      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1657
1658      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1659
1660      /* set message header global offset field (reg 0, element 2) */
1661      brw_MOV(p,
1662	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1663				  mrf.nr,
1664				  2), BRW_REGISTER_TYPE_UD),
1665	      brw_imm_ud(offset));
1666
1667      brw_pop_insn_state(p);
1668   }
1669
1670   {
1671      struct brw_reg dest;
1672      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1673      int send_commit_msg;
1674      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1675					 BRW_REGISTER_TYPE_UW);
1676
1677      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1678	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1679	 src_header = vec16(src_header);
1680      }
1681      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1682      insn->header.destreg__conditionalmod = mrf.nr;
1683
1684      /* Prior to Gen6, writes followed by reads from the same location
1685       * are not guaranteed to be ordered unless write_commit is set.
1686       * If set, then a no-op write is issued to the destination
1687       * register to set a dependency, and a read from the destination
1688       * can be used to ensure the ordering.
1689       *
1690       * On Gen6 and later, only writes between different threads need ordering
1691       * protection.  Our use of DP writes is all about register
1692       * spilling within a thread.
1693       */
1694      if (intel->gen >= 6) {
1695	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1696	 send_commit_msg = 0;
1697      } else {
1698	 dest = src_header;
1699	 send_commit_msg = 1;
1700      }
1701
1702      brw_set_dest(p, insn, dest);
1703      if (intel->gen >= 6) {
1704	 brw_set_src0(p, insn, mrf);
1705      } else {
1706	 brw_set_src0(p, insn, brw_null_reg());
1707      }
1708
1709      if (intel->gen >= 6)
1710	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1711      else
1712	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1713
1714      brw_set_dp_write_message(p,
1715			       insn,
1716			       255, /* binding table index (255=stateless) */
1717			       msg_control,
1718			       msg_type,
1719			       mlen,
1720			       true, /* header_present */
1721			       0, /* not a render target */
1722			       send_commit_msg, /* response_length */
1723			       0, /* eot */
1724			       send_commit_msg);
1725   }
1726}
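/* Illustrative call (an assumption, not taken from a caller in this file):
 * spilling two GRFs to byte offset 64 of the scratch buffer could be
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 64);
 *
 * The offset must be a multiple of 16 bytes; on Gen6+ it is converted to
 * OWord units above before being placed in the message header.
 */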
1727
1728
1729/**
1730 * Read a block of owords (half a GRF each) from the scratch buffer
1731 * using a single constant offset placed in the message header.
1732 *
1733 * Offset must be aligned to oword size (16 bytes).  Used for register
1734 * spilling.
1735 */
1736void
1737brw_oword_block_read_scratch(struct brw_compile *p,
1738			     struct brw_reg dest,
1739			     struct brw_reg mrf,
1740			     int num_regs,
1741			     GLuint offset)
1742{
1743   struct intel_context *intel = &p->brw->intel;
1744   uint32_t msg_control;
1745   int rlen;
1746
1747   if (intel->gen >= 6)
1748      offset /= 16;
1749
1750   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1751   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1752
1753   if (num_regs == 1) {
1754      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1755      rlen = 1;
1756   } else {
1757      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1758      rlen = 2;
1759   }
1760
1761   {
1762      brw_push_insn_state(p);
1763      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1764      brw_set_mask_control(p, BRW_MASK_DISABLE);
1765
1766      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1767
1768      /* set message header global offset field (reg 0, element 2) */
1769      brw_MOV(p,
1770	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1771				  mrf.nr,
1772				  2), BRW_REGISTER_TYPE_UD),
1773	      brw_imm_ud(offset));
1774
1775      brw_pop_insn_state(p);
1776   }
1777
1778   {
1779      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1780
1781      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1782      insn->header.compression_control = BRW_COMPRESSION_NONE;
1783      insn->header.destreg__conditionalmod = mrf.nr;
1784
1785      brw_set_dest(p, insn, dest);	/* dest was retyped to UW above */
1786      if (intel->gen >= 6) {
1787	 brw_set_src0(p, insn, mrf);
1788      } else {
1789	 brw_set_src0(p, insn, brw_null_reg());
1790      }
1791
1792      brw_set_dp_read_message(p,
1793			      insn,
1794			      255, /* binding table index (255=stateless) */
1795			      msg_control,
1796			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1797			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1798			      1, /* msg_length */
1799			      rlen);
1800   }
1801}
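/* Illustrative call (an assumption): filling the same two GRFs back from
 * scratch byte offset 64 could be
 *
 *    brw_oword_block_read_scratch(p, dest, brw_message_reg(1), 2, 64);
 *
 * which requests rlen = 2 registers of response data into 'dest'.
 */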
1802
1803/**
1804 * Read a float[4] vector from the data port Data Cache (const buffer).
1805 * Location (in buffer) should be a multiple of 16.
1806 * Used for fetching shader constants.
1807 */
1808void brw_oword_block_read(struct brw_compile *p,
1809			  struct brw_reg dest,
1810			  struct brw_reg mrf,
1811			  uint32_t offset,
1812			  uint32_t bind_table_index)
1813{
1814   struct intel_context *intel = &p->brw->intel;
1815
1816   /* On Gen6+, the offset is expressed in units of OWords (16 bytes). */
1817   if (intel->gen >= 6)
1818      offset /= 16;
1819
1820   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1821
1822   brw_push_insn_state(p);
1823   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1824   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1825   brw_set_mask_control(p, BRW_MASK_DISABLE);
1826
1827   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1828
1829   /* set message header global offset field (reg 0, element 2) */
1830   brw_MOV(p,
1831	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1832			       mrf.nr,
1833			       2), BRW_REGISTER_TYPE_UD),
1834	   brw_imm_ud(offset));
1835
1836   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1837   insn->header.destreg__conditionalmod = mrf.nr;
1838
1839   /* cast dest to a uword[8] vector */
1840   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1841
1842   brw_set_dest(p, insn, dest);
1843   if (intel->gen >= 6) {
1844      brw_set_src0(p, insn, mrf);
1845   } else {
1846      brw_set_src0(p, insn, brw_null_reg());
1847   }
1848
1849   brw_set_dp_read_message(p,
1850			   insn,
1851			   bind_table_index,
1852			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1853			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1854			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1855			   1, /* msg_length */
1856			   1); /* response_length (1 reg, 2 owords!) */
1857
1858   brw_pop_insn_state(p);
1859}
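/* Illustrative call (an assumption; 'surf' is a hypothetical binding table
 * index): fetching one float[4] constant at byte offset 32 could be
 *
 *    brw_oword_block_read(p, dest, brw_message_reg(1), 32, surf);
 */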
1860
1861/**
1862 * Read a set of dwords from the data port Data Cache (const buffer).
1863 *
1864 * Location (in buffer) appears as UD offsets in the register after
1865 * the provided mrf header reg.
1866 */
1867void brw_dword_scattered_read(struct brw_compile *p,
1868			      struct brw_reg dest,
1869			      struct brw_reg mrf,
1870			      uint32_t bind_table_index)
1871{
1872   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1873
1874   brw_push_insn_state(p);
1875   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1876   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1877   brw_set_mask_control(p, BRW_MASK_DISABLE);
1878   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1879   brw_pop_insn_state(p);
1880
1881   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1882   insn->header.destreg__conditionalmod = mrf.nr;
1883
1884   /* cast dest to a uword[8] vector */
1885   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1886
1887   brw_set_dest(p, insn, dest);
1888   brw_set_src0(p, insn, brw_null_reg());
1889
1890   brw_set_dp_read_message(p,
1891			   insn,
1892			   bind_table_index,
1893			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1894			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1895			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1896			   2, /* msg_length */
1897			   1); /* response_length */
1898}
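/* The per-channel DWord offsets are expected in the message register that
 * follows 'mrf', which is why msg_length is 2 (one header register plus one
 * register of offsets); the caller is assumed to have written them already.
 */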
1899
1900
1901
1902/**
1903 * Read float[4] constant(s) from VS constant buffer.
1904 * For relative addressing, two float[4] constants will be read into 'dest'.
1905 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1906 */
1907void brw_dp_READ_4_vs(struct brw_compile *p,
1908                      struct brw_reg dest,
1909                      GLuint location,
1910                      GLuint bind_table_index)
1911{
1912   struct intel_context *intel = &p->brw->intel;
1913   struct brw_instruction *insn;
1914   GLuint msg_reg_nr = 1;
1915
1916   if (intel->gen >= 6)
1917      location /= 16;
1918
1919   /* Set up MRF[1] with the location/offset into the const buffer */
1920   brw_push_insn_state(p);
1921   brw_set_access_mode(p, BRW_ALIGN_1);
1922   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1923   brw_set_mask_control(p, BRW_MASK_DISABLE);
1924   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1925   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1926		     BRW_REGISTER_TYPE_UD),
1927	   brw_imm_ud(location));
1928   brw_pop_insn_state(p);
1929
1930   insn = next_insn(p, BRW_OPCODE_SEND);
1931
1932   insn->header.predicate_control = BRW_PREDICATE_NONE;
1933   insn->header.compression_control = BRW_COMPRESSION_NONE;
1934   insn->header.destreg__conditionalmod = msg_reg_nr;
1935   insn->header.mask_control = BRW_MASK_DISABLE;
1936
1937   brw_set_dest(p, insn, dest);
1938   if (intel->gen >= 6) {
1939      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
1940   } else {
1941      brw_set_src0(p, insn, brw_null_reg());
1942   }
1943
1944   brw_set_dp_read_message(p,
1945			   insn,
1946			   bind_table_index,
1947			   0,
1948			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1949			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1950			   1, /* msg_length */
1951			   1); /* response_length (1 Oword) */
1952}
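/* Illustrative call (an assumption; 'loc' and 'surf' are hypothetical):
 * reading the float[4] constant at byte location 'loc' from the VS constant
 * buffer bound at binding table index 'surf':
 *
 *    brw_dp_READ_4_vs(p, dest, loc, surf);
 *
 * 'loc' is presumably a multiple of 16; on Gen6+ it is converted to OWord
 * units above.
 */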
1953
1954/**
1955 * Read a float[4] constant per vertex from VS constant buffer, with
1956 * relative addressing.
1957 */
1958void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1959			       struct brw_reg dest,
1960			       struct brw_reg addr_reg,
1961			       GLuint offset,
1962			       GLuint bind_table_index)
1963{
1964   struct intel_context *intel = &p->brw->intel;
1965   struct brw_reg src = brw_vec8_grf(0, 0);
1966   int msg_type;
1967
1968   /* Set up MRF[1] with the offset into the const buffer */
1969   brw_push_insn_state(p);
1970   brw_set_access_mode(p, BRW_ALIGN_1);
1971   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1972   brw_set_mask_control(p, BRW_MASK_DISABLE);
1973   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1974
1975   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1976    * fields ignored.
1977    */
1978   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
1979	   addr_reg, brw_imm_d(offset));
1980   brw_pop_insn_state(p);
1981
1982   gen6_resolve_implied_move(p, &src, 0);
1983   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1984
1985   insn->header.predicate_control = BRW_PREDICATE_NONE;
1986   insn->header.compression_control = BRW_COMPRESSION_NONE;
1987   insn->header.destreg__conditionalmod = 0;
1988   insn->header.mask_control = BRW_MASK_DISABLE;
1989
1990   brw_set_dest(p, insn, dest);
1991   brw_set_src0(p, insn, src);
1992
1993   if (intel->gen >= 6)
1994      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1995   else if (intel->gen == 5 || intel->is_g4x)
1996      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1997   else
1998      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1999
2000   brw_set_dp_read_message(p,
2001			   insn,
2002			   bind_table_index,
2003			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
2004			   msg_type,
2005			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2006			   2, /* msg_length */
2007			   1); /* response_length */
2008}
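/* Illustrative call (an assumption; 'off' and 'surf' are hypothetical):
 * with a per-vertex address already in addr_reg, a relative constant read
 * could be emitted as
 *
 *    brw_dp_READ_4_vs_relative(p, dest, addr_reg, off, surf);
 */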
2009
2010
2011
2012void brw_fb_WRITE(struct brw_compile *p,
2013		  int dispatch_width,
2014                  GLuint msg_reg_nr,
2015                  struct brw_reg src0,
2016                  GLuint binding_table_index,
2017                  GLuint msg_length,
2018                  GLuint response_length,
2019                  bool eot,
2020                  bool header_present)
2021{
2022   struct intel_context *intel = &p->brw->intel;
2023   struct brw_instruction *insn;
2024   GLuint msg_control, msg_type;
2025   struct brw_reg dest;
2026
2027   if (dispatch_width == 16)
2028      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2029   else
2030      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2031
2032   if (intel->gen >= 6 && binding_table_index == 0) {
2033      insn = next_insn(p, BRW_OPCODE_SENDC);
2034   } else {
2035      insn = next_insn(p, BRW_OPCODE_SEND);
2036   }
2037   /* The execution mask is ignored for render target writes. */
2038   insn->header.predicate_control = 0;
2039   insn->header.compression_control = BRW_COMPRESSION_NONE;
2040
2041   if (intel->gen >= 6) {
2042      /* headerless version, just submit color payload */
2043      src0 = brw_message_reg(msg_reg_nr);
2044
2045      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2046   } else {
2047      insn->header.destreg__conditionalmod = msg_reg_nr;
2048
2049      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2050   }
2051
2052   if (dispatch_width == 16)
2053      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2054   else
2055      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2056
2057   brw_set_dest(p, insn, dest);
2058   brw_set_src0(p, insn, src0);
2059   brw_set_dp_write_message(p,
2060			    insn,
2061			    binding_table_index,
2062			    msg_control,
2063			    msg_type,
2064			    msg_length,
2065			    header_present,
2066			    1, /* last render target write */
2067			    response_length,
2068			    eot,
2069			    0 /* send_commit_msg */);
2070}
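/* Illustrative call (an assumption, not a call taken from this driver):
 * a headerless SIMD16 color write to render target 0, with an 8-register
 * payload starting at m2 and terminating the thread, might look like
 *
 *    brw_fb_WRITE(p, 16, 2, brw_message_reg(2), 0, 8, 0, true, false);
 */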
2071
2072
2073/**
2074 * Texture sample instruction.
2075 * Note: the msg_type plus msg_length values determine exactly what kind
2076 * of sampling operation is performed.  See volume 4, page 161 of docs.
2077 */
2078void brw_SAMPLE(struct brw_compile *p,
2079		struct brw_reg dest,
2080		GLuint msg_reg_nr,
2081		struct brw_reg src0,
2082		GLuint binding_table_index,
2083		GLuint sampler,
2084		GLuint writemask,
2085		GLuint msg_type,
2086		GLuint response_length,
2087		GLuint msg_length,
2088		GLuint header_present,
2089		GLuint simd_mode)
2090{
2091   struct intel_context *intel = &p->brw->intel;
2092   bool need_stall = false;
2093
2094   if (writemask == 0) {
2095      /*printf("%s: zero writemask??\n", __FUNCTION__); */
2096      return;
2097   }
2098
2099   /* Hardware doesn't do destination dependency checking on send
2100    * instructions properly.  Add a workaround which generates the
2101    * dependency by other means.  In practice it seems like this bug
2102    * only crops up for texture samples, and only where registers are
2103    * written by the send and then written again later without being
2104    * read in between.  Luckily for us, we already track that
2105    * information and use it to modify the writemask for the
2106    * instruction, so that is a guide for whether a workaround is
2107    * needed.
2108    */
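   /* Two strategies are used below: if the enabled channels form a single
    * contiguous run, the unused channels are masked off in the message
    * header and the destination offset / response length are adjusted to
    * match; otherwise need_stall is set and a dummy MOV from the response
    * is emitted after the SEND to create the missing dependency.
    */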
2109   if (writemask != WRITEMASK_XYZW) {
2110      GLuint dst_offset = 0;
2111      GLuint i, newmask = 0, len = 0;
2112
2113      for (i = 0; i < 4; i++) {
2114	 if (writemask & (1<<i))
2115	    break;
2116	 dst_offset += 2;
2117      }
2118      for (; i < 4; i++) {
2119	 if (!(writemask & (1<<i)))
2120	    break;
2121	 newmask |= 1<<i;
2122	 len++;
2123      }
2124
2125      if (newmask != writemask) {
2126	 need_stall = true;
2127         /* printf("need stall %x %x\n", newmask , writemask); */
2128      }
2129      else {
2130	 bool dispatch_16 = false;
2131
2132	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2133
2134	 guess_execution_size(p, p->current, dest);
2135	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2136	    dispatch_16 = true;
2137
2138	 newmask = ~newmask & WRITEMASK_XYZW;
2139
2140	 brw_push_insn_state(p);
2141
2142	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2143	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2144
2145	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2146		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2147  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2148
2149	 brw_pop_insn_state(p);
2150
2151  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2152	 dest = offset(dest, dst_offset);
2153
2154	 /* For 16-wide dispatch, masked channels are skipped in the
2155	  * response.  For 8-wide, masked channels still take up slots,
2156	  * and are just not written to.
2157	  */
2158	 if (dispatch_16)
2159	    response_length = len * 2;
2160      }
2161   }
2162
2163   {
2164      struct brw_instruction *insn;
2165
2166      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2167
2168      insn = next_insn(p, BRW_OPCODE_SEND);
2169      insn->header.predicate_control = 0; /* XXX */
2170      insn->header.compression_control = BRW_COMPRESSION_NONE;
2171      if (intel->gen < 6)
2172	  insn->header.destreg__conditionalmod = msg_reg_nr;
2173
2174      brw_set_dest(p, insn, dest);
2175      brw_set_src0(p, insn, src0);
2176      brw_set_sampler_message(p, insn,
2177			      binding_table_index,
2178			      sampler,
2179			      msg_type,
2180			      response_length,
2181			      msg_length,
2182			      header_present,
2183			      simd_mode);
2184   }
2185
2186   if (need_stall) {
2187      struct brw_reg reg = vec8(offset(dest, response_length-1));
2188
2189      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2190       */
2191      brw_push_insn_state(p);
2192      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2193      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2194	      retype(reg, BRW_REGISTER_TYPE_UD));
2195      brw_pop_insn_state(p);
2196   }
2197
2198}
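/* The trailing MOV emitted when need_stall is set reads and rewrites the
 * last register of the sampler response; this forces a dependency on the
 * SEND result before any later write to that register, working around the
 * missing destination dependency check described above.
 */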
2199
2200/* All these variables are pretty confusing - we might be better off
2201 * using bitmasks and macros for this, in the old style.  Or perhaps
2202 * just having the caller instantiate the fields in dword3 itself.
2203 */
2204void brw_urb_WRITE(struct brw_compile *p,
2205		   struct brw_reg dest,
2206		   GLuint msg_reg_nr,
2207		   struct brw_reg src0,
2208		   bool allocate,
2209		   bool used,
2210		   GLuint msg_length,
2211		   GLuint response_length,
2212		   bool eot,
2213		   bool writes_complete,
2214		   GLuint offset,
2215		   GLuint swizzle)
2216{
2217   struct intel_context *intel = &p->brw->intel;
2218   struct brw_instruction *insn;
2219
2220   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2221
2222   if (intel->gen == 7) {
2223      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2224      brw_push_insn_state(p);
2225      brw_set_access_mode(p, BRW_ALIGN_1);
2226      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2227		       BRW_REGISTER_TYPE_UD),
2228	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2229		brw_imm_ud(0xff00));
2230      brw_pop_insn_state(p);
2231   }
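   /* The OR above copies DWord 5 of g0 into the message header with bits
    * 15:8 forced on; those bits presumably carry the per-channel write
    * enables for the URB_WRITE_HWORD message.
    */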
2232
2233   insn = next_insn(p, BRW_OPCODE_SEND);
2234
2235   assert(msg_length < BRW_MAX_MRF);
2236
2237   brw_set_dest(p, insn, dest);
2238   brw_set_src0(p, insn, src0);
2239   brw_set_src1(p, insn, brw_imm_d(0));
2240
2241   if (intel->gen < 6)
2242      insn->header.destreg__conditionalmod = msg_reg_nr;
2243
2244   brw_set_urb_message(p,
2245		       insn,
2246		       allocate,
2247		       used,
2248		       msg_length,
2249		       response_length,
2250		       eot,
2251		       writes_complete,
2252		       offset,
2253		       swizzle);
2254}
2255
2256static int
2257brw_find_next_block_end(struct brw_compile *p, int start)
2258{
2259   int ip;
2260
2261   for (ip = start + 1; ip < p->nr_insn; ip++) {
2262      struct brw_instruction *insn = &p->store[ip];
2263
2264      switch (insn->header.opcode) {
2265      case BRW_OPCODE_ENDIF:
2266      case BRW_OPCODE_ELSE:
2267      case BRW_OPCODE_WHILE:
2268	 return ip;
2269      }
2270   }
2271   assert(!"not reached");
2272   return start + 1;
2273}
2274
2275/* There is no DO instruction on gen6, so to find the end of the loop
2276 * we have to see if the loop is jumping back before our start
2277 * instruction.
2278 */
2279static int
2280brw_find_loop_end(struct brw_compile *p, int start)
2281{
2282   struct intel_context *intel = &p->brw->intel;
2283   int ip;
2284   int br = 2;
2285
2286   for (ip = start + 1; ip < p->nr_insn; ip++) {
2287      struct brw_instruction *insn = &p->store[ip];
2288
2289      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2290	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
2291				   : insn->bits3.break_cont.jip;
2292	 if (ip + jip / br <= start)
2293	    return ip;
2294      }
2295   }
2296   assert(!"not reached");
2297   return start + 1;
2298}
2299
2300/* After program generation, go back and update the UIP and JIP of
2301 * BREAK and CONT instructions to their correct locations.
2302 */
2303void
2304brw_set_uip_jip(struct brw_compile *p)
2305{
2306   struct intel_context *intel = &p->brw->intel;
2307   int ip;
2308   int br = 2;
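   /* On Gen6+, JIP/UIP are encoded in units of 64 bits (two per 128-bit
    * instruction), hence the factor br = 2 applied to the instruction-count
    * deltas below.
    */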
2309
2310   if (intel->gen < 6)
2311      return;
2312
2313   for (ip = 0; ip < p->nr_insn; ip++) {
2314      struct brw_instruction *insn = &p->store[ip];
2315
2316      switch (insn->header.opcode) {
2317      case BRW_OPCODE_BREAK:
2318	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2319	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2320	 insn->bits3.break_cont.uip =
2321	    br * (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 1 : 0));
2322	 break;
2323      case BRW_OPCODE_CONTINUE:
2324	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2325	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip);
2326
2327	 assert(insn->bits3.break_cont.uip != 0);
2328	 assert(insn->bits3.break_cont.jip != 0);
2329	 break;
2330      }
2331   }
2332}
2333
2334void brw_ff_sync(struct brw_compile *p,
2335		   struct brw_reg dest,
2336		   GLuint msg_reg_nr,
2337		   struct brw_reg src0,
2338		   bool allocate,
2339		   GLuint response_length,
2340		   bool eot)
2341{
2342   struct intel_context *intel = &p->brw->intel;
2343   struct brw_instruction *insn;
2344
2345   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2346
2347   insn = next_insn(p, BRW_OPCODE_SEND);
2348   brw_set_dest(p, insn, dest);
2349   brw_set_src0(p, insn, src0);
2350   brw_set_src1(p, insn, brw_imm_d(0));
2351
2352   if (intel->gen < 6)
2353      insn->header.destreg__conditionalmod = msg_reg_nr;
2354
2355   brw_set_ff_sync_message(p,
2356			   insn,
2357			   allocate,
2358			   response_length,
2359			   eot);
2360}
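/* brw_ff_sync() emits a single SEND whose descriptor is built by
 * brw_set_ff_sync_message(); 'allocate' presumably requests URB handle
 * allocation and 'eot' marks the message as end-of-thread.
 */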
2361