brw_eu_emit.c revision 220e208329e923faf50524c0adf72e4dcc931e49
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
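/* Pick the execution size for an instruction from the width of its
 * destination register, widening to SIMD16 when the compiler is currently
 * emitting compressed (16-wide) code.
 */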
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  unsigned msg_reg_nr)
65{
66   struct brw_context *brw = p->brw;
67   if (brw->gen < 6)
68      return;
69
70   if (src->file == BRW_MESSAGE_REGISTER_FILE)
71      return;
72
73   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74      brw_push_insn_state(p);
75      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
76      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
77      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78	      retype(*src, BRW_REGISTER_TYPE_UD));
79      brw_pop_insn_state(p);
80   }
81   *src = brw_message_reg(msg_reg_nr);
82}
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88    * "The send with EOT should use register space R112-R127 for <src>. This is
89    *  to enable loading of a new thread into the same slot while the message
90    *  with EOT for current thread is pending dispatch."
91    *
92    * Since we're pretending to have 16 MRFs anyway, we may as well use the
93    * registers required for messages with EOT.
94    */
95   struct brw_context *brw = p->brw;
96   if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97      reg->file = BRW_GENERAL_REGISTER_FILE;
98      reg->nr += GEN7_MRF_HACK_START;
99   }
100}
101
102/**
103 * Convert a brw_reg_type enumeration value into the hardware representation.
104 *
105 * The hardware encoding may depend on whether the value is an immediate.
106 */
107unsigned
108brw_reg_type_to_hw_type(const struct brw_context *brw,
109                        enum brw_reg_type type, unsigned file)
110{
111   if (file == BRW_IMMEDIATE_VALUE) {
112      static const int imm_hw_types[] = {
113         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
114         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
115         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
116         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
117         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
118         [BRW_REGISTER_TYPE_UB] = -1,
119         [BRW_REGISTER_TYPE_B]  = -1,
120         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
121         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
122         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
123         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
124         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
125         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
126         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
127      };
128      assert(type < ARRAY_SIZE(imm_hw_types));
129      assert(imm_hw_types[type] != -1);
130      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
131      return imm_hw_types[type];
132   } else {
133      /* Non-immediate registers */
134      static const int hw_types[] = {
135         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
136         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
137         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
138         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
139         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
140         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
141         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
142         [BRW_REGISTER_TYPE_UV] = -1,
143         [BRW_REGISTER_TYPE_VF] = -1,
144         [BRW_REGISTER_TYPE_V]  = -1,
145         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
146         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
147         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
148         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
149      };
150      assert(type < ARRAY_SIZE(hw_types));
151      assert(hw_types[type] != -1);
152      assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
153      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
154      return hw_types[type];
155   }
156}
157
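/* Encode the destination operand of an instruction: register file, type,
 * address mode, register/subregister numbers, write mask (Align16) and
 * horizontal stride, and finally a guessed execution size.
 */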
158void
159brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
160	     struct brw_reg dest)
161{
162   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
163       dest.file != BRW_MESSAGE_REGISTER_FILE)
164      assert(dest.nr < 128);
165
166   gen7_convert_mrf_to_grf(p, &dest);
167
168   insn->bits1.da1.dest_reg_file = dest.file;
169   insn->bits1.da1.dest_reg_type =
170      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
171   insn->bits1.da1.dest_address_mode = dest.address_mode;
172
173   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
174      insn->bits1.da1.dest_reg_nr = dest.nr;
175
176      if (insn->header.access_mode == BRW_ALIGN_1) {
177	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
178	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
179	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
180	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
181      } else {
182	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
183	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
184         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
185             dest.file == BRW_MESSAGE_REGISTER_FILE) {
186            assert(dest.dw1.bits.writemask != 0);
187         }
188	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
189	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
190	  *    this to be programmed as "01".
191	  */
192	 insn->bits1.da16.dest_horiz_stride = 1;
193      }
194   } else {
195      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
196
197      /* These are different sizes in align1 vs align16:
198       */
199      if (insn->header.access_mode == BRW_ALIGN_1) {
200	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
201	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
202	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
203	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
204      } else {
205	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
206	 /* Even though this is ignored in Align16, HW still needs it programmed as '01'. */
207	 insn->bits1.ia16.dest_horiz_stride = 1;
208      }
209   }
210
211   /* Set the execution size based on dest.width and whether the
212    * compile is currently emitting compressed (SIMD16) code:
213    */
214   guess_execution_size(p, insn, dest);
215}
216
217extern int reg_type_size[];
218
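/* Sanity-check a source region against the execution size of the
 * instruction, asserting the register region restrictions from the PRM
 * ("Register Region Restrictions") that are easy to verify here.
 */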
219static void
220validate_reg(struct brw_instruction *insn, struct brw_reg reg)
221{
222   int hstride_for_reg[] = {0, 1, 2, 4};
223   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
224   int width_for_reg[] = {1, 2, 4, 8, 16};
225   int execsize_for_reg[] = {1, 2, 4, 8, 16};
226   int width, hstride, vstride, execsize;
227
228   if (reg.file == BRW_IMMEDIATE_VALUE) {
229      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
230       * mean the destination has to be 128-bit aligned and the
231       * destination horiz stride has to be a word.
232       */
233      if (reg.type == BRW_REGISTER_TYPE_V) {
234	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
235		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
236      }
237
238      return;
239   }
240
241   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
242       reg.nr == BRW_ARF_NULL)
243      return;
244
245   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
246   hstride = hstride_for_reg[reg.hstride];
247
248   if (reg.vstride == 0xf) {
249      vstride = -1;
250   } else {
251      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
252      vstride = vstride_for_reg[reg.vstride];
253   }
254
255   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
256   width = width_for_reg[reg.width];
257
258   assert(insn->header.execution_size >= 0 &&
259	  insn->header.execution_size < Elements(execsize_for_reg));
260   execsize = execsize_for_reg[insn->header.execution_size];
261
262   /* Restrictions from 3.3.10: Register Region Restrictions. */
263   /* 3. */
264   assert(execsize >= width);
265
266   /* 4. */
267   if (execsize == width && hstride != 0) {
268      assert(vstride == -1 || vstride == width * hstride);
269   }
270
271   /* 5. */
272   if (execsize == width && hstride == 0) {
273      /* no restriction on vstride. */
274   }
275
276   /* 6. */
277   if (width == 1) {
278      assert(hstride == 0);
279   }
280
281   /* 7. */
282   if (execsize == 1 && width == 1) {
283      assert(hstride == 0);
284      assert(vstride == 0);
285   }
286
287   /* 8. */
288   if (vstride == 0 && hstride == 0) {
289      assert(width == 1);
290   }
291
292   /* 10. Check destination issues. */
293}
294
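/* Return true if a 32-bit immediate can be stored in the 13 bits that a
 * compacted instruction provides (12 literal bits plus one bit replicated
 * through the top 20).  For example, 0x00000abc and 0xfffffabc are
 * compactable, while 0x00001abc is not.
 */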
295static bool
296is_compactable_immediate(unsigned imm)
297{
298   /* We get the low 12 bits as-is. */
299   imm &= ~0xfff;
300
301   /* We get one bit replicated through the top 20 bits. */
302   return imm == 0 || imm == 0xfffff000;
303}
304
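/* Encode the src0 operand: file, type, modifiers, and either an immediate
 * value or a register region.  Also applies a couple of type tweaks that
 * make immediate-bearing instructions eligible for compaction.
 */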
305void
306brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
307	     struct brw_reg reg)
308{
309   struct brw_context *brw = p->brw;
310
311   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
312      assert(reg.nr < 128);
313
314   gen7_convert_mrf_to_grf(p, &reg);
315
316   if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
317                           insn->header.opcode == BRW_OPCODE_SENDC)) {
318      /* Any source modifiers or regions will be ignored, since this just
319       * identifies the MRF/GRF to start reading the message contents from.
320       * Check for some likely failures.
321       */
322      assert(!reg.negate);
323      assert(!reg.abs);
324      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
325   }
326
327   validate_reg(insn, reg);
328
329   insn->bits1.da1.src0_reg_file = reg.file;
330   insn->bits1.da1.src0_reg_type =
331      brw_reg_type_to_hw_type(brw, reg.type, reg.file);
332   insn->bits2.da1.src0_abs = reg.abs;
333   insn->bits2.da1.src0_negate = reg.negate;
334   insn->bits2.da1.src0_address_mode = reg.address_mode;
335
336   if (reg.file == BRW_IMMEDIATE_VALUE) {
337      insn->bits3.ud = reg.dw1.ud;
338
339      /* The Bspec's section titled "Non-present Operands" claims that if src0
340       * is an immediate that src1's type must be the same as that of src0.
341       *
342       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
343       * that do not follow this rule. E.g., from the IVB/HSW table:
344       *
345       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
346       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
347       *
348       * And from the SNB table:
349       *
350       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
351       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
352       *
353       * Neither of these cause warnings from the simulator when used,
354       * compacted or otherwise. In fact, all compaction mappings that have an
355       * immediate in src0 use a:ud for src1.
356       *
357       * The GM45 instruction compaction tables do not contain mapped meanings
358       * so it's not clear whether it has the restriction. We'll assume it was
359       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
360       */
361      insn->bits1.da1.src1_reg_file = 0; /* arf */
362      if (brw->gen < 6) {
363         insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
364      } else {
365         insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD;
366      }
367
368      /* Compacted instructions only have 12 bits (plus 1 bit replicated
369       * through the other 20) for immediate values. Presumably the hardware
370       * engineers realized that the only useful floating-point value representable
371       * in this format is 0.0, which can also be represented as a VF-typed
372       * immediate, so they gave us the previously mentioned mapping on IVB+.
373       *
374       * Strangely, we do have a mapping for imm:f in src1, so we don't need
375       * to do this there.
376       *
377       * If we see a 0.0:F, change the type to VF so that it can be compacted.
378       */
379      if (insn->bits3.ud == 0x0 &&
380          insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_F) {
381         insn->bits1.da1.src0_reg_type = BRW_HW_REG_IMM_TYPE_VF;
382      }
383
384      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
385       * set the types to :UD so the instruction can be compacted.
386       */
387      if (is_compactable_immediate(insn->bits3.ud) &&
388          insn->header.destreg__conditionalmod == BRW_CONDITIONAL_NONE &&
389          insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_D &&
390          insn->bits1.da1.dest_reg_type == BRW_HW_REG_TYPE_D) {
391         insn->bits1.da1.src0_reg_type = BRW_HW_REG_TYPE_UD;
392         insn->bits1.da1.dest_reg_type = BRW_HW_REG_TYPE_UD;
393      }
394   } else {
395      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
396	 if (insn->header.access_mode == BRW_ALIGN_1) {
397	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
398	    insn->bits2.da1.src0_reg_nr = reg.nr;
399	 } else {
400	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
401	    insn->bits2.da16.src0_reg_nr = reg.nr;
402	 }
403      } else {
404	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
405
406	 if (insn->header.access_mode == BRW_ALIGN_1) {
407	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
408	 } else {
409	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
410	 }
411      }
412
413      if (insn->header.access_mode == BRW_ALIGN_1) {
414	 if (reg.width == BRW_WIDTH_1 &&
415	     insn->header.execution_size == BRW_EXECUTE_1) {
416	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
417	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
418	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
419	 } else {
420	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
421	    insn->bits2.da1.src0_width = reg.width;
422	    insn->bits2.da1.src0_vert_stride = reg.vstride;
423	 }
424      } else {
425	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
426	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
427	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
428	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
429
430	 /* This is an oddity of using the same register descriptions
431	  * for align16 as for align1:
432	  */
433	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
434	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
435	 else
436	    insn->bits2.da16.src0_vert_stride = reg.vstride;
437      }
438   }
439}
440
441
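/* Encode the src1 operand.  src1 can never be an MRF, and if it is an
 * immediate it must be the only immediate in the instruction (src0 must
 * not be one).
 */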
442void
443brw_set_src1(struct brw_compile *p,
444             struct brw_instruction *insn,
445             struct brw_reg reg)
446{
447   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
448
449   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
450      assert(reg.nr < 128);
451
452   gen7_convert_mrf_to_grf(p, &reg);
453
454   validate_reg(insn, reg);
455
456   insn->bits1.da1.src1_reg_file = reg.file;
457   insn->bits1.da1.src1_reg_type =
458      brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
459   insn->bits3.da1.src1_abs = reg.abs;
460   insn->bits3.da1.src1_negate = reg.negate;
461
462   /* Only src1 can be immediate in two-argument instructions.
463    */
464   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
465
466   if (reg.file == BRW_IMMEDIATE_VALUE) {
467      insn->bits3.ud = reg.dw1.ud;
468   } else {
469      /* This is a hardware restriction, which may or may not be lifted
470       * in the future:
471       */
472      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
473      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
474
475      if (insn->header.access_mode == BRW_ALIGN_1) {
476	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
477	 insn->bits3.da1.src1_reg_nr = reg.nr;
478      } else {
479	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
480	 insn->bits3.da16.src1_reg_nr = reg.nr;
481      }
482
483      if (insn->header.access_mode == BRW_ALIGN_1) {
484	 if (reg.width == BRW_WIDTH_1 &&
485	     insn->header.execution_size == BRW_EXECUTE_1) {
486	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
487	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
488	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
489	 } else {
490	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
491	    insn->bits3.da1.src1_width = reg.width;
492	    insn->bits3.da1.src1_vert_stride = reg.vstride;
493	 }
494      } else {
495	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
496	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
497	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
498	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
499
500	 /* This is an oddity of using the same register descriptions
501	  * for align16 as for align1:
502	  */
503	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
504	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
505	 else
506	    insn->bits3.da16.src1_vert_stride = reg.vstride;
507      }
508   }
509}
510
511/**
512 * Set the Message Descriptor and Extended Message Descriptor fields
513 * for SEND messages.
514 *
515 * \note This zeroes out the Function Control bits, so it must be called
516 *       \b before filling out any message-specific data.  Callers can
517 *       choose not to fill in irrelevant bits; they will be zero.
518 */
519static void
520brw_set_message_descriptor(struct brw_compile *p,
521			   struct brw_instruction *inst,
522			   enum brw_message_target sfid,
523			   unsigned msg_length,
524			   unsigned response_length,
525			   bool header_present,
526			   bool end_of_thread)
527{
528   struct brw_context *brw = p->brw;
529
530   brw_set_src1(p, inst, brw_imm_d(0));
531
532   if (brw->gen >= 5) {
533      inst->bits3.generic_gen5.header_present = header_present;
534      inst->bits3.generic_gen5.response_length = response_length;
535      inst->bits3.generic_gen5.msg_length = msg_length;
536      inst->bits3.generic_gen5.end_of_thread = end_of_thread;
537
538      if (brw->gen >= 6) {
539	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
540	 inst->header.destreg__conditionalmod = sfid;
541      } else {
542	 /* Set Extended Message Descriptor (ex_desc) */
543	 inst->bits2.send_gen5.sfid = sfid;
544	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
545      }
546   } else {
547      inst->bits3.generic.response_length = response_length;
548      inst->bits3.generic.msg_length = msg_length;
549      inst->bits3.generic.msg_target = sfid;
550      inst->bits3.generic.end_of_thread = end_of_thread;
551   }
552}
553
554static void brw_set_math_message( struct brw_compile *p,
555				  struct brw_instruction *insn,
556				  unsigned function,
557				  unsigned integer_type,
558				  bool low_precision,
559				  unsigned dataType )
560{
561   struct brw_context *brw = p->brw;
562   unsigned msg_length;
563   unsigned response_length;
564
565   /* Infer message length from the function */
566   switch (function) {
567   case BRW_MATH_FUNCTION_POW:
568   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
569   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
570   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
571      msg_length = 2;
572      break;
573   default:
574      msg_length = 1;
575      break;
576   }
577
578   /* Infer response length from the function */
579   switch (function) {
580   case BRW_MATH_FUNCTION_SINCOS:
581   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
582      response_length = 2;
583      break;
584   default:
585      response_length = 1;
586      break;
587   }
588
589
590   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
591			      msg_length, response_length, false, false);
592   if (brw->gen == 5) {
593      insn->bits3.math_gen5.function = function;
594      insn->bits3.math_gen5.int_type = integer_type;
595      insn->bits3.math_gen5.precision = low_precision;
596      insn->bits3.math_gen5.saturate = insn->header.saturate;
597      insn->bits3.math_gen5.data_type = dataType;
598      insn->bits3.math_gen5.snapshot = 0;
599   } else {
600      insn->bits3.math.function = function;
601      insn->bits3.math.int_type = integer_type;
602      insn->bits3.math.precision = low_precision;
603      insn->bits3.math.saturate = insn->header.saturate;
604      insn->bits3.math.data_type = dataType;
605   }
606   insn->header.saturate = 0;
607}
608
609
610static void brw_set_ff_sync_message(struct brw_compile *p,
611				    struct brw_instruction *insn,
612				    bool allocate,
613				    unsigned response_length,
614				    bool end_of_thread)
615{
616   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
617			      1, response_length, true, end_of_thread);
618   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
619   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
620   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
621   insn->bits3.urb_gen5.allocate = allocate;
622   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
623   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
624}
625
626static void brw_set_urb_message( struct brw_compile *p,
627				 struct brw_instruction *insn,
628                                 enum brw_urb_write_flags flags,
629				 unsigned msg_length,
630				 unsigned response_length,
631				 unsigned offset,
632				 unsigned swizzle_control )
633{
634   struct brw_context *brw = p->brw;
635
636   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
637			      msg_length, response_length, true,
638                              flags & BRW_URB_WRITE_EOT);
639   if (brw->gen == 7) {
640      if (flags & BRW_URB_WRITE_OWORD) {
641         assert(msg_length == 2); /* header + one OWORD of data */
642         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
643      } else {
644         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
645      }
646      insn->bits3.urb_gen7.offset = offset;
647      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
648      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
649      insn->bits3.urb_gen7.per_slot_offset =
650         flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
651      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
652   } else if (brw->gen >= 5) {
653      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
654      insn->bits3.urb_gen5.offset = offset;
655      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
656      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
657      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
658      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
659   } else {
660      insn->bits3.urb.opcode = 0;	/* ? */
661      insn->bits3.urb.offset = offset;
662      insn->bits3.urb.swizzle_control = swizzle_control;
663      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
664      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
665      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
666   }
667}
668
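/* Fill out the message descriptor for a data port write, picking the
 * shared function (SFID) appropriate to the hardware generation and, on
 * Gen7+, routing render target writes to the render cache.
 */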
669void
670brw_set_dp_write_message(struct brw_compile *p,
671			 struct brw_instruction *insn,
672			 unsigned binding_table_index,
673			 unsigned msg_control,
674			 unsigned msg_type,
675			 unsigned msg_length,
676			 bool header_present,
677			 unsigned last_render_target,
678			 unsigned response_length,
679			 unsigned end_of_thread,
680			 unsigned send_commit_msg)
681{
682   struct brw_context *brw = p->brw;
683   unsigned sfid;
684
685   if (brw->gen >= 7) {
686      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
687      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
688	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
689      else
690	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
691   } else if (brw->gen == 6) {
692      /* Use the render cache for all write messages. */
693      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
694   } else {
695      sfid = BRW_SFID_DATAPORT_WRITE;
696   }
697
698   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
699			      header_present, end_of_thread);
700
701   if (brw->gen >= 7) {
702      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
703      insn->bits3.gen7_dp.msg_control = msg_control;
704      insn->bits3.gen7_dp.last_render_target = last_render_target;
705      insn->bits3.gen7_dp.msg_type = msg_type;
706   } else if (brw->gen == 6) {
707      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
708      insn->bits3.gen6_dp.msg_control = msg_control;
709      insn->bits3.gen6_dp.last_render_target = last_render_target;
710      insn->bits3.gen6_dp.msg_type = msg_type;
711      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
712   } else if (brw->gen == 5) {
713      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
714      insn->bits3.dp_write_gen5.msg_control = msg_control;
715      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
716      insn->bits3.dp_write_gen5.msg_type = msg_type;
717      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
718   } else {
719      insn->bits3.dp_write.binding_table_index = binding_table_index;
720      insn->bits3.dp_write.msg_control = msg_control;
721      insn->bits3.dp_write.last_render_target = last_render_target;
722      insn->bits3.dp_write.msg_type = msg_type;
723      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
724   }
725}
726
727void
728brw_set_dp_read_message(struct brw_compile *p,
729			struct brw_instruction *insn,
730			unsigned binding_table_index,
731			unsigned msg_control,
732			unsigned msg_type,
733			unsigned target_cache,
734			unsigned msg_length,
735                        bool header_present,
736			unsigned response_length)
737{
738   struct brw_context *brw = p->brw;
739   unsigned sfid;
740
741   if (brw->gen >= 7) {
742      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
743   } else if (brw->gen == 6) {
744      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
745	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
746      else
747	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
748   } else {
749      sfid = BRW_SFID_DATAPORT_READ;
750   }
751
752   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
753			      header_present, false);
754
755   if (brw->gen >= 7) {
756      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
757      insn->bits3.gen7_dp.msg_control = msg_control;
758      insn->bits3.gen7_dp.last_render_target = 0;
759      insn->bits3.gen7_dp.msg_type = msg_type;
760   } else if (brw->gen == 6) {
761      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
762      insn->bits3.gen6_dp.msg_control = msg_control;
763      insn->bits3.gen6_dp.last_render_target = 0;
764      insn->bits3.gen6_dp.msg_type = msg_type;
765      insn->bits3.gen6_dp.send_commit_msg = 0;
766   } else if (brw->gen == 5) {
767      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
768      insn->bits3.dp_read_gen5.msg_control = msg_control;
769      insn->bits3.dp_read_gen5.msg_type = msg_type;
770      insn->bits3.dp_read_gen5.target_cache = target_cache;
771   } else if (brw->is_g4x) {
772      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
773      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
774      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
775      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
776   } else {
777      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
778      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
779      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
780      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
781   }
782}
783
784void
785brw_set_sampler_message(struct brw_compile *p,
786                        struct brw_instruction *insn,
787                        unsigned binding_table_index,
788                        unsigned sampler,
789                        unsigned msg_type,
790                        unsigned response_length,
791                        unsigned msg_length,
792                        unsigned header_present,
793                        unsigned simd_mode,
794                        unsigned return_format)
795{
796   struct brw_context *brw = p->brw;
797
798   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
799			      response_length, header_present, false);
800
801   if (brw->gen >= 7) {
802      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
803      insn->bits3.sampler_gen7.sampler = sampler;
804      insn->bits3.sampler_gen7.msg_type = msg_type;
805      insn->bits3.sampler_gen7.simd_mode = simd_mode;
806   } else if (brw->gen >= 5) {
807      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
808      insn->bits3.sampler_gen5.sampler = sampler;
809      insn->bits3.sampler_gen5.msg_type = msg_type;
810      insn->bits3.sampler_gen5.simd_mode = simd_mode;
811   } else if (brw->is_g4x) {
812      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
813      insn->bits3.sampler_g4x.sampler = sampler;
814      insn->bits3.sampler_g4x.msg_type = msg_type;
815   } else {
816      insn->bits3.sampler.binding_table_index = binding_table_index;
817      insn->bits3.sampler.sampler = sampler;
818      insn->bits3.sampler.msg_type = msg_type;
819      insn->bits3.sampler.return_format = return_format;
820   }
821}
822
823
824#define next_insn brw_next_insn
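/* Allocate the next instruction slot, growing the instruction store if
 * necessary, copy the current default instruction state into it, and set
 * the opcode.
 */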
825struct brw_instruction *
826brw_next_insn(struct brw_compile *p, unsigned opcode)
827{
828   struct brw_instruction *insn;
829
830   if (p->nr_insn + 1 > p->store_size) {
831      p->store_size <<= 1;
832      p->store = reralloc(p->mem_ctx, p->store,
833                          struct brw_instruction, p->store_size);
834   }
835
836   p->next_insn_offset += 16;
837   insn = &p->store[p->nr_insn++];
838   memcpy(insn, p->current, sizeof(*insn));
839
840   insn->header.opcode = opcode;
841   return insn;
842}
843
844static struct brw_instruction *brw_alu1( struct brw_compile *p,
845					 unsigned opcode,
846					 struct brw_reg dest,
847					 struct brw_reg src )
848{
849   struct brw_instruction *insn = next_insn(p, opcode);
850   brw_set_dest(p, insn, dest);
851   brw_set_src0(p, insn, src);
852   return insn;
853}
854
855static struct brw_instruction *brw_alu2(struct brw_compile *p,
856					unsigned opcode,
857					struct brw_reg dest,
858					struct brw_reg src0,
859					struct brw_reg src1 )
860{
861   struct brw_instruction *insn = next_insn(p, opcode);
862   brw_set_dest(p, insn, dest);
863   brw_set_src0(p, insn, src0);
864   brw_set_src1(p, insn, src1);
865   return insn;
866}
867
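/* Compute the SubRegNum field (in dwords) for a three-source operand.  For
 * a scalar source (vertical stride 0), fold the replicated swizzle channel
 * into the subregister offset so the right component is selected.
 */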
868static int
869get_3src_subreg_nr(struct brw_reg reg)
870{
871   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
872      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
873      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
874   } else {
875      return reg.subnr / 4;
876   }
877}
878
879static struct brw_instruction *brw_alu3(struct brw_compile *p,
880					unsigned opcode,
881					struct brw_reg dest,
882					struct brw_reg src0,
883					struct brw_reg src1,
884					struct brw_reg src2)
885{
886   struct brw_context *brw = p->brw;
887   struct brw_instruction *insn = next_insn(p, opcode);
888
889   gen7_convert_mrf_to_grf(p, &dest);
890
891   assert(insn->header.access_mode == BRW_ALIGN_16);
892
893   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
894	  dest.file == BRW_MESSAGE_REGISTER_FILE);
895   assert(dest.nr < 128);
896   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
897   assert(dest.type == BRW_REGISTER_TYPE_F ||
898          dest.type == BRW_REGISTER_TYPE_D ||
899          dest.type == BRW_REGISTER_TYPE_UD);
900   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
901   insn->bits1.da3src.dest_reg_nr = dest.nr;
902   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
903   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
904   guess_execution_size(p, insn, dest);
905
906   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
907   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
908   assert(src0.nr < 128);
909   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
910   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
911   insn->bits2.da3src.src0_reg_nr = src0.nr;
912   insn->bits1.da3src.src0_abs = src0.abs;
913   insn->bits1.da3src.src0_negate = src0.negate;
914   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
915
916   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
917   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
918   assert(src1.nr < 128);
919   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
920   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
921   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
922   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
923   insn->bits3.da3src.src1_reg_nr = src1.nr;
924   insn->bits1.da3src.src1_abs = src1.abs;
925   insn->bits1.da3src.src1_negate = src1.negate;
926
927   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
928   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
929   assert(src2.nr < 128);
930   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
931   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
932   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
933   insn->bits3.da3src.src2_reg_nr = src2.nr;
934   insn->bits1.da3src.src2_abs = src2.abs;
935   insn->bits1.da3src.src2_negate = src2.negate;
936
937   if (brw->gen >= 7) {
938      /* Set both the source and destination types based on dest.type,
939       * ignoring the source register types.  The MAD and LRP emitters ensure
940       * that all four types are float.  The BFE and BFI2 emitters, however,
941       * may send us mixed D and UD types and want us to ignore that and use
942       * the destination type.
943       */
944      switch (dest.type) {
945      case BRW_REGISTER_TYPE_F:
946         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
947         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
948         break;
949      case BRW_REGISTER_TYPE_D:
950         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
951         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
952         break;
953      case BRW_REGISTER_TYPE_UD:
954         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
955         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
956         break;
957      }
958   }
959
960   return insn;
961}
962
963
964/***********************************************************************
965 * Convenience routines.
966 */
967#define ALU1(OP)					\
968struct brw_instruction *brw_##OP(struct brw_compile *p,	\
969	      struct brw_reg dest,			\
970	      struct brw_reg src0)   			\
971{							\
972   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
973}
974
975#define ALU2(OP)					\
976struct brw_instruction *brw_##OP(struct brw_compile *p,	\
977	      struct brw_reg dest,			\
978	      struct brw_reg src0,			\
979	      struct brw_reg src1)   			\
980{							\
981   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
982}
983
984#define ALU3(OP)					\
985struct brw_instruction *brw_##OP(struct brw_compile *p,	\
986	      struct brw_reg dest,			\
987	      struct brw_reg src0,			\
988	      struct brw_reg src1,			\
989	      struct brw_reg src2)   			\
990{							\
991   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
992}
993
994#define ALU3F(OP)                                               \
995struct brw_instruction *brw_##OP(struct brw_compile *p,         \
996                                 struct brw_reg dest,           \
997                                 struct brw_reg src0,           \
998                                 struct brw_reg src1,           \
999                                 struct brw_reg src2)           \
1000{                                                               \
1001   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
1002   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
1003   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
1004   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
1005   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
1006}
1007
1008/* Rounding operations (other than RNDD) require two instructions - the first
1009 * stores a rounded value (possibly the wrong way) in the dest register, but
1010 * also sets a per-channel "increment bit" in the flag register.  A predicated
1011 * add of 1.0 fixes dest to contain the desired result.
1012 *
1013 * Sandybridge and later appear to round correctly without an ADD.
1014 */
1015#define ROUND(OP)							      \
1016void brw_##OP(struct brw_compile *p,					      \
1017	      struct brw_reg dest,					      \
1018	      struct brw_reg src)					      \
1019{									      \
1020   struct brw_instruction *rnd, *add;					      \
1021   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
1022   brw_set_dest(p, rnd, dest);						      \
1023   brw_set_src0(p, rnd, src);						      \
1024									      \
1025   if (p->brw->gen < 6) {						      \
1026      /* turn on round-increments */					      \
1027      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
1028      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
1029      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
1030   }									      \
1031}
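/* As a rough sketch of what the pre-Gen6 path above emits, brw_RNDE(p, dst, src)
 * expands to something like:
 *
 *    rnde.r (8)  dst  src        // sets the per-channel round-increment flag
 *    (+f0) add (8)  dst  dst  1.0F
 *
 * whereas on Gen6+ only the single RNDE instruction is emitted.
 */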
1032
1033
1034ALU1(MOV)
1035ALU2(SEL)
1036ALU1(NOT)
1037ALU2(AND)
1038ALU2(OR)
1039ALU2(XOR)
1040ALU2(SHR)
1041ALU2(SHL)
1042ALU2(ASR)
1043ALU1(F32TO16)
1044ALU1(F16TO32)
1045ALU1(FRC)
1046ALU1(RNDD)
1047ALU2(MAC)
1048ALU2(MACH)
1049ALU1(LZD)
1050ALU2(DP4)
1051ALU2(DPH)
1052ALU2(DP3)
1053ALU2(DP2)
1054ALU2(LINE)
1055ALU2(PLN)
1056ALU3F(MAD)
1057ALU3F(LRP)
1058ALU1(BFREV)
1059ALU3(BFE)
1060ALU2(BFI1)
1061ALU3(BFI2)
1062ALU1(FBH)
1063ALU1(FBL)
1064ALU1(CBIT)
1065ALU2(ADDC)
1066ALU2(SUBB)
1067
1068ROUND(RNDZ)
1069ROUND(RNDE)
1070
1071
1072struct brw_instruction *brw_ADD(struct brw_compile *p,
1073				struct brw_reg dest,
1074				struct brw_reg src0,
1075				struct brw_reg src1)
1076{
1077   /* 6.2.2: add */
1078   if (src0.type == BRW_REGISTER_TYPE_F ||
1079       (src0.file == BRW_IMMEDIATE_VALUE &&
1080	src0.type == BRW_REGISTER_TYPE_VF)) {
1081      assert(src1.type != BRW_REGISTER_TYPE_UD);
1082      assert(src1.type != BRW_REGISTER_TYPE_D);
1083   }
1084
1085   if (src1.type == BRW_REGISTER_TYPE_F ||
1086       (src1.file == BRW_IMMEDIATE_VALUE &&
1087	src1.type == BRW_REGISTER_TYPE_VF)) {
1088      assert(src0.type != BRW_REGISTER_TYPE_UD);
1089      assert(src0.type != BRW_REGISTER_TYPE_D);
1090   }
1091
1092   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1093}
1094
1095struct brw_instruction *brw_AVG(struct brw_compile *p,
1096                                struct brw_reg dest,
1097                                struct brw_reg src0,
1098                                struct brw_reg src1)
1099{
1100   assert(dest.type == src0.type);
1101   assert(src0.type == src1.type);
1102   switch (src0.type) {
1103   case BRW_REGISTER_TYPE_B:
1104   case BRW_REGISTER_TYPE_UB:
1105   case BRW_REGISTER_TYPE_W:
1106   case BRW_REGISTER_TYPE_UW:
1107   case BRW_REGISTER_TYPE_D:
1108   case BRW_REGISTER_TYPE_UD:
1109      break;
1110   default:
1111      assert(!"Bad type for brw_AVG");
1112   }
1113
1114   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1115}
1116
1117struct brw_instruction *brw_MUL(struct brw_compile *p,
1118				struct brw_reg dest,
1119				struct brw_reg src0,
1120				struct brw_reg src1)
1121{
1122   /* 6.32.38: mul */
1123   if (src0.type == BRW_REGISTER_TYPE_D ||
1124       src0.type == BRW_REGISTER_TYPE_UD ||
1125       src1.type == BRW_REGISTER_TYPE_D ||
1126       src1.type == BRW_REGISTER_TYPE_UD) {
1127      assert(dest.type != BRW_REGISTER_TYPE_F);
1128   }
1129
1130   if (src0.type == BRW_REGISTER_TYPE_F ||
1131       (src0.file == BRW_IMMEDIATE_VALUE &&
1132	src0.type == BRW_REGISTER_TYPE_VF)) {
1133      assert(src1.type != BRW_REGISTER_TYPE_UD);
1134      assert(src1.type != BRW_REGISTER_TYPE_D);
1135   }
1136
1137   if (src1.type == BRW_REGISTER_TYPE_F ||
1138       (src1.file == BRW_IMMEDIATE_VALUE &&
1139	src1.type == BRW_REGISTER_TYPE_VF)) {
1140      assert(src0.type != BRW_REGISTER_TYPE_UD);
1141      assert(src0.type != BRW_REGISTER_TYPE_D);
1142   }
1143
1144   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1145	  src0.nr != BRW_ARF_ACCUMULATOR);
1146   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1147	  src1.nr != BRW_ARF_ACCUMULATOR);
1148
1149   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1150}
1151
1152
1153void brw_NOP(struct brw_compile *p)
1154{
1155   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1156   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1157   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1158   brw_set_src1(p, insn, brw_imm_ud(0x0));
1159}
1160
1161
1162
1163
1164
1165/***********************************************************************
1166 * Comparisons, if/else/endif
1167 */
1168
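/* Emit a JMPI (jump indexed) instruction: an IP-relative jump by the offset
 * given in 'index', emitted with compression and masking disabled and with
 * the caller's choice of predication.
 */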
1169struct brw_instruction *brw_JMPI(struct brw_compile *p,
1170                                 struct brw_reg index,
1171                                 unsigned predicate_control)
1172{
1173   struct brw_reg ip = brw_ip_reg();
1174   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1175
1176   insn->header.execution_size = 1;
1177   insn->header.compression_control = BRW_COMPRESSION_NONE;
1178   insn->header.mask_control = BRW_MASK_DISABLE;
1179   insn->header.predicate_control = predicate_control;
1180
1181   return insn;
1182}
1183
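/* The if_stack holds, as offsets into p->store, the IF/ELSE instructions
 * whose jump targets still need to be patched once the matching ENDIF is
 * emitted.
 */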
1184static void
1185push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1186{
1187   p->if_stack[p->if_stack_depth] = inst - p->store;
1188
1189   p->if_stack_depth++;
1190   if (p->if_stack_array_size <= p->if_stack_depth) {
1191      p->if_stack_array_size *= 2;
1192      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1193			     p->if_stack_array_size);
1194   }
1195}
1196
1197static struct brw_instruction *
1198pop_if_stack(struct brw_compile *p)
1199{
1200   p->if_stack_depth--;
1201   return &p->store[p->if_stack[p->if_stack_depth]];
1202}
1203
1204static void
1205push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1206{
1207   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1208      p->loop_stack_array_size *= 2;
1209      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1210			       p->loop_stack_array_size);
1211      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1212				     p->loop_stack_array_size);
1213   }
1214
1215   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1216   p->loop_stack_depth++;
1217   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1218}
1219
1220static struct brw_instruction *
1221get_inner_do_insn(struct brw_compile *p)
1222{
1223   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1224}
1225
1226/* EU takes the value from the flag register and pushes it onto some
1227 * sort of a stack (presumably merging with any flag value already on
1228 * the stack).  Within an if block, the flags at the top of the stack
1229 * control execution on each channel of the unit, e.g. on each of the
1230 * 16 pixel values in our wm programs.
1231 *
1232 * When the matching 'else' instruction is reached (presumably by
1233 * countdown of the instruction count patched in by our ELSE/ENDIF
1234 * functions), the relevant flags are inverted.
1235 *
1236 * When the matching 'endif' instruction is reached, the flags are
1237 * popped off.  If the stack is now empty, normal execution resumes.
1238 */
1239struct brw_instruction *
1240brw_IF(struct brw_compile *p, unsigned execute_size)
1241{
1242   struct brw_context *brw = p->brw;
1243   struct brw_instruction *insn;
1244
1245   insn = next_insn(p, BRW_OPCODE_IF);
1246
1247   /* Override the defaults for this instruction:
1248    */
1249   if (brw->gen < 6) {
1250      brw_set_dest(p, insn, brw_ip_reg());
1251      brw_set_src0(p, insn, brw_ip_reg());
1252      brw_set_src1(p, insn, brw_imm_d(0x0));
1253   } else if (brw->gen == 6) {
1254      brw_set_dest(p, insn, brw_imm_w(0));
1255      insn->bits1.branch_gen6.jump_count = 0;
1256      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1257      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1258   } else {
1259      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1260      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1261      brw_set_src1(p, insn, brw_imm_ud(0));
1262      insn->bits3.break_cont.jip = 0;
1263      insn->bits3.break_cont.uip = 0;
1264   }
1265
1266   insn->header.execution_size = execute_size;
1267   insn->header.compression_control = BRW_COMPRESSION_NONE;
1268   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
1269   insn->header.mask_control = BRW_MASK_ENABLE;
1270   if (!p->single_program_flow)
1271      insn->header.thread_control = BRW_THREAD_SWITCH;
1272
1273   push_if_stack(p, insn);
1274   p->if_depth_in_loop[p->loop_stack_depth]++;
1275   return insn;
1276}
1277
1278/* This function is only used for gen6-style IF instructions with an
1279 * embedded comparison (conditional modifier).  It is not used on gen7.
1280 */
1281struct brw_instruction *
1282gen6_IF(struct brw_compile *p, uint32_t conditional,
1283	struct brw_reg src0, struct brw_reg src1)
1284{
1285   struct brw_instruction *insn;
1286
1287   insn = next_insn(p, BRW_OPCODE_IF);
1288
1289   brw_set_dest(p, insn, brw_imm_w(0));
1290   if (p->compressed) {
1291      insn->header.execution_size = BRW_EXECUTE_16;
1292   } else {
1293      insn->header.execution_size = BRW_EXECUTE_8;
1294   }
1295   insn->bits1.branch_gen6.jump_count = 0;
1296   brw_set_src0(p, insn, src0);
1297   brw_set_src1(p, insn, src1);
1298
1299   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1300   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1301   insn->header.destreg__conditionalmod = conditional;
1302
1303   if (!p->single_program_flow)
1304      insn->header.thread_control = BRW_THREAD_SWITCH;
1305
1306   push_if_stack(p, insn);
1307   return insn;
1308}
1309
1310/**
1311 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1312 */
1313static void
1314convert_IF_ELSE_to_ADD(struct brw_compile *p,
1315		       struct brw_instruction *if_inst,
1316		       struct brw_instruction *else_inst)
1317{
1318   /* The next instruction (where the ENDIF would be, if it existed) */
1319   struct brw_instruction *next_inst = &p->store[p->nr_insn];
1320
1321   assert(p->single_program_flow);
1322   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1323   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1324   assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1325
1326   /* Convert IF to an ADD instruction that moves the instruction pointer
1327    * to the first instruction of the ELSE block.  If there is no ELSE
1328    * block, point to where ENDIF would be.  Reverse the predicate.
1329    *
1330    * There's no need to execute an ENDIF since we don't need to do any
1331    * stack operations, and if we're currently executing, we just want to
1332    * continue normally.
1333    */
1334   if_inst->header.opcode = BRW_OPCODE_ADD;
1335   if_inst->header.predicate_inverse = 1;
1336
1337   if (else_inst != NULL) {
1338      /* Convert ELSE to an ADD instruction that points where the ENDIF
1339       * would be.
1340       */
1341      else_inst->header.opcode = BRW_OPCODE_ADD;
1342
1343      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1344      else_inst->bits3.ud = (next_inst - else_inst) * 16;
1345   } else {
1346      if_inst->bits3.ud = (next_inst - if_inst) * 16;
1347   }
1348}
1349
1350/**
1351 * Patch IF and ELSE instructions with appropriate jump targets.
1352 */
1353static void
1354patch_IF_ELSE(struct brw_compile *p,
1355	      struct brw_instruction *if_inst,
1356	      struct brw_instruction *else_inst,
1357	      struct brw_instruction *endif_inst)
1358{
1359   struct brw_context *brw = p->brw;
1360
1361   /* We shouldn't be patching IF and ELSE instructions in single program flow
1362    * mode when gen < 6, because in single program flow mode on those
1363    * platforms, we convert flow control instructions to conditional ADDs that
1364    * operate on IP (see brw_ENDIF).
1365    *
1366    * However, on Gen6, writing to IP doesn't work in single program flow mode
1367    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1368    * not be updated by non-flow control instructions.").  And on later
1369    * platforms, there is no significant benefit to converting control flow
1370    * instructions to conditional ADDs.  So we do patch IF and ELSE
1371    * instructions in single program flow mode on those platforms.
1372    */
1373   if (brw->gen < 6)
1374      assert(!p->single_program_flow);
1375
1376   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1377   assert(endif_inst != NULL);
1378   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1379
1380   unsigned br = 1;
1381   /* Jump count is for 64bit data chunk each, so one 128bit instruction
1382   /* Jump counts are in units of 64-bit data chunks, so one 128-bit
1383    * instruction requires 2 chunks.
1384   if (brw->gen >= 5)
1385      br = 2;
1386
1387   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1388   endif_inst->header.execution_size = if_inst->header.execution_size;
1389
1390   if (else_inst == NULL) {
1391      /* Patch IF -> ENDIF */
1392      if (brw->gen < 6) {
1393	 /* Turn it into an IFF, which means no mask stack operations for
1394	  * all-false and jumping past the ENDIF.
1395	  */
1396	 if_inst->header.opcode = BRW_OPCODE_IFF;
1397	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1398	 if_inst->bits3.if_else.pop_count = 0;
1399	 if_inst->bits3.if_else.pad0 = 0;
1400      } else if (brw->gen == 6) {
1401	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1402	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1403      } else {
1404	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1405	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1406      }
1407   } else {
1408      else_inst->header.execution_size = if_inst->header.execution_size;
1409
1410      /* Patch IF -> ELSE */
1411      if (brw->gen < 6) {
1412	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1413	 if_inst->bits3.if_else.pop_count = 0;
1414	 if_inst->bits3.if_else.pad0 = 0;
1415      } else if (brw->gen == 6) {
1416	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1417      }
1418
1419      /* Patch ELSE -> ENDIF */
1420      if (brw->gen < 6) {
1421	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1422	  * matching ENDIF.
1423	  */
1424	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1425	 else_inst->bits3.if_else.pop_count = 1;
1426	 else_inst->bits3.if_else.pad0 = 0;
1427      } else if (brw->gen == 6) {
1428	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1429	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1430      } else {
1431	 /* The IF instruction's JIP should point just past the ELSE */
1432	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1433	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1434	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1435	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1436      }
1437   }
1438}
1439
1440void
1441brw_ELSE(struct brw_compile *p)
1442{
1443   struct brw_context *brw = p->brw;
1444   struct brw_instruction *insn;
1445
1446   insn = next_insn(p, BRW_OPCODE_ELSE);
1447
1448   if (brw->gen < 6) {
1449      brw_set_dest(p, insn, brw_ip_reg());
1450      brw_set_src0(p, insn, brw_ip_reg());
1451      brw_set_src1(p, insn, brw_imm_d(0x0));
1452   } else if (brw->gen == 6) {
1453      brw_set_dest(p, insn, brw_imm_w(0));
1454      insn->bits1.branch_gen6.jump_count = 0;
1455      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1456      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1457   } else {
1458      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1459      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1460      brw_set_src1(p, insn, brw_imm_ud(0));
1461      insn->bits3.break_cont.jip = 0;
1462      insn->bits3.break_cont.uip = 0;
1463   }
1464
1465   insn->header.compression_control = BRW_COMPRESSION_NONE;
1466   insn->header.mask_control = BRW_MASK_ENABLE;
1467   if (!p->single_program_flow)
1468      insn->header.thread_control = BRW_THREAD_SWITCH;
1469
1470   push_if_stack(p, insn);
1471}
1472
1473void
1474brw_ENDIF(struct brw_compile *p)
1475{
1476   struct brw_context *brw = p->brw;
1477   struct brw_instruction *insn = NULL;
1478   struct brw_instruction *else_inst = NULL;
1479   struct brw_instruction *if_inst = NULL;
1480   struct brw_instruction *tmp;
1481   bool emit_endif = true;
1482
1483   /* In single program flow mode, we can express IF and ELSE instructions
1484    * equivalently as ADD instructions that operate on IP.  On platforms prior
1485    * to Gen6, flow control instructions cause an implied thread switch, so
1486    * this is a significant savings.
1487    *
1488    * However, on Gen6, writing to IP doesn't work in single program flow mode
1489    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1490    * not be updated by non-flow control instructions.").  And on later
1491    * platforms, there is no significant benefit to converting control flow
1492    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
1493    * Gen5.
1494    */
1495   if (brw->gen < 6 && p->single_program_flow)
1496      emit_endif = false;
1497
1498   /*
1499    * A single next_insn() may change the base address of the instruction store
1500    * memory (p->store), so call it first, before computing a pointer into the
1501    * instruction store from an index.
1502    */
1503   if (emit_endif)
1504      insn = next_insn(p, BRW_OPCODE_ENDIF);
1505
1506   /* Pop the IF and (optional) ELSE instructions from the stack */
1507   p->if_depth_in_loop[p->loop_stack_depth]--;
1508   tmp = pop_if_stack(p);
1509   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
1510      else_inst = tmp;
1511      tmp = pop_if_stack(p);
1512   }
1513   if_inst = tmp;
1514
1515   if (!emit_endif) {
1516      /* ENDIF is useless; don't bother emitting it. */
1517      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1518      return;
1519   }
1520
1521   if (brw->gen < 6) {
1522      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1523      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1524      brw_set_src1(p, insn, brw_imm_d(0x0));
1525   } else if (brw->gen == 6) {
1526      brw_set_dest(p, insn, brw_imm_w(0));
1527      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1528      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1529   } else {
1530      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1531      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1532      brw_set_src1(p, insn, brw_imm_ud(0));
1533   }
1534
1535   insn->header.compression_control = BRW_COMPRESSION_NONE;
1536   insn->header.mask_control = BRW_MASK_ENABLE;
1537   insn->header.thread_control = BRW_THREAD_SWITCH;
1538
1539   /* Also pop item off the stack in the endif instruction: */
1540   if (brw->gen < 6) {
1541      insn->bits3.if_else.jump_count = 0;
1542      insn->bits3.if_else.pop_count = 1;
1543      insn->bits3.if_else.pad0 = 0;
1544   } else if (brw->gen == 6) {
1545      insn->bits1.branch_gen6.jump_count = 2;
1546   } else {
1547      insn->bits3.break_cont.jip = 2;
1548   }
1549   patch_IF_ELSE(p, if_inst, else_inst, insn);
1550}
1551
1552struct brw_instruction *brw_BREAK(struct brw_compile *p)
1553{
1554   struct brw_context *brw = p->brw;
1555   struct brw_instruction *insn;
1556
1557   insn = next_insn(p, BRW_OPCODE_BREAK);
1558   if (brw->gen >= 6) {
1559      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1560      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1561      brw_set_src1(p, insn, brw_imm_d(0x0));
1562   } else {
1563      brw_set_dest(p, insn, brw_ip_reg());
1564      brw_set_src0(p, insn, brw_ip_reg());
1565      brw_set_src1(p, insn, brw_imm_d(0x0));
1566      insn->bits3.if_else.pad0 = 0;
1567      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1568   }
1569   insn->header.compression_control = BRW_COMPRESSION_NONE;
1570   insn->header.execution_size = BRW_EXECUTE_8;
1571
1572   return insn;
1573}
1574
1575struct brw_instruction *gen6_CONT(struct brw_compile *p)
1576{
1577   struct brw_instruction *insn;
1578
1579   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1582   brw_set_dest(p, insn, brw_ip_reg());
1583   brw_set_src0(p, insn, brw_ip_reg());
1584   brw_set_src1(p, insn, brw_imm_d(0x0));
1585
1586   insn->header.compression_control = BRW_COMPRESSION_NONE;
1587   insn->header.execution_size = BRW_EXECUTE_8;
1588   return insn;
1589}
1590
1591struct brw_instruction *brw_CONT(struct brw_compile *p)
1592{
1593   struct brw_instruction *insn;
1594   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1595   brw_set_dest(p, insn, brw_ip_reg());
1596   brw_set_src0(p, insn, brw_ip_reg());
1597   brw_set_src1(p, insn, brw_imm_d(0x0));
1598   insn->header.compression_control = BRW_COMPRESSION_NONE;
1599   insn->header.execution_size = BRW_EXECUTE_8;
1600   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1601   insn->bits3.if_else.pad0 = 0;
1602   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1603   return insn;
1604}
1605
1606struct brw_instruction *gen6_HALT(struct brw_compile *p)
1607{
1608   struct brw_instruction *insn;
1609
1610   insn = next_insn(p, BRW_OPCODE_HALT);
1611   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1612   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1613   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1614
1615   if (p->compressed) {
1616      insn->header.execution_size = BRW_EXECUTE_16;
1617   } else {
1618      insn->header.compression_control = BRW_COMPRESSION_NONE;
1619      insn->header.execution_size = BRW_EXECUTE_8;
1620   }
1621   return insn;
1622}
1623
1624/* DO/WHILE loop:
1625 *
1626 * The DO/WHILE is just an unterminated loop -- break or continue are
1627 * used for control within the loop.  There are a few ways it can be
1628 * done.
1629 *
1630 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1631 * jip and no DO instruction.
1632 *
1633 * For non-uniform control flow pre-gen6, there's a DO instruction to
1634 * push the mask, and a WHILE to jump back, and BREAK to get out and
1635 * pop the mask.
1636 *
1637 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1638 * just points back to the first instruction of the loop.
1639 */
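/* A rough usage sketch (illustrative only; 'p' is an already set up
 * brw_compile and the loop exit condition is elided):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body ...
 *       brw_BREAK(p);         (typically predicated on the exit condition)
 *    brw_WHILE(p);
 *
 * On gen6+ the BREAK's JIP/UIP are left as zero at emit time and are patched
 * afterwards by brw_set_uip_jip().
 */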
1640struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
1641{
1642   struct brw_context *brw = p->brw;
1643
1644   if (brw->gen >= 6 || p->single_program_flow) {
1645      push_loop_stack(p, &p->store[p->nr_insn]);
1646      return &p->store[p->nr_insn];
1647   } else {
1648      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1649
1650      push_loop_stack(p, insn);
1651
1652      /* Override the defaults for this instruction:
1653       */
1654      brw_set_dest(p, insn, brw_null_reg());
1655      brw_set_src0(p, insn, brw_null_reg());
1656      brw_set_src1(p, insn, brw_null_reg());
1657
1658      insn->header.compression_control = BRW_COMPRESSION_NONE;
1659      insn->header.execution_size = execute_size;
1660      insn->header.predicate_control = BRW_PREDICATE_NONE;
1661      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1662      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1663
1664      return insn;
1665   }
1666}
1667
1668/**
1669 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1670 * instruction here.
1671 *
1672 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1673 * nesting, since it can always just point to the end of the block/current loop.
1674 */
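/* As a worked example of the arithmetic below: with br == 1 (gen4), a BREAK
 * sitting three instructions before the WHILE gets jump_count = 4 (just past
 * the WHILE), while a CONTINUE in the same spot gets jump_count = 3 (the
 * WHILE itself); on gen5 (br == 2) both values are doubled.
 */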
1675static void
1676brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1677{
1678   struct brw_context *brw = p->brw;
1679   struct brw_instruction *do_inst = get_inner_do_insn(p);
1680   struct brw_instruction *inst;
1681   int br = (brw->gen == 5) ? 2 : 1;
1682
1683   for (inst = while_inst - 1; inst != do_inst; inst--) {
1684      /* If the jump count is != 0, that means that this instruction has already
1685       * been patched because it's part of a loop inside of the one we're
1686       * patching.
1687       */
1688      if (inst->header.opcode == BRW_OPCODE_BREAK &&
1689	  inst->bits3.if_else.jump_count == 0) {
1690	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1691      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1692		 inst->bits3.if_else.jump_count == 0) {
1693	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1694      }
1695   }
1696}
1697
1698struct brw_instruction *brw_WHILE(struct brw_compile *p)
1699{
1700   struct brw_context *brw = p->brw;
1701   struct brw_instruction *insn, *do_insn;
1702   unsigned br = 1;
1703
1704   if (brw->gen >= 5)
1705      br = 2;
1706
1707   if (brw->gen >= 7) {
1708      insn = next_insn(p, BRW_OPCODE_WHILE);
1709      do_insn = get_inner_do_insn(p);
1710
1711      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1712      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1713      brw_set_src1(p, insn, brw_imm_ud(0));
1714      insn->bits3.break_cont.jip = br * (do_insn - insn);
1715
1716      insn->header.execution_size = BRW_EXECUTE_8;
1717   } else if (brw->gen == 6) {
1718      insn = next_insn(p, BRW_OPCODE_WHILE);
1719      do_insn = get_inner_do_insn(p);
1720
1721      brw_set_dest(p, insn, brw_imm_w(0));
1722      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1723      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1724      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1725
1726      insn->header.execution_size = BRW_EXECUTE_8;
1727   } else {
1728      if (p->single_program_flow) {
1729	 insn = next_insn(p, BRW_OPCODE_ADD);
1730         do_insn = get_inner_do_insn(p);
1731
1732	 brw_set_dest(p, insn, brw_ip_reg());
1733	 brw_set_src0(p, insn, brw_ip_reg());
1734	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1735	 insn->header.execution_size = BRW_EXECUTE_1;
1736      } else {
1737	 insn = next_insn(p, BRW_OPCODE_WHILE);
1738         do_insn = get_inner_do_insn(p);
1739
1740	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1741
1742	 brw_set_dest(p, insn, brw_ip_reg());
1743	 brw_set_src0(p, insn, brw_ip_reg());
1744	 brw_set_src1(p, insn, brw_imm_d(0));
1745
1746	 insn->header.execution_size = do_insn->header.execution_size;
1747	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1748	 insn->bits3.if_else.pop_count = 0;
1749	 insn->bits3.if_else.pad0 = 0;
1750
1751	 brw_patch_break_cont(p, insn);
1752      }
1753   }
1754   insn->header.compression_control = BRW_COMPRESSION_NONE;
1755
1756   p->loop_stack_depth--;
1757
1758   return insn;
1759}
1760
1761/* To integrate with the above, it makes sense that the comparison
1762 * instruction should populate the flag register.  It might be simpler
1763 * just to use the flag reg for most WM tasks?
1764 */
1765void brw_CMP(struct brw_compile *p,
1766	     struct brw_reg dest,
1767	     unsigned conditional,
1768	     struct brw_reg src0,
1769	     struct brw_reg src1)
1770{
1771   struct brw_context *brw = p->brw;
1772   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1773
1774   insn->header.destreg__conditionalmod = conditional;
1775   brw_set_dest(p, insn, dest);
1776   brw_set_src0(p, insn, src0);
1777   brw_set_src1(p, insn, src1);
1778
1779   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1780    * page says:
1781    *    "Any CMP instruction with a null destination must use a {switch}."
1782    *
1783    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1784    * mentioned on their work-arounds pages.
1785    */
1786   if (brw->gen == 7) {
1787      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1788          dest.nr == BRW_ARF_NULL) {
1789         insn->header.thread_control = BRW_THREAD_SWITCH;
1790      }
1791   }
1792}
1793
1794/* Issue a 'wait' instruction on notification register n1; the host can
1795   program MMIO to wake up the thread. */
1796void brw_WAIT (struct brw_compile *p)
1797{
1798   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1799   struct brw_reg src = brw_notification_1_reg();
1800
1801   brw_set_dest(p, insn, src);
1802   brw_set_src0(p, insn, src);
1803   brw_set_src1(p, insn, brw_null_reg());
1804   insn->header.execution_size = 0; /* must be BRW_EXECUTE_1 (encoded as 0) */
1805   insn->header.predicate_control = 0;
1806   insn->header.compression_control = 0;
1807}
1808
1809
1810/***********************************************************************
1811 * Helpers for the various SEND message types:
1812 */
1813
1814/** Extended math function, float[8].
1815 */
1816void brw_math( struct brw_compile *p,
1817	       struct brw_reg dest,
1818	       unsigned function,
1819	       unsigned msg_reg_nr,
1820	       struct brw_reg src,
1821	       unsigned data_type,
1822	       unsigned precision )
1823{
1824   struct brw_context *brw = p->brw;
1825
1826   if (brw->gen >= 6) {
1827      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1828
1829      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1830             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1831      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1832
1833      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1834      if (brw->gen == 6)
1835	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1836
1837      /* Source modifiers are ignored for extended math instructions on Gen6. */
1838      if (brw->gen == 6) {
1839	 assert(!src.negate);
1840	 assert(!src.abs);
1841      }
1842
1843      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1844	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1845	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1846	 assert(src.type != BRW_REGISTER_TYPE_F);
1847      } else {
1848	 assert(src.type == BRW_REGISTER_TYPE_F);
1849      }
1850
1851      /* Math is the same ISA format as other opcodes, except that CondModifier
1852       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1853       */
1854      insn->header.destreg__conditionalmod = function;
1855
1856      brw_set_dest(p, insn, dest);
1857      brw_set_src0(p, insn, src);
1858      brw_set_src1(p, insn, brw_null_reg());
1859   } else {
1860      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1861
1862      /* Example code doesn't set predicate_control for send
1863       * instructions.
1864       */
1865      insn->header.predicate_control = 0;
1866      insn->header.destreg__conditionalmod = msg_reg_nr;
1867
1868      brw_set_dest(p, insn, dest);
1869      brw_set_src0(p, insn, src);
1870      brw_set_math_message(p,
1871			   insn,
1872			   function,
1873			   src.type == BRW_REGISTER_TYPE_D,
1874			   precision,
1875			   data_type);
1876   }
1877}
1878
1879/** Extended math function, float[8].
1880 */
1881void brw_math2(struct brw_compile *p,
1882	       struct brw_reg dest,
1883	       unsigned function,
1884	       struct brw_reg src0,
1885	       struct brw_reg src1)
1886{
1887   struct brw_context *brw = p->brw;
1888   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1889
1890   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1891          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1892   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1893   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1894
1895   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1896   if (brw->gen == 6) {
1897      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1898      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1899   }
1900
1901   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1902       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1903       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1904      assert(src0.type != BRW_REGISTER_TYPE_F);
1905      assert(src1.type != BRW_REGISTER_TYPE_F);
1906   } else {
1907      assert(src0.type == BRW_REGISTER_TYPE_F);
1908      assert(src1.type == BRW_REGISTER_TYPE_F);
1909   }
1910
1911   /* Source modifiers are ignored for extended math instructions on Gen6. */
1912   if (brw->gen == 6) {
1913      assert(!src0.negate);
1914      assert(!src0.abs);
1915      assert(!src1.negate);
1916      assert(!src1.abs);
1917   }
1918
1919   /* Math is the same ISA format as other opcodes, except that CondModifier
1920    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1921    */
1922   insn->header.destreg__conditionalmod = function;
1923
1924   brw_set_dest(p, insn, dest);
1925   brw_set_src0(p, insn, src0);
1926   brw_set_src1(p, insn, src1);
1927}
1928
1929
1930/**
1931 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1932 * using a constant offset per channel.
1933 *
1934 * The offset must be aligned to oword size (16 bytes).  Used for
1935 * register spilling.
1936 */
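/* A hypothetical single-GRF spill (base_mrf, spill_grf and spill_offset are
 * placeholder names, not actual driver identifiers): the header built below
 * occupies m(base_mrf), so the data goes in m(base_mrf + 1):
 *
 *    brw_MOV(p, retype(brw_message_reg(base_mrf + 1), BRW_REGISTER_TYPE_UD),
 *            retype(spill_grf, BRW_REGISTER_TYPE_UD));
 *    brw_oword_block_write_scratch(p, brw_message_reg(base_mrf), 1,
 *                                  spill_offset);
 */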
1937void brw_oword_block_write_scratch(struct brw_compile *p,
1938				   struct brw_reg mrf,
1939				   int num_regs,
1940				   unsigned offset)
1941{
1942   struct brw_context *brw = p->brw;
1943   uint32_t msg_control, msg_type;
1944   int mlen;
1945
1946   if (brw->gen >= 6)
1947      offset /= 16;
1948
1949   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1950
1951   if (num_regs == 1) {
1952      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1953      mlen = 2;
1954   } else {
1955      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1956      mlen = 3;
1957   }
1958
1959   /* Set up the message header.  This is g0, with g0.2 filled with
1960    * the offset.  We don't want to leave our offset around in g0 or
1961    * it'll screw up texture samples, so set it up inside the message
1962    * reg.
1963    */
1964   {
1965      brw_push_insn_state(p);
1966      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1967      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1968
1969      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1970
1971      /* set message header global offset field (reg 0, element 2) */
1972      brw_MOV(p,
1973	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1974				  mrf.nr,
1975				  2), BRW_REGISTER_TYPE_UD),
1976	      brw_imm_ud(offset));
1977
1978      brw_pop_insn_state(p);
1979   }
1980
1981   {
1982      struct brw_reg dest;
1983      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1984      int send_commit_msg;
1985      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1986					 BRW_REGISTER_TYPE_UW);
1987
1988      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1989	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1990	 src_header = vec16(src_header);
1991      }
1992      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1993      insn->header.destreg__conditionalmod = mrf.nr;
1994
1995      /* Until gen6, writes followed by reads from the same location
1996       * are not guaranteed to be ordered unless write_commit is set.
1997       * If set, then a no-op write is issued to the destination
1998       * register to set a dependency, and a read from the destination
1999       * can be used to ensure the ordering.
2000       *
2001       * For gen6, only writes between different threads need ordering
2002       * protection.  Our use of DP writes is all about register
2003       * spilling within a thread.
2004       */
2005      if (brw->gen >= 6) {
2006	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2007	 send_commit_msg = 0;
2008      } else {
2009	 dest = src_header;
2010	 send_commit_msg = 1;
2011      }
2012
2013      brw_set_dest(p, insn, dest);
2014      if (brw->gen >= 6) {
2015	 brw_set_src0(p, insn, mrf);
2016      } else {
2017	 brw_set_src0(p, insn, brw_null_reg());
2018      }
2019
2020      if (brw->gen >= 6)
2021	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2022      else
2023	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2024
2025      brw_set_dp_write_message(p,
2026			       insn,
2027			       255, /* binding table index (255=stateless) */
2028			       msg_control,
2029			       msg_type,
2030			       mlen,
2031			       true, /* header_present */
2032			       0, /* not a render target */
2033			       send_commit_msg, /* response_length */
2034			       0, /* eot */
2035			       send_commit_msg);
2036   }
2037}
2038
2039
2040/**
2041 * Read a block of owords (half a GRF each) from the scratch buffer
2042 * using a constant index per channel.
2043 *
2044 * Offset must be aligned to oword size (16 bytes).  Used for register
2045 * spilling.
2046 */
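/* The matching unspill for the hypothetical spill sketched above
 * brw_oword_block_write_scratch() would be roughly:
 *
 *    brw_oword_block_read_scratch(p, fill_grf, brw_message_reg(base_mrf),
 *                                 1, spill_offset);
 *
 * with fill_grf, base_mrf and spill_offset again being placeholder names.
 */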
2047void
2048brw_oword_block_read_scratch(struct brw_compile *p,
2049			     struct brw_reg dest,
2050			     struct brw_reg mrf,
2051			     int num_regs,
2052			     unsigned offset)
2053{
2054   struct brw_context *brw = p->brw;
2055   uint32_t msg_control;
2056   int rlen;
2057
2058   if (brw->gen >= 6)
2059      offset /= 16;
2060
2061   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2062   dest = retype(dest, BRW_REGISTER_TYPE_UW);
2063
2064   if (num_regs == 1) {
2065      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2066      rlen = 1;
2067   } else {
2068      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2069      rlen = 2;
2070   }
2071
2072   {
2073      brw_push_insn_state(p);
2074      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2075      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2076
2077      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2078
2079      /* set message header global offset field (reg 0, element 2) */
2080      brw_MOV(p,
2081	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2082				  mrf.nr,
2083				  2), BRW_REGISTER_TYPE_UD),
2084	      brw_imm_ud(offset));
2085
2086      brw_pop_insn_state(p);
2087   }
2088
2089   {
2090      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2091
2092      assert(insn->header.predicate_control == 0);
2093      insn->header.compression_control = BRW_COMPRESSION_NONE;
2094      insn->header.destreg__conditionalmod = mrf.nr;
2095
2096      brw_set_dest(p, insn, dest);	/* UW? */
2097      if (brw->gen >= 6) {
2098	 brw_set_src0(p, insn, mrf);
2099      } else {
2100	 brw_set_src0(p, insn, brw_null_reg());
2101      }
2102
2103      brw_set_dp_read_message(p,
2104			      insn,
2105			      255, /* binding table index (255=stateless) */
2106			      msg_control,
2107			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2108			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2109			      1, /* msg_length */
2110                              true, /* header_present */
2111			      rlen);
2112   }
2113}
2114
2115void
2116gen7_block_read_scratch(struct brw_compile *p,
2117                        struct brw_reg dest,
2118                        int num_regs,
2119                        unsigned offset)
2120{
2121   dest = retype(dest, BRW_REGISTER_TYPE_UW);
2122
2123   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2124
2125   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
2126   insn->header.compression_control = BRW_COMPRESSION_NONE;
2127
2128   brw_set_dest(p, insn, dest);
2129
2130   /* The HW requires that the header is present; this is to get the g0.5
2131    * scratch offset.
2132    */
2133   bool header_present = true;
2134   brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2135
2136   brw_set_message_descriptor(p, insn,
2137                              GEN7_SFID_DATAPORT_DATA_CACHE,
2138                              1, /* mlen: just g0 */
2139                              num_regs,
2140                              header_present,
2141                              false);
2142
2143   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;
2144
2145   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
2146   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;
2147
2148   /* According to the docs, offset is "A 12-bit HWord offset into the memory
2149    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2150    * is 32 bytes, which happens to be the size of a register.
2151    */
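   /* For example, a byte offset of 64 (two registers into the scratch space)
    * becomes an HWord offset of 2 here.
    */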
2152   offset /= REG_SIZE;
2153   assert(offset < (1 << 12));
2154   insn->bits3.ud |= offset;
2155}
2156
2157/**
2158 * Read a float[4] vector from the data port Data Cache (const buffer).
2159 * Location (in buffer) should be a multiple of 16.
2160 * Used for fetching shader constants.
2161 */
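/* A minimal, hypothetical pull-constant fetch (dst_vec4, base_mrf, const_idx
 * and surf_index are illustrative names only):
 *
 *    brw_oword_block_read(p, dst_vec4, brw_message_reg(base_mrf),
 *                         const_idx * 16, surf_index);
 */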
2162void brw_oword_block_read(struct brw_compile *p,
2163			  struct brw_reg dest,
2164			  struct brw_reg mrf,
2165			  uint32_t offset,
2166			  uint32_t bind_table_index)
2167{
2168   struct brw_context *brw = p->brw;
2169
2170   /* On newer hardware, offset is in units of owords. */
2171   if (brw->gen >= 6)
2172      offset /= 16;
2173
2174   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2175
2176   brw_push_insn_state(p);
2177   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2178   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2179   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2180
2181   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2182
2183   /* set message header global offset field (reg 0, element 2) */
2184   brw_MOV(p,
2185	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2186			       mrf.nr,
2187			       2), BRW_REGISTER_TYPE_UD),
2188	   brw_imm_ud(offset));
2189
2190   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2191   insn->header.destreg__conditionalmod = mrf.nr;
2192
2193   /* cast dest to a uword[8] vector */
2194   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2195
2196   brw_set_dest(p, insn, dest);
2197   if (brw->gen >= 6) {
2198      brw_set_src0(p, insn, mrf);
2199   } else {
2200      brw_set_src0(p, insn, brw_null_reg());
2201   }
2202
2203   brw_set_dp_read_message(p,
2204			   insn,
2205			   bind_table_index,
2206			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2207			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2208			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2209			   1, /* msg_length */
2210                           true, /* header_present */
2211			   1); /* response_length (1 reg, 2 owords!) */
2212
2213   brw_pop_insn_state(p);
2214}
2215
2216
2217void brw_fb_WRITE(struct brw_compile *p,
2218		  int dispatch_width,
2219                  unsigned msg_reg_nr,
2220                  struct brw_reg src0,
2221                  unsigned msg_control,
2222                  unsigned binding_table_index,
2223                  unsigned msg_length,
2224                  unsigned response_length,
2225                  bool eot,
2226                  bool header_present)
2227{
2228   struct brw_context *brw = p->brw;
2229   struct brw_instruction *insn;
2230   unsigned msg_type;
2231   struct brw_reg dest;
2232
2233   if (dispatch_width == 16)
2234      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2235   else
2236      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2237
2238   if (brw->gen >= 6) {
2239      insn = next_insn(p, BRW_OPCODE_SENDC);
2240   } else {
2241      insn = next_insn(p, BRW_OPCODE_SEND);
2242   }
2243   insn->header.compression_control = BRW_COMPRESSION_NONE;
2244
2245   if (brw->gen >= 6) {
2246      /* headerless version, just submit color payload */
2247      src0 = brw_message_reg(msg_reg_nr);
2248
2249      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2250   } else {
2251      insn->header.destreg__conditionalmod = msg_reg_nr;
2252
2253      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2254   }
2255
2256   brw_set_dest(p, insn, dest);
2257   brw_set_src0(p, insn, src0);
2258   brw_set_dp_write_message(p,
2259			    insn,
2260			    binding_table_index,
2261			    msg_control,
2262			    msg_type,
2263			    msg_length,
2264			    header_present,
2265			    eot, /* last render target write */
2266			    response_length,
2267			    eot,
2268			    0 /* send_commit_msg */);
2269}
2270
2271
2272/**
2273 * Texture sample instruction.
2274 * Note: the msg_type plus msg_length values determine exactly what kind
2275 * of sampling operation is performed.  See volume 4, page 161 of docs.
2276 */
2277void brw_SAMPLE(struct brw_compile *p,
2278		struct brw_reg dest,
2279		unsigned msg_reg_nr,
2280		struct brw_reg src0,
2281		unsigned binding_table_index,
2282		unsigned sampler,
2283		unsigned msg_type,
2284		unsigned response_length,
2285		unsigned msg_length,
2286		unsigned header_present,
2287		unsigned simd_mode,
2288		unsigned return_format)
2289{
2290   struct brw_context *brw = p->brw;
2291   struct brw_instruction *insn;
2292
2293   if (msg_reg_nr != -1)
2294      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2295
2296   insn = next_insn(p, BRW_OPCODE_SEND);
2297   insn->header.predicate_control = 0; /* XXX */
2298
2299   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2300    *
2301    *    "Instruction compression is not allowed for this instruction (that
2302    *     is, send). The hardware behavior is undefined if this instruction is
2303    *     set as compressed. However, compress control can be set to "SecHalf"
2304    *     to affect the EMask generation."
2305    *
2306    * No similar wording is found in later PRMs, but there are examples
2307    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2308    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2309    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2310    */
2311   if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
2312      insn->header.compression_control = BRW_COMPRESSION_NONE;
2313
2314   if (brw->gen < 6)
2315      insn->header.destreg__conditionalmod = msg_reg_nr;
2316
2317   brw_set_dest(p, insn, dest);
2318   brw_set_src0(p, insn, src0);
2319   brw_set_sampler_message(p, insn,
2320                           binding_table_index,
2321                           sampler,
2322                           msg_type,
2323                           response_length,
2324                           msg_length,
2325                           header_present,
2326                           simd_mode,
2327                           return_format);
2328}
2329
2330/* All these variables are pretty confusing - we might be better off
2331 * using bitmasks and macros for this, in the old style.  Or perhaps
2332 * just having the caller instantiate the fields in dword3 itself.
2333 */
2334void brw_urb_WRITE(struct brw_compile *p,
2335		   struct brw_reg dest,
2336		   unsigned msg_reg_nr,
2337		   struct brw_reg src0,
2338                   enum brw_urb_write_flags flags,
2339		   unsigned msg_length,
2340		   unsigned response_length,
2341		   unsigned offset,
2342		   unsigned swizzle)
2343{
2344   struct brw_context *brw = p->brw;
2345   struct brw_instruction *insn;
2346
2347   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2348
2349   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2350      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2351      brw_push_insn_state(p);
2352      brw_set_default_access_mode(p, BRW_ALIGN_1);
2353      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2354      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2355		       BRW_REGISTER_TYPE_UD),
2356	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2357		brw_imm_ud(0xff00));
2358      brw_pop_insn_state(p);
2359   }
2360
2361   insn = next_insn(p, BRW_OPCODE_SEND);
2362
2363   assert(msg_length < BRW_MAX_MRF);
2364
2365   brw_set_dest(p, insn, dest);
2366   brw_set_src0(p, insn, src0);
2367   brw_set_src1(p, insn, brw_imm_d(0));
2368
2369   if (brw->gen < 6)
2370      insn->header.destreg__conditionalmod = msg_reg_nr;
2371
2372   brw_set_urb_message(p,
2373		       insn,
2374		       flags,
2375		       msg_length,
2376		       response_length,
2377		       offset,
2378		       swizzle);
2379}
2380
2381static int
2382brw_find_next_block_end(struct brw_compile *p, int start_offset)
2383{
2384   int offset;
2385   void *store = p->store;
2386
2387   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2388        offset = next_offset(store, offset)) {
2389      struct brw_instruction *insn = store + offset;
2390
2391      switch (insn->header.opcode) {
2392      case BRW_OPCODE_ENDIF:
2393      case BRW_OPCODE_ELSE:
2394      case BRW_OPCODE_WHILE:
2395      case BRW_OPCODE_HALT:
2396	 return offset;
2397      }
2398   }
2399
2400   return 0;
2401}
2402
2403/* There is no DO instruction on gen6, so to find the end of the loop
2404 * we have to see if the loop is jumping back before our start
2405 * instruction.
2406 */
2407static int
2408brw_find_loop_end(struct brw_compile *p, int start_offset)
2409{
2410   struct brw_context *brw = p->brw;
2411   int offset;
2412   int scale = 8;
2413   void *store = p->store;
2414
2415   /* Always start after the instruction (such as a WHILE) we're trying to fix
2416    * up.
2417    */
2418   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2419        offset = next_offset(store, offset)) {
2420      struct brw_instruction *insn = store + offset;
2421
2422      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2423	 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2424				   : insn->bits3.break_cont.jip;
2425	 if (offset + jip * scale <= start_offset)
2426	    return offset;
2427      }
2428   }
2429   assert(!"not reached");
2430   return start_offset;
2431}
2432
2433/* After program generation, go back and update the UIP and JIP of
2434 * BREAK, CONT, and HALT instructions to their correct locations.
2435 */
2436void
2437brw_set_uip_jip(struct brw_compile *p)
2438{
2439   struct brw_context *brw = p->brw;
2440   int offset;
2441   int scale = 8;
2442   void *store = p->store;
2443
2444   if (brw->gen < 6)
2445      return;
2446
2447   for (offset = 0; offset < p->next_insn_offset;
2448        offset = next_offset(store, offset)) {
2449      struct brw_instruction *insn = store + offset;
2450
2451      if (insn->header.cmpt_control) {
2452	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2453	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2454		insn->header.opcode != BRW_OPCODE_CONTINUE &&
2455		insn->header.opcode != BRW_OPCODE_HALT);
2456	 continue;
2457      }
2458
2459      int block_end_offset = brw_find_next_block_end(p, offset);
2460      switch (insn->header.opcode) {
2461      case BRW_OPCODE_BREAK:
2462         assert(block_end_offset != 0);
2463	 insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
2464	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2465	 insn->bits3.break_cont.uip =
2466	    (brw_find_loop_end(p, offset) - offset +
2467             (brw->gen == 6 ? 16 : 0)) / scale;
2468	 break;
2469      case BRW_OPCODE_CONTINUE:
2470         assert(block_end_offset != 0);
2471	 insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
2472	 insn->bits3.break_cont.uip =
2473            (brw_find_loop_end(p, offset) - offset) / scale;
2474
2475	 assert(insn->bits3.break_cont.uip != 0);
2476	 assert(insn->bits3.break_cont.jip != 0);
2477	 break;
2478
2479      case BRW_OPCODE_ENDIF:
2480         if (block_end_offset == 0)
2481            insn->bits3.break_cont.jip = 2;
2482         else
2483            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
2484	 break;
2485
2486      case BRW_OPCODE_HALT:
2487	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2488	  *
2489	  *    "In case of the halt instruction not inside any conditional
2490	  *     code block, the value of <JIP> and <UIP> should be the
2491	  *     same. In case of the halt instruction inside conditional code
2492	  *     block, the <UIP> should be the end of the program, and the
2493	  *     <JIP> should be end of the most inner conditional code block."
2494	  *
2495	  * The uip will have already been set by whoever set up the
2496	  * instruction.
2497	  */
2498	 if (block_end_offset == 0) {
2499	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2500	 } else {
2501	    insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
2502	 }
2503	 assert(insn->bits3.break_cont.uip != 0);
2504	 assert(insn->bits3.break_cont.jip != 0);
2505	 break;
2506      }
2507   }
2508}
2509
2510void brw_ff_sync(struct brw_compile *p,
2511		   struct brw_reg dest,
2512		   unsigned msg_reg_nr,
2513		   struct brw_reg src0,
2514		   bool allocate,
2515		   unsigned response_length,
2516		   bool eot)
2517{
2518   struct brw_context *brw = p->brw;
2519   struct brw_instruction *insn;
2520
2521   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2522
2523   insn = next_insn(p, BRW_OPCODE_SEND);
2524   brw_set_dest(p, insn, dest);
2525   brw_set_src0(p, insn, src0);
2526   brw_set_src1(p, insn, brw_imm_d(0));
2527
2528   if (brw->gen < 6)
2529      insn->header.destreg__conditionalmod = msg_reg_nr;
2530
2531   brw_set_ff_sync_message(p,
2532			   insn,
2533			   allocate,
2534			   response_length,
2535			   eot);
2536}
2537
2538/**
2539 * Emit the SEND instruction necessary to generate stream output data on Gen6
2540 * (for transform feedback).
2541 *
2542 * If send_commit_msg is true, this is the last piece of stream output data
2543 * from this thread, so send the data as a committed write.  According to the
2544 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2545 *
2546 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2547 *   writes are complete by sending the final write as a committed write."
2548 */
2549void
2550brw_svb_write(struct brw_compile *p,
2551              struct brw_reg dest,
2552              unsigned msg_reg_nr,
2553              struct brw_reg src0,
2554              unsigned binding_table_index,
2555              bool   send_commit_msg)
2556{
2557   struct brw_instruction *insn;
2558
2559   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2560
2561   insn = next_insn(p, BRW_OPCODE_SEND);
2562   brw_set_dest(p, insn, dest);
2563   brw_set_src0(p, insn, src0);
2564   brw_set_src1(p, insn, brw_imm_d(0));
2565   brw_set_dp_write_message(p, insn,
2566                            binding_table_index,
2567                            0, /* msg_control: ignored */
2568                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2569                            1, /* msg_length */
2570                            true, /* header_present */
2571                            0, /* last_render_target: ignored */
2572                            send_commit_msg, /* response_length */
2573                            0, /* end_of_thread */
2574                            send_commit_msg); /* send_commit_msg */
2575}
2576
2577static void
2578brw_set_dp_untyped_atomic_message(struct brw_compile *p,
2579                                  struct brw_instruction *insn,
2580                                  unsigned atomic_op,
2581                                  unsigned bind_table_index,
2582                                  unsigned msg_length,
2583                                  unsigned response_length,
2584                                  bool header_present)
2585{
2586   if (p->brw->is_haswell) {
2587      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2588                                 msg_length, response_length,
2589                                 header_present, false);
2590
2592      if (insn->header.access_mode == BRW_ALIGN_1) {
2593         if (insn->header.execution_size != BRW_EXECUTE_16)
2594            insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
2595
2596         insn->bits3.gen7_dp.msg_type =
2597            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2598      } else {
2599         insn->bits3.gen7_dp.msg_type =
2600            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
2601      }
2602   } else {
2603      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2604                                 msg_length, response_length,
2605                                 header_present, false);
2606
2607      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2608
2609      if (insn->header.execution_size != BRW_EXECUTE_16)
2610         insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
2611   }
2612
2613   if (response_length)
2614      insn->bits3.ud |= 1 << 13; /* Return data expected */
2615
2616   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
2617   insn->bits3.ud |= atomic_op << 8;
2618}
2619
2620void
2621brw_untyped_atomic(struct brw_compile *p,
2622                   struct brw_reg dest,
2623                   struct brw_reg mrf,
2624                   unsigned atomic_op,
2625                   unsigned bind_table_index,
2626                   unsigned msg_length,
2627                   unsigned response_length) {
2628   struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2629
2630   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2631   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2632   brw_set_src1(p, insn, brw_imm_d(0));
2633   brw_set_dp_untyped_atomic_message(
2634      p, insn, atomic_op, bind_table_index, msg_length, response_length,
2635      insn->header.access_mode == BRW_ALIGN_1);
2636}
2637
2638static void
2639brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
2640                                        struct brw_instruction *insn,
2641                                        unsigned bind_table_index,
2642                                        unsigned msg_length,
2643                                        unsigned response_length,
2644                                        bool header_present)
2645{
2646   const unsigned dispatch_width =
2647      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
2648   const unsigned num_channels = response_length / (dispatch_width / 8);
2649
2650   if (p->brw->is_haswell) {
2651      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2652                                 msg_length, response_length,
2653                                 header_present, false);
2654
2655      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
2656   } else {
2657      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2658                                 msg_length, response_length,
2659                                 header_present, false);
2660
2661      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
2662   }
2663
2664   if (insn->header.access_mode == BRW_ALIGN_1) {
2665      if (dispatch_width == 16)
2666         insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
2667      else
2668         insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
2669   }
2670
2671   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
2672
2673   /* Set mask of 32-bit channels to drop. */
2674   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
2675}
2676
2677void
2678brw_untyped_surface_read(struct brw_compile *p,
2679                         struct brw_reg dest,
2680                         struct brw_reg mrf,
2681                         unsigned bind_table_index,
2682                         unsigned msg_length,
2683                         unsigned response_length)
2684{
2685   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2686
2687   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2688   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2689   brw_set_dp_untyped_surface_read_message(
2690      p, insn, bind_table_index, msg_length, response_length,
2691      insn->header.access_mode == BRW_ALIGN_1);
2692}
2693
2694/**
2695 * This instruction is generated as a single-channel align1 instruction by
2696 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2697 *
2698 * We can't use the typed atomic op in the FS because that has the execution
2699 * mask ANDed with the pixel mask, but we just want to write the one dword for
2700 * all the pixels.
2701 *
2702 * We don't use the SIMD4x2 atomic ops in the VS because we just want to write
2703 * one u32.  So we use the same untyped atomic write message as the pixel
2704 * shader.
2705 *
2706 * The untyped atomic operation requires a BUFFER surface type with RAW
2707 * format, and is only accessible through the legacy DATA_CACHE dataport
2708 * messages.
2709 */
2710void brw_shader_time_add(struct brw_compile *p,
2711                         struct brw_reg payload,
2712                         uint32_t surf_index)
2713{
2714   struct brw_context *brw = p->brw;
2715   assert(brw->gen >= 7);
2716
2717   brw_push_insn_state(p);
2718   brw_set_default_access_mode(p, BRW_ALIGN_1);
2719   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2720   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2721   brw_pop_insn_state(p);
2722
2723   /* We use brw_vec1_reg and unmasked because we want to increment the given
2724    * offset only once.
2725    */
2726   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2727                                      BRW_ARF_NULL, 0));
2728   brw_set_src0(p, send, brw_vec1_reg(payload.file,
2729                                      payload.nr, 0));
2730   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
2731                                     2 /* message length */,
2732                                     0 /* response length */,
2733                                     false /* header present */);
2734}
2735