brw_eu_emit.c revision fb977c90d1ef29f47b686c27500005025543cf11
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  unsigned msg_reg_nr)
{
   struct brw_context *brw = p->brw;
   /* Pre-Gen6 hardware still performs the implicit move; nothing to do. */
   if (brw->gen < 6)
      return;

   /* Source is already a message register; no copy needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   /* The null register carries no data, so skip the copy — but still
    * rewrite *src below so the SEND reads from the expected MRF.
    */
   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Emit the move with masking/compression disabled so it copies the
       * full payload regardless of the current execution state; the saved
       * state is restored afterwards.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88    * "The send with EOT should use register space R112-R127 for <src>. This is
89    *  to enable loading of a new thread into the same slot while the message
90    *  with EOT for current thread is pending dispatch."
91    *
92    * Since we're pretending to have 16 MRFs anyway, we may as well use the
93    * registers required for messages with EOT.
94    */
95   struct brw_context *brw = p->brw;
96   if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97      reg->file = BRW_GENERAL_REGISTER_FILE;
98      reg->nr += GEN7_MRF_HACK_START;
99   }
100}
101
102/**
103 * Convert a brw_reg_type enumeration value into the hardware representation.
104 *
105 * The hardware encoding may depend on whether the value is an immediate.
106 */
unsigned
brw_reg_type_to_hw_type(const struct brw_context *brw,
                        enum brw_reg_type type, unsigned file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      /* Immediate encodings: the packed-vector types (UV/VF/V) exist only
       * as immediates, while the byte types (UB/B) have no immediate form
       * at all — marked -1 and rejected by the assert below.
       */
      const static int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      /* DF/HF/Q/UQ immediates only exist on Gen8+. */
      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers: byte types are valid here, while the
       * immediate-only packed-vector types are not (marked -1).
       */
      const static int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      /* DF registers require Gen7+; HF registers require Gen8+. */
      assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}
157
/**
 * Encode destination register \p dest into \p insn.
 *
 * Handles direct vs. indirect addressing and align1 vs. align16 access
 * modes, then derives the instruction's execution size from the
 * destination width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type =
      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* A horizontal stride of 0 is invalid for a destination; silently
	  * promote it to 1.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.dw1.bits.writemask != 0);
         }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Promote stride 0 to 1, as in the direct-addressing case above. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
219
220extern int reg_type_size[];
221
222static void
223validate_reg(struct brw_instruction *insn, struct brw_reg reg)
224{
225   int hstride_for_reg[] = {0, 1, 2, 4};
226   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
227   int width_for_reg[] = {1, 2, 4, 8, 16};
228   int execsize_for_reg[] = {1, 2, 4, 8, 16};
229   int width, hstride, vstride, execsize;
230
231   if (reg.file == BRW_IMMEDIATE_VALUE) {
232      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
233       * mean the destination has to be 128-bit aligned and the
234       * destination horiz stride has to be a word.
235       */
236      if (reg.type == BRW_REGISTER_TYPE_V) {
237	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
238		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
239      }
240
241      return;
242   }
243
244   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
245       reg.file == BRW_ARF_NULL)
246      return;
247
248   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
249   hstride = hstride_for_reg[reg.hstride];
250
251   if (reg.vstride == 0xf) {
252      vstride = -1;
253   } else {
254      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
255      vstride = vstride_for_reg[reg.vstride];
256   }
257
258   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
259   width = width_for_reg[reg.width];
260
261   assert(insn->header.execution_size >= 0 &&
262	  insn->header.execution_size < Elements(execsize_for_reg));
263   execsize = execsize_for_reg[insn->header.execution_size];
264
265   /* Restrictions from 3.3.10: Register Region Restrictions. */
266   /* 3. */
267   assert(execsize >= width);
268
269   /* 4. */
270   if (execsize == width && hstride != 0) {
271      assert(vstride == -1 || vstride == width * hstride);
272   }
273
274   /* 5. */
275   if (execsize == width && hstride == 0) {
276      /* no restriction on vstride. */
277   }
278
279   /* 6. */
280   if (width == 1) {
281      assert(hstride == 0);
282   }
283
284   /* 7. */
285   if (execsize == 1 && width == 1) {
286      assert(hstride == 0);
287      assert(vstride == 0);
288   }
289
290   /* 8. */
291   if (vstride == 0 && hstride == 0) {
292      assert(width == 1);
293   }
294
295   /* 10. Check destination issues. */
296}
297
298void
299brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
300	     struct brw_reg reg)
301{
302   struct brw_context *brw = p->brw;
303
304   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
305      assert(reg.nr < 128);
306
307   gen7_convert_mrf_to_grf(p, &reg);
308
309   if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
310                           insn->header.opcode == BRW_OPCODE_SENDC)) {
311      /* Any source modifiers or regions will be ignored, since this just
312       * identifies the MRF/GRF to start reading the message contents from.
313       * Check for some likely failures.
314       */
315      assert(!reg.negate);
316      assert(!reg.abs);
317      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
318   }
319
320   validate_reg(insn, reg);
321
322   insn->bits1.da1.src0_reg_file = reg.file;
323   insn->bits1.da1.src0_reg_type =
324      brw_reg_type_to_hw_type(brw, reg.type, reg.file);
325   insn->bits2.da1.src0_abs = reg.abs;
326   insn->bits2.da1.src0_negate = reg.negate;
327   insn->bits2.da1.src0_address_mode = reg.address_mode;
328
329   if (reg.file == BRW_IMMEDIATE_VALUE) {
330      insn->bits3.ud = reg.dw1.ud;
331
332      /* The Bspec's section titled "Non-present Operands" claims that if src0
333       * is an immediate that src1's type must be the same as that of src0.
334       *
335       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
336       * that do not follow this rule. E.g., from the IVB/HSW table:
337       *
338       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
339       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
340       *
341       * And from the SNB table:
342       *
343       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
344       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
345       *
346       * Neither of these cause warnings from the simulator when used,
347       * compacted or otherwise. In fact, all compaction mappings that have an
348       * immediate in src0 use a:ud for src1.
349       *
350       * The GM45 instruction compaction tables do not contain mapped meanings
351       * so it's not clear whether it has the restriction. We'll assume it was
352       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
353       */
354      insn->bits1.da1.src1_reg_file = 0; /* arf */
355      if (brw->gen < 6) {
356         insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
357      } else {
358         insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD;
359      }
360   }
361   else
362   {
363      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
364	 if (insn->header.access_mode == BRW_ALIGN_1) {
365	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
366	    insn->bits2.da1.src0_reg_nr = reg.nr;
367	 }
368	 else {
369	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
370	    insn->bits2.da16.src0_reg_nr = reg.nr;
371	 }
372      }
373      else {
374	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
375
376	 if (insn->header.access_mode == BRW_ALIGN_1) {
377	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
378	 }
379	 else {
380	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
381	 }
382      }
383
384      if (insn->header.access_mode == BRW_ALIGN_1) {
385	 if (reg.width == BRW_WIDTH_1 &&
386	     insn->header.execution_size == BRW_EXECUTE_1) {
387	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
388	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
389	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
390	 }
391	 else {
392	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
393	    insn->bits2.da1.src0_width = reg.width;
394	    insn->bits2.da1.src0_vert_stride = reg.vstride;
395	 }
396      }
397      else {
398	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
399	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
400	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
401	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
402
403	 /* This is an oddity of the fact we're using the same
404	  * descriptions for registers in align_16 as align_1:
405	  */
406	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
407	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
408	 else
409	    insn->bits2.da16.src0_vert_stride = reg.vstride;
410      }
411   }
412}
413
414
415void
416brw_set_src1(struct brw_compile *p,
417             struct brw_instruction *insn,
418             struct brw_reg reg)
419{
420   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
421
422   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
423      assert(reg.nr < 128);
424
425   gen7_convert_mrf_to_grf(p, &reg);
426
427   validate_reg(insn, reg);
428
429   insn->bits1.da1.src1_reg_file = reg.file;
430   insn->bits1.da1.src1_reg_type =
431      brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
432   insn->bits3.da1.src1_abs = reg.abs;
433   insn->bits3.da1.src1_negate = reg.negate;
434
435   /* Only src1 can be immediate in two-argument instructions.
436    */
437   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
438
439   if (reg.file == BRW_IMMEDIATE_VALUE) {
440      insn->bits3.ud = reg.dw1.ud;
441   }
442   else {
443      /* This is a hardware restriction, which may or may not be lifted
444       * in the future:
445       */
446      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
447      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
448
449      if (insn->header.access_mode == BRW_ALIGN_1) {
450	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
451	 insn->bits3.da1.src1_reg_nr = reg.nr;
452      }
453      else {
454	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
455	 insn->bits3.da16.src1_reg_nr = reg.nr;
456      }
457
458      if (insn->header.access_mode == BRW_ALIGN_1) {
459	 if (reg.width == BRW_WIDTH_1 &&
460	     insn->header.execution_size == BRW_EXECUTE_1) {
461	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
462	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
463	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
464	 }
465	 else {
466	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
467	    insn->bits3.da1.src1_width = reg.width;
468	    insn->bits3.da1.src1_vert_stride = reg.vstride;
469	 }
470      }
471      else {
472	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
473	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
474	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
475	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
476
477	 /* This is an oddity of the fact we're using the same
478	  * descriptions for registers in align_16 as align_1:
479	  */
480	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
481	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
482	 else
483	    insn->bits3.da16.src1_vert_stride = reg.vstride;
484      }
485   }
486}
487
488/**
489 * Set the Message Descriptor and Extended Message Descriptor fields
490 * for SEND messages.
491 *
492 * \note This zeroes out the Function Control bits, so it must be called
493 *       \b before filling out any message-specific data.  Callers can
494 *       choose not to fill in irrelevant bits; they will be zero.
495 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct brw_context *brw = p->brw;

   /* Zero out all descriptor bits (including Function Control) before
    * filling in the generation-specific layout below.
    */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (brw->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (brw->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Gen4 layout: no header_present bit; SFID lives in the descriptor. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
530
/**
 * Fill in the message descriptor for a message to the (pre-Gen6) math
 * extended function unit.
 *
 * Message and response lengths are inferred from the math function:
 * two-operand functions (POW and the integer divides) send two registers;
 * SINCOS and quotient-and-remainder return two.  The instruction's
 * saturate bit is moved into the message and cleared in the header, since
 * saturation is applied by the shared function rather than the EU.
 */
static void brw_set_math_message( struct brw_compile *p,
				  struct brw_instruction *insn,
				  unsigned function,
				  unsigned integer_type,
				  bool low_precision,
				  unsigned dataType )
{
   struct brw_context *brw = p->brw;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }


   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
			      msg_length, response_length, false, false);
   if (brw->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = insn->header.saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = insn->header.saturate;
      insn->bits3.math.data_type = dataType;
   }
   /* Saturation is now carried in the message; clear the EU-side bit. */
   insn->header.saturate = 0;
}
585
586
587static void brw_set_ff_sync_message(struct brw_compile *p,
588				    struct brw_instruction *insn,
589				    bool allocate,
590				    unsigned response_length,
591				    bool end_of_thread)
592{
593   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
594			      1, response_length, true, end_of_thread);
595   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
596   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
597   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
598   insn->bits3.urb_gen5.allocate = allocate;
599   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
600   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
601}
602
/**
 * Fill in the message descriptor for a URB write.
 *
 * The descriptor layout differs per generation (Gen7, Gen5/6, Gen4);
 * \p flags selects EOT, allocate, per-slot offset, complete, etc.
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
                                 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);
   if (brw->gen == 7) {
      if (flags & BRW_URB_WRITE_OWORD) {
         assert(msg_length == 2); /* header + one OWORD of data */
         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
      } else {
         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
      }
      insn->bits3.urb_gen7.offset = offset;
      /* Gen7 URB messages do not support the transpose swizzle. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      insn->bits3.urb_gen7.per_slot_offset =
         flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else if (brw->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
   }
}
645
/**
 * Fill in the message descriptor for a data port write.
 *
 * Chooses the shared function ID (render cache vs. data cache) based on
 * generation and message type, then writes the generation-specific
 * descriptor fields.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 unsigned binding_table_index,
			 unsigned msg_control,
			 unsigned msg_type,
			 unsigned msg_length,
			 bool header_present,
			 unsigned last_render_target,
			 unsigned response_length,
			 unsigned end_of_thread,
			 unsigned send_commit_msg)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   /* Descriptor field layout differs per generation. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (brw->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
703
/**
 * Fill in the message descriptor for a data port read.
 *
 * Chooses the shared function ID per generation (Gen7 data cache; Gen6
 * render vs. sampler cache via \p target_cache), then writes the
 * generation-specific descriptor fields.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			unsigned binding_table_index,
			unsigned msg_control,
			unsigned msg_type,
			unsigned target_cache,
			unsigned msg_length,
                        bool header_present,
			unsigned response_length)
{
   struct brw_context *brw = p->brw;
   unsigned sfid;

   if (brw->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (brw->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   /* Descriptor field layout differs per generation. */
   if (brw->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (brw->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (brw->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (brw->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
760
/**
 * Fill in the message descriptor for a sampler message.
 *
 * \p return_format is only encoded on original Gen4 hardware; G4X and
 * later drop the field from the descriptor.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   struct brw_context *brw = p->brw;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   /* Descriptor field layout differs per generation. */
   if (brw->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (brw->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (brw->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
799
800
801#define next_insn brw_next_insn
802struct brw_instruction *
803brw_next_insn(struct brw_compile *p, unsigned opcode)
804{
805   struct brw_instruction *insn;
806
807   if (p->nr_insn + 1 > p->store_size) {
808      if (0) {
809         fprintf(stderr, "incresing the store size to %d\n",
810                 p->store_size << 1);
811      }
812      p->store_size <<= 1;
813      p->store = reralloc(p->mem_ctx, p->store,
814                          struct brw_instruction, p->store_size);
815      if (!p->store)
816         assert(!"realloc eu store memeory failed");
817   }
818
819   p->next_insn_offset += 16;
820   insn = &p->store[p->nr_insn++];
821   memcpy(insn, p->current, sizeof(*insn));
822
823   /* Reset this one-shot flag:
824    */
825
826   if (p->current->header.destreg__conditionalmod) {
827      p->current->header.destreg__conditionalmod = 0;
828      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
829   }
830
831   insn->header.opcode = opcode;
832   return insn;
833}
834
835static struct brw_instruction *brw_alu1( struct brw_compile *p,
836					 unsigned opcode,
837					 struct brw_reg dest,
838					 struct brw_reg src )
839{
840   struct brw_instruction *insn = next_insn(p, opcode);
841   brw_set_dest(p, insn, dest);
842   brw_set_src0(p, insn, src);
843   return insn;
844}
845
846static struct brw_instruction *brw_alu2(struct brw_compile *p,
847					unsigned opcode,
848					struct brw_reg dest,
849					struct brw_reg src0,
850					struct brw_reg src1 )
851{
852   struct brw_instruction *insn = next_insn(p, opcode);
853   brw_set_dest(p, insn, dest);
854   brw_set_src0(p, insn, src0);
855   brw_set_src1(p, insn, src1);
856   return insn;
857}
858
859static int
860get_3src_subreg_nr(struct brw_reg reg)
861{
862   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
863      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
864      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
865   } else {
866      return reg.subnr / 4;
867   }
868}
869
/* Emit a three-source instruction (MAD, LRP, BFE, BFI2).  The 3src
 * encoding is more restrictive than the ordinary one: align16 access mode
 * only, direct GRF addressing only (MRF allowed for dest), and a single
 * shared type field covering all three sources.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					unsigned opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   /* Destination subreg is encoded in 16-byte units. */
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   /* src0: swizzle/subreg/reg in bits2, abs/negate in bits1. */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar (vstride 0) source across channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   /* src1: its subreg field straddles the bits2/bits3 boundary. */
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   /* src2: entirely in bits3 except abs/negate. */
   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
         break;
      case BRW_REGISTER_TYPE_D:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
         break;
      case BRW_REGISTER_TYPE_UD:
         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
         break;
      }
   }

   return insn;
}
953
954
955/***********************************************************************
956 * Convenience routines.
957 */
958#define ALU1(OP)					\
959struct brw_instruction *brw_##OP(struct brw_compile *p,	\
960	      struct brw_reg dest,			\
961	      struct brw_reg src0)   			\
962{							\
963   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
964}
965
966#define ALU2(OP)					\
967struct brw_instruction *brw_##OP(struct brw_compile *p,	\
968	      struct brw_reg dest,			\
969	      struct brw_reg src0,			\
970	      struct brw_reg src1)   			\
971{							\
972   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
973}
974
975#define ALU3(OP)					\
976struct brw_instruction *brw_##OP(struct brw_compile *p,	\
977	      struct brw_reg dest,			\
978	      struct brw_reg src0,			\
979	      struct brw_reg src1,			\
980	      struct brw_reg src2)   			\
981{							\
982   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
983}
984
985#define ALU3F(OP)                                               \
986struct brw_instruction *brw_##OP(struct brw_compile *p,         \
987                                 struct brw_reg dest,           \
988                                 struct brw_reg src0,           \
989                                 struct brw_reg src1,           \
990                                 struct brw_reg src2)           \
991{                                                               \
992   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
993   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
994   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
995   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
996   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
997}
998
999/* Rounding operations (other than RNDD) require two instructions - the first
1000 * stores a rounded value (possibly the wrong way) in the dest register, but
1001 * also sets a per-channel "increment bit" in the flag register.  A predicated
1002 * add of 1.0 fixes dest to contain the desired result.
1003 *
1004 * Sandybridge and later appear to round correctly without an ADD.
1005 */
1006#define ROUND(OP)							      \
1007void brw_##OP(struct brw_compile *p,					      \
1008	      struct brw_reg dest,					      \
1009	      struct brw_reg src)					      \
1010{									      \
1011   struct brw_instruction *rnd, *add;					      \
1012   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
1013   brw_set_dest(p, rnd, dest);						      \
1014   brw_set_src0(p, rnd, src);						      \
1015									      \
1016   if (p->brw->gen < 6) {						      \
1017      /* turn on round-increments */					      \
1018      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
1019      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
1020      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
1021   }									      \
1022}
1023
1024
1025ALU1(MOV)
1026ALU2(SEL)
1027ALU1(NOT)
1028ALU2(AND)
1029ALU2(OR)
1030ALU2(XOR)
1031ALU2(SHR)
1032ALU2(SHL)
1033ALU2(ASR)
1034ALU1(F32TO16)
1035ALU1(F16TO32)
1036ALU1(FRC)
1037ALU1(RNDD)
1038ALU2(MAC)
1039ALU2(MACH)
1040ALU1(LZD)
1041ALU2(DP4)
1042ALU2(DPH)
1043ALU2(DP3)
1044ALU2(DP2)
1045ALU2(LINE)
1046ALU2(PLN)
1047ALU3F(MAD)
1048ALU3F(LRP)
1049ALU1(BFREV)
1050ALU3(BFE)
1051ALU2(BFI1)
1052ALU3(BFI2)
1053ALU1(FBH)
1054ALU1(FBL)
1055ALU1(CBIT)
1056ALU2(ADDC)
1057ALU2(SUBB)
1058
1059ROUND(RNDZ)
1060ROUND(RNDE)
1061
1062
1063struct brw_instruction *brw_ADD(struct brw_compile *p,
1064				struct brw_reg dest,
1065				struct brw_reg src0,
1066				struct brw_reg src1)
1067{
1068   /* 6.2.2: add */
1069   if (src0.type == BRW_REGISTER_TYPE_F ||
1070       (src0.file == BRW_IMMEDIATE_VALUE &&
1071	src0.type == BRW_REGISTER_TYPE_VF)) {
1072      assert(src1.type != BRW_REGISTER_TYPE_UD);
1073      assert(src1.type != BRW_REGISTER_TYPE_D);
1074   }
1075
1076   if (src1.type == BRW_REGISTER_TYPE_F ||
1077       (src1.file == BRW_IMMEDIATE_VALUE &&
1078	src1.type == BRW_REGISTER_TYPE_VF)) {
1079      assert(src0.type != BRW_REGISTER_TYPE_UD);
1080      assert(src0.type != BRW_REGISTER_TYPE_D);
1081   }
1082
1083   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1084}
1085
1086struct brw_instruction *brw_AVG(struct brw_compile *p,
1087                                struct brw_reg dest,
1088                                struct brw_reg src0,
1089                                struct brw_reg src1)
1090{
1091   assert(dest.type == src0.type);
1092   assert(src0.type == src1.type);
1093   switch (src0.type) {
1094   case BRW_REGISTER_TYPE_B:
1095   case BRW_REGISTER_TYPE_UB:
1096   case BRW_REGISTER_TYPE_W:
1097   case BRW_REGISTER_TYPE_UW:
1098   case BRW_REGISTER_TYPE_D:
1099   case BRW_REGISTER_TYPE_UD:
1100      break;
1101   default:
1102      assert(!"Bad type for brw_AVG");
1103   }
1104
1105   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1106}
1107
1108struct brw_instruction *brw_MUL(struct brw_compile *p,
1109				struct brw_reg dest,
1110				struct brw_reg src0,
1111				struct brw_reg src1)
1112{
1113   /* 6.32.38: mul */
1114   if (src0.type == BRW_REGISTER_TYPE_D ||
1115       src0.type == BRW_REGISTER_TYPE_UD ||
1116       src1.type == BRW_REGISTER_TYPE_D ||
1117       src1.type == BRW_REGISTER_TYPE_UD) {
1118      assert(dest.type != BRW_REGISTER_TYPE_F);
1119   }
1120
1121   if (src0.type == BRW_REGISTER_TYPE_F ||
1122       (src0.file == BRW_IMMEDIATE_VALUE &&
1123	src0.type == BRW_REGISTER_TYPE_VF)) {
1124      assert(src1.type != BRW_REGISTER_TYPE_UD);
1125      assert(src1.type != BRW_REGISTER_TYPE_D);
1126   }
1127
1128   if (src1.type == BRW_REGISTER_TYPE_F ||
1129       (src1.file == BRW_IMMEDIATE_VALUE &&
1130	src1.type == BRW_REGISTER_TYPE_VF)) {
1131      assert(src0.type != BRW_REGISTER_TYPE_UD);
1132      assert(src0.type != BRW_REGISTER_TYPE_D);
1133   }
1134
1135   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1136	  src0.nr != BRW_ARF_ACCUMULATOR);
1137   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1138	  src1.nr != BRW_ARF_ACCUMULATOR);
1139
1140   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1141}
1142
1143
1144void brw_NOP(struct brw_compile *p)
1145{
1146   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1147   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1148   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1149   brw_set_src1(p, insn, brw_imm_ud(0x0));
1150}
1151
1152
1153
1154
1155
1156/***********************************************************************
1157 * Comparisons, if/else/endif
1158 */
1159
1160struct brw_instruction *brw_JMPI(struct brw_compile *p,
1161                                 struct brw_reg dest,
1162                                 struct brw_reg src0,
1163                                 struct brw_reg src1)
1164{
1165   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1166
1167   insn->header.execution_size = 1;
1168   insn->header.compression_control = BRW_COMPRESSION_NONE;
1169   insn->header.mask_control = BRW_MASK_DISABLE;
1170
1171   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1172
1173   return insn;
1174}
1175
1176static void
1177push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1178{
1179   p->if_stack[p->if_stack_depth] = inst - p->store;
1180
1181   p->if_stack_depth++;
1182   if (p->if_stack_array_size <= p->if_stack_depth) {
1183      p->if_stack_array_size *= 2;
1184      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1185			     p->if_stack_array_size);
1186   }
1187}
1188
1189static struct brw_instruction *
1190pop_if_stack(struct brw_compile *p)
1191{
1192   p->if_stack_depth--;
1193   return &p->store[p->if_stack[p->if_stack_depth]];
1194}
1195
1196static void
1197push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1198{
1199   if (p->loop_stack_array_size < p->loop_stack_depth) {
1200      p->loop_stack_array_size *= 2;
1201      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1202			       p->loop_stack_array_size);
1203      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1204				     p->loop_stack_array_size);
1205   }
1206
1207   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1208   p->loop_stack_depth++;
1209   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1210}
1211
1212static struct brw_instruction *
1213get_inner_do_insn(struct brw_compile *p)
1214{
1215   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1216}
1217
1218/* EU takes the value from the flag register and pushes it onto some
1219 * sort of a stack (presumably merging with any flag value already on
1220 * the stack).  Within an if block, the flags at the top of the stack
1221 * control execution on each channel of the unit, eg. on each of the
1222 * 16 pixel values in our wm programs.
1223 *
1224 * When the matching 'else' instruction is reached (presumably by
1225 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
1227 *
1228 * When the matching 'endif' instruction is reached, the flags are
1229 * popped off.  If the stack is now empty, normal execution resumes.
1230 */
/* Emit an IF instruction and push it onto the if-stack so brw_ELSE /
 * brw_ENDIF can patch its jump targets later.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    *
    * The operand/branch-offset layout differs by generation: pre-gen6
    * operates on IP with a jump count in bits3, gen6 keeps the jump count
    * in bits1, and gen7+ uses JIP/UIP offsets in bits3.  All jump targets
    * are zeroed here and filled in later by patch_IF_ELSE().
    */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* The predicate is consumed by the IF; clear the default state so it
    * does not apply to subsequent instructions.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1271
1272/* This function is only used for gen6-style IF instructions with an
1273 * embedded comparison (conditional modifier).  It is not used on gen7.
1274 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* dest is an immediate word holding the gen6 jump count; it is zeroed
    * here and patched later by patch_IF_ELSE().
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* The embedded comparison: src0 <conditional> src1. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1303
1304/**
1305 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1306 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* IP offsets are in bytes; each instruction is 16 bytes. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1343
1344/**
1345 * Patch IF and ELSE instructions with appropriate jump targets.
1346 */
1347static void
1348patch_IF_ELSE(struct brw_compile *p,
1349	      struct brw_instruction *if_inst,
1350	      struct brw_instruction *else_inst,
1351	      struct brw_instruction *endif_inst)
1352{
1353   struct brw_context *brw = p->brw;
1354
1355   /* We shouldn't be patching IF and ELSE instructions in single program flow
1356    * mode when gen < 6, because in single program flow mode on those
1357    * platforms, we convert flow control instructions to conditional ADDs that
1358    * operate on IP (see brw_ENDIF).
1359    *
1360    * However, on Gen6, writing to IP doesn't work in single program flow mode
1361    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1362    * not be updated by non-flow control instructions.").  And on later
1363    * platforms, there is no significant benefit to converting control flow
1364    * instructions to conditional ADDs.  So we do patch IF and ELSE
1365    * instructions in single program flow mode on those platforms.
1366    */
1367   if (brw->gen < 6)
1368      assert(!p->single_program_flow);
1369
1370   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1371   assert(endif_inst != NULL);
1372   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1373
1374   unsigned br = 1;
1375   /* Jump count is for 64bit data chunk each, so one 128bit instruction
1376    * requires 2 chunks.
1377    */
1378   if (brw->gen >= 5)
1379      br = 2;
1380
1381   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1382   endif_inst->header.execution_size = if_inst->header.execution_size;
1383
1384   if (else_inst == NULL) {
1385      /* Patch IF -> ENDIF */
1386      if (brw->gen < 6) {
1387	 /* Turn it into an IFF, which means no mask stack operations for
1388	  * all-false and jumping past the ENDIF.
1389	  */
1390	 if_inst->header.opcode = BRW_OPCODE_IFF;
1391	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1392	 if_inst->bits3.if_else.pop_count = 0;
1393	 if_inst->bits3.if_else.pad0 = 0;
1394      } else if (brw->gen == 6) {
1395	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1396	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1397      } else {
1398	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1399	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1400      }
1401   } else {
1402      else_inst->header.execution_size = if_inst->header.execution_size;
1403
1404      /* Patch IF -> ELSE */
1405      if (brw->gen < 6) {
1406	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1407	 if_inst->bits3.if_else.pop_count = 0;
1408	 if_inst->bits3.if_else.pad0 = 0;
1409      } else if (brw->gen == 6) {
1410	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1411      }
1412
1413      /* Patch ELSE -> ENDIF */
1414      if (brw->gen < 6) {
1415	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1416	  * matching ENDIF.
1417	  */
1418	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1419	 else_inst->bits3.if_else.pop_count = 1;
1420	 else_inst->bits3.if_else.pad0 = 0;
1421      } else if (brw->gen == 6) {
1422	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1423	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1424      } else {
1425	 /* The IF instruction's JIP should point just past the ELSE */
1426	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1427	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1428	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1429	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1430      }
1431   }
1432}
1433
/* Emit an ELSE instruction and push it onto the if-stack; its jump targets
 * are zeroed here and patched later by brw_ENDIF via patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* The operand/branch-offset layout differs per generation, mirroring
    * the layout used by brw_IF.
    */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1466
/* Close an IF (and optional ELSE) block: pop the if-stack, emit the ENDIF
 * (unless SPF mode on gen4/5 turns the IF/ELSE into ADDs instead), and
 * patch the jump targets of the whole construct.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Operand layout of ENDIF differs per generation, as with IF/ELSE. */
   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (brw->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1545
1546struct brw_instruction *brw_BREAK(struct brw_compile *p)
1547{
1548   struct brw_context *brw = p->brw;
1549   struct brw_instruction *insn;
1550
1551   insn = next_insn(p, BRW_OPCODE_BREAK);
1552   if (brw->gen >= 6) {
1553      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1554      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1555      brw_set_src1(p, insn, brw_imm_d(0x0));
1556   } else {
1557      brw_set_dest(p, insn, brw_ip_reg());
1558      brw_set_src0(p, insn, brw_ip_reg());
1559      brw_set_src1(p, insn, brw_imm_d(0x0));
1560      insn->bits3.if_else.pad0 = 0;
1561      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1562   }
1563   insn->header.compression_control = BRW_COMPRESSION_NONE;
1564   insn->header.execution_size = BRW_EXECUTE_8;
1565
1566   return insn;
1567}
1568
1569struct brw_instruction *gen6_CONT(struct brw_compile *p)
1570{
1571   struct brw_instruction *insn;
1572
1573   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1574   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1575   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1576   brw_set_dest(p, insn, brw_ip_reg());
1577   brw_set_src0(p, insn, brw_ip_reg());
1578   brw_set_src1(p, insn, brw_imm_d(0x0));
1579
1580   insn->header.compression_control = BRW_COMPRESSION_NONE;
1581   insn->header.execution_size = BRW_EXECUTE_8;
1582   return insn;
1583}
1584
1585struct brw_instruction *brw_CONT(struct brw_compile *p)
1586{
1587   struct brw_instruction *insn;
1588   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1589   brw_set_dest(p, insn, brw_ip_reg());
1590   brw_set_src0(p, insn, brw_ip_reg());
1591   brw_set_src1(p, insn, brw_imm_d(0x0));
1592   insn->header.compression_control = BRW_COMPRESSION_NONE;
1593   insn->header.execution_size = BRW_EXECUTE_8;
1594   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1595   insn->bits3.if_else.pad0 = 0;
1596   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1597   return insn;
1598}
1599
1600struct brw_instruction *gen6_HALT(struct brw_compile *p)
1601{
1602   struct brw_instruction *insn;
1603
1604   insn = next_insn(p, BRW_OPCODE_HALT);
1605   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1606   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1607   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1608
1609   if (p->compressed) {
1610      insn->header.execution_size = BRW_EXECUTE_16;
1611   } else {
1612      insn->header.compression_control = BRW_COMPRESSION_NONE;
1613      insn->header.execution_size = BRW_EXECUTE_8;
1614   }
1615   return insn;
1616}
1617
1618/* DO/WHILE loop:
1619 *
1620 * The DO/WHILE is just an unterminated loop -- break or continue are
1621 * used for control within the loop.  We have a few ways they can be
1622 * done.
1623 *
1624 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1625 * jip and no DO instruction.
1626 *
1627 * For non-uniform control flow pre-gen6, there's a DO instruction to
1628 * push the mask, and a WHILE to jump back, and BREAK to get out and
1629 * pop the mask.
1630 *
1631 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1632 * just points back to the first instruction of the loop.
1633 */
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; the loop top is simply the next
       * instruction slot, recorded (as an index) on the loop stack for
       * brw_WHILE.
       *
       * NOTE(review): the returned pointer is into p->store and becomes
       * stale if next_insn() reallocates the store -- callers should not
       * hold it across instruction emission.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1661
1662/**
1663 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1664 * instruction here.
1665 *
1666 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1667 * nesting, since it can always just point to the end of the block/current loop.
1668 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* On gen5 jump counts are in 64-bit chunks (2 per 128-bit instruction);
    * earlier gens count whole instructions.
    */
   int br = (brw->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to the matching DO, patching every
    * unpatched BREAK/CONT in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1691
/* Close the innermost loop opened by brw_DO by emitting a WHILE that jumps
 * back to the loop top (or, in pre-gen6 SPF mode, an ADD on IP).
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   /* Jump counts are per 64-bit chunk on gen5+ (2 per instruction). */
   unsigned br = 1;

   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      /* Gen7+: backward JIP offset to the loop top; no DO was emitted. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (brw->gen == 6) {
      /* Gen6: backward jump count stored in bits1; no DO was emitted. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: a plain scalar ADD on IP jumps back to the loop top
	  * (16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 /* Pre-gen6 non-SPF: a real WHILE paired with the DO; also patch
	  * any BREAK/CONT emitted inside the loop body.
	  */
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1755
1756
1757/* FORWARD JUMPS:
1758 */
1759void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1760{
1761   struct brw_context *brw = p->brw;
1762   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1763   unsigned jmpi = 1;
1764
1765   if (brw->gen >= 5)
1766      jmpi = 2;
1767
1768   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1769   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1770
1771   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1772}
1773
1774
1775
1776/* To integrate with the above, it makes sense that the comparison
1777 * instruction should populate the flag register.  It might be simpler
1778 * just to use the flag reg for most WM tasks?
1779 */
1780void brw_CMP(struct brw_compile *p,
1781	     struct brw_reg dest,
1782	     unsigned conditional,
1783	     struct brw_reg src0,
1784	     struct brw_reg src1)
1785{
1786   struct brw_context *brw = p->brw;
1787   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1788
1789   insn->header.destreg__conditionalmod = conditional;
1790   brw_set_dest(p, insn, dest);
1791   brw_set_src0(p, insn, src0);
1792   brw_set_src1(p, insn, src1);
1793
1794/*    guess_execution_size(insn, src0); */
1795
1796
1797   /* Make it so that future instructions will use the computed flag
1798    * value until brw_set_predicate_control_flag_value() is called
1799    * again.
1800    */
1801   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1802       dest.nr == 0) {
1803      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1804      p->flag_value = 0xff;
1805   }
1806
1807   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1808    * page says:
1809    *    "Any CMP instruction with a null destination must use a {switch}."
1810    *
1811    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1812    * mentioned on their work-arounds pages.
1813    */
1814   if (brw->gen == 7) {
1815      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1816          dest.nr == BRW_ARF_NULL) {
1817         insn->header.thread_control = BRW_THREAD_SWITCH;
1818      }
1819   }
1820}
1821
1822/* Issue 'wait' instruction for n1, host could program MMIO
1823   to wake up thread. */
1824void brw_WAIT (struct brw_compile *p)
1825{
1826   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1827   struct brw_reg src = brw_notification_1_reg();
1828
1829   brw_set_dest(p, insn, src);
1830   brw_set_src0(p, insn, src);
1831   brw_set_src1(p, insn, brw_null_reg());
1832   insn->header.execution_size = 0; /* must */
1833   insn->header.predicate_control = 0;
1834   insn->header.compression_control = 0;
1835}
1836
1837
1838/***********************************************************************
1839 * Helpers for the various SEND message types:
1840 */
1841
/** Extended math function, float[8].
 *
 * On gen6+ this emits a native MATH instruction; the function selector is
 * packed into the CondModifier/ThreadCtrl fields.  On earlier parts the
 * operation is issued as a SEND, with \p msg_reg_nr, \p precision and
 * \p data_type filling out the math message descriptor.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       unsigned function,
	       unsigned msg_reg_nr,
	       struct brw_reg src,
	       unsigned data_type,
	       unsigned precision )
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      /* MATH only accepts GRF operands (plus MRF destinations on gen7+). */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (brw->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (brw->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer divide takes integer sources; everything else is float. */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1906
1907/** Extended math function, float[8].
1908 */
1909void brw_math2(struct brw_compile *p,
1910	       struct brw_reg dest,
1911	       unsigned function,
1912	       struct brw_reg src0,
1913	       struct brw_reg src1)
1914{
1915   struct brw_context *brw = p->brw;
1916   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1917
1918   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1919          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1920   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1921   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1922
1923   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1924   if (brw->gen == 6) {
1925      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1926      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1927   }
1928
1929   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1930       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1931       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1932      assert(src0.type != BRW_REGISTER_TYPE_F);
1933      assert(src1.type != BRW_REGISTER_TYPE_F);
1934   } else {
1935      assert(src0.type == BRW_REGISTER_TYPE_F);
1936      assert(src1.type == BRW_REGISTER_TYPE_F);
1937   }
1938
1939   /* Source modifiers are ignored for extended math instructions on Gen6. */
1940   if (brw->gen == 6) {
1941      assert(!src0.negate);
1942      assert(!src0.abs);
1943      assert(!src1.negate);
1944      assert(!src1.abs);
1945   }
1946
1947   /* Math is the same ISA format as other opcodes, except that CondModifier
1948    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1949    */
1950   insn->header.destreg__conditionalmod = function;
1951
1952   brw_set_dest(p, insn, dest);
1953   brw_set_src0(p, insn, src0);
1954   brw_set_src1(p, insn, src1);
1955}
1956
1957
1958/**
1959 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1960 * using a constant offset per channel.
1961 *
1962 * The offset must be aligned to oword size (16 bytes).  Used for
1963 * register spilling.
1964 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control, msg_type;
   int mlen;

   /* Gen6+ takes the offset in owords (16 bytes) rather than bytes. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* mlen covers the header MRF plus the payload: 2 owords (one GRF) or
    * 4 owords (two GRFs).
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* SEND may not be compressed; widen the commit-write source instead. */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (brw->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (brw->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
2066
2067
2068/**
2069 * Read a block of owords (half a GRF each) from the scratch buffer
2070 * using a constant index per channel.
2071 *
2072 * Offset must be aligned to oword size (16 bytes).  Used for register
2073 * spilling.
2074 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control;
   int rlen;

   /* Gen6+ takes the offset in owords (16 bytes) rather than bytes. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* rlen is the number of GRFs returned: one per two owords. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in the MRF: a copy of g0 with the scratch
    * offset written into element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest);	/* UW? */
      /* On gen6+ the header comes through src0; before that it is implied
       * by the destreg field above.
       */
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
			      insn,
			      255, /* binding table index (255=stateless) */
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
                              true, /* header_present */
			      rlen);
   }
}
2142
/**
 * Read \p num_regs GRFs from scratch space at the given byte offset using
 * the Gen7 data-cache scratch block read message.  The header is g0; the
 * hardware takes the per-thread scratch base from g0.5.
 */
void
gen7_block_read_scratch(struct brw_compile *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   brw_set_dest(p, insn, dest);

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   bool header_present = true;
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   brw_set_message_descriptor(p, insn,
                              GEN7_SFID_DATAPORT_DATA_CACHE,
                              1, /* mlen: just g0 */
                              num_regs,
                              header_present,
                              false);

   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;

   /* Only 1, 2 or 4 registers are encodable (the field holds num_regs - 1). */
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));
   insn->bits3.ud |= offset;
}
2184
2185/**
2186 * Read a float[4] vector from the data port Data Cache (const buffer).
2187 * Location (in buffer) should be a multiple of 16.
2188 * Used for fetching shader constants.
2189 */
void brw_oword_block_read(struct brw_compile *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   struct brw_context *brw = p->brw;

   /* On newer hardware, offset is in units of owords. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Build the header in the MRF without disturbing the caller's
    * predication/compression/mask state.
    */
   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   /* On gen6+ the header travels through src0; earlier parts use the
    * destreg field set above instead.
    */
   if (brw->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
                           true, /* header_present */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2243
2244
2245void brw_fb_WRITE(struct brw_compile *p,
2246		  int dispatch_width,
2247                  unsigned msg_reg_nr,
2248                  struct brw_reg src0,
2249                  unsigned msg_control,
2250                  unsigned binding_table_index,
2251                  unsigned msg_length,
2252                  unsigned response_length,
2253                  bool eot,
2254                  bool header_present)
2255{
2256   struct brw_context *brw = p->brw;
2257   struct brw_instruction *insn;
2258   unsigned msg_type;
2259   struct brw_reg dest;
2260
2261   if (dispatch_width == 16)
2262      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2263   else
2264      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2265
2266   if (brw->gen >= 6) {
2267      insn = next_insn(p, BRW_OPCODE_SENDC);
2268   } else {
2269      insn = next_insn(p, BRW_OPCODE_SEND);
2270   }
2271   insn->header.compression_control = BRW_COMPRESSION_NONE;
2272
2273   if (brw->gen >= 6) {
2274      /* headerless version, just submit color payload */
2275      src0 = brw_message_reg(msg_reg_nr);
2276
2277      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2278   } else {
2279      insn->header.destreg__conditionalmod = msg_reg_nr;
2280
2281      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2282   }
2283
2284   brw_set_dest(p, insn, dest);
2285   brw_set_src0(p, insn, src0);
2286   brw_set_dp_write_message(p,
2287			    insn,
2288			    binding_table_index,
2289			    msg_control,
2290			    msg_type,
2291			    msg_length,
2292			    header_present,
2293			    eot, /* last render target write */
2294			    response_length,
2295			    eot,
2296			    0 /* send_commit_msg */);
2297}
2298
2299
2300/**
2301 * Texture sample instruction.
2302 * Note: the msg_type plus msg_length values determine exactly what kind
2303 * of sampling operation is performed.  See volume 4, page 161 of docs.
2304 */
void brw_SAMPLE(struct brw_compile *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   /* msg_reg_nr is unsigned, so -1 here is really UINT_MAX — used as a
    * "no message register / skip the implied move" sentinel.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.predicate_control = 0; /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
      insn->header.compression_control = BRW_COMPRESSION_NONE;

   /* Pre-gen6, the message register number rides in the instruction header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2357
2358/* All these variables are pretty confusing - we might be better off
2359 * using bitmasks and macros for this, in the old style.  Or perhaps
2360 * just having the caller instantiate the fields in dword3 itself.
2361 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into DWord 5 of the header (m0.5), built from g0.5.
       */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the message register number rides in the instruction header. */
   if (brw->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2408
2409static int
2410brw_find_next_block_end(struct brw_compile *p, int start_offset)
2411{
2412   int offset;
2413   void *store = p->store;
2414
2415   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2416        offset = next_offset(store, offset)) {
2417      struct brw_instruction *insn = store + offset;
2418
2419      switch (insn->header.opcode) {
2420      case BRW_OPCODE_ENDIF:
2421      case BRW_OPCODE_ELSE:
2422      case BRW_OPCODE_WHILE:
2423      case BRW_OPCODE_HALT:
2424	 return offset;
2425      }
2426   }
2427
2428   return 0;
2429}
2430
2431/* There is no DO instruction on gen6, so to find the end of the loop
2432 * we have to see if the loop is jumping back before our start
2433 * instruction.
2434 */
static int
brw_find_loop_end(struct brw_compile *p, int start_offset)
{
   struct brw_context *brw = p->brw;
   int offset;
   /* Branch targets (JIP/jump_count) are stored in 8-byte units. */
   int scale = 8;
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
	 /* Gen6 keeps the jump count in bits1; gen7 uses the JIP field. */
	 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
				   : insn->bits3.break_cont.jip;
	 /* A WHILE whose backward branch lands at or before start_offset is
	  * the one closing the loop we started in.
	  */
	 if (offset + jip * scale <= start_offset)
	    return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}
2460
2461/* After program generation, go back and update the UIP and JIP of
2462 * BREAK, CONT, and HALT instructions to their correct locations.
2463 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   int offset;
   /* JIP/UIP fields are stored in 8-byte units. */
   int scale = 8;
   void *store = p->store;

   /* Pre-gen6 uses patched jump counts instead (see brw_patch_break_cont). */
   if (brw->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(store, offset)) {
      struct brw_instruction *insn = store + offset;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
	 insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
	 insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, offset) - offset +
             (brw->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
	 insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, offset) - offset) / scale;

	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
         /* An ENDIF with no block end after it jumps one full instruction
          * (2 * 8 bytes) forward, to the next instruction.
          */
         if (block_end_offset == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 break;

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
	    insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2537
2538void brw_ff_sync(struct brw_compile *p,
2539		   struct brw_reg dest,
2540		   unsigned msg_reg_nr,
2541		   struct brw_reg src0,
2542		   bool allocate,
2543		   unsigned response_length,
2544		   bool eot)
2545{
2546   struct brw_context *brw = p->brw;
2547   struct brw_instruction *insn;
2548
2549   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2550
2551   insn = next_insn(p, BRW_OPCODE_SEND);
2552   brw_set_dest(p, insn, dest);
2553   brw_set_src0(p, insn, src0);
2554   brw_set_src1(p, insn, brw_imm_d(0));
2555
2556   if (brw->gen < 6)
2557      insn->header.destreg__conditionalmod = msg_reg_nr;
2558
2559   brw_set_ff_sync_message(p,
2560			   insn,
2561			   allocate,
2562			   response_length,
2563			   eot);
2564}
2565
2566/**
2567 * Emit the SEND instruction necessary to generate stream output data on Gen6
2568 * (for transform feedback).
2569 *
2570 * If send_commit_msg is true, this is the last piece of stream output data
2571 * from this thread, so send the data as a committed write.  According to the
2572 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2573 *
2574 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2575 *   writes are complete by sending the final write as a committed write."
2576 */
2577void
2578brw_svb_write(struct brw_compile *p,
2579              struct brw_reg dest,
2580              unsigned msg_reg_nr,
2581              struct brw_reg src0,
2582              unsigned binding_table_index,
2583              bool   send_commit_msg)
2584{
2585   struct brw_instruction *insn;
2586
2587   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2588
2589   insn = next_insn(p, BRW_OPCODE_SEND);
2590   brw_set_dest(p, insn, dest);
2591   brw_set_src0(p, insn, src0);
2592   brw_set_src1(p, insn, brw_imm_d(0));
2593   brw_set_dp_write_message(p, insn,
2594                            binding_table_index,
2595                            0, /* msg_control: ignored */
2596                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2597                            1, /* msg_length */
2598                            true, /* header_present */
2599                            0, /* last_render_target: ignored */
2600                            send_commit_msg, /* response_length */
2601                            0, /* end_of_thread */
2602                            send_commit_msg); /* send_commit_msg */
2603}
2604
/* Fill in the SEND message descriptor for an untyped atomic operation.
 *
 * Haswell routes these through data cache data port 1, with distinct
 * encodings for Align1 (SIMD8/SIMD16) and Align16 (SIMD4x2) modes; IVB/BYT
 * use the plain data cache data port.  The atomic opcode lands in bits
 * 11:8 of the descriptor.
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  unsigned atomic_op,
                                  unsigned bind_table_index,
                                  unsigned msg_length,
                                  unsigned response_length,
                                  bool header_present)
{
   if (p->brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);


      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (insn->header.execution_size != BRW_EXECUTE_16)
            insn->bits3.ud |= 1 << 12; /* SIMD8 mode */

         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
      } else {
         insn->bits3.gen7_dp.msg_type =
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
      }

   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;

      if (insn->header.execution_size != BRW_EXECUTE_16)
         insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
   }

   if (response_length)
      insn->bits3.ud |= 1 << 13; /* Return data expected */

   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
   insn->bits3.ud |= atomic_op << 8;
}
2648
2649void
2650brw_untyped_atomic(struct brw_compile *p,
2651                   struct brw_reg dest,
2652                   struct brw_reg mrf,
2653                   unsigned atomic_op,
2654                   unsigned bind_table_index,
2655                   unsigned msg_length,
2656                   unsigned response_length) {
2657   struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2658
2659   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2660   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2661   brw_set_src1(p, insn, brw_imm_d(0));
2662   brw_set_dp_untyped_atomic_message(
2663      p, insn, atomic_op, bind_table_index, msg_length, response_length,
2664      insn->header.access_mode == BRW_ALIGN_1);
2665}
2666
2667static void
2668brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
2669                                        struct brw_instruction *insn,
2670                                        unsigned bind_table_index,
2671                                        unsigned msg_length,
2672                                        unsigned response_length,
2673                                        bool header_present)
2674{
2675   const unsigned dispatch_width =
2676      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
2677   const unsigned num_channels = response_length / (dispatch_width / 8);
2678
2679   if (p->brw->is_haswell) {
2680      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2681                                 msg_length, response_length,
2682                                 header_present, false);
2683
2684      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
2685   } else {
2686      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2687                                 msg_length, response_length,
2688                                 header_present, false);
2689
2690      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
2691   }
2692
2693   if (insn->header.access_mode == BRW_ALIGN_1) {
2694      if (dispatch_width == 16)
2695         insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
2696      else
2697         insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
2698   }
2699
2700   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
2701
2702   /* Set mask of 32-bit channels to drop. */
2703   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
2704}
2705
2706void
2707brw_untyped_surface_read(struct brw_compile *p,
2708                         struct brw_reg dest,
2709                         struct brw_reg mrf,
2710                         unsigned bind_table_index,
2711                         unsigned msg_length,
2712                         unsigned response_length)
2713{
2714   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2715
2716   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2717   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2718   brw_set_dp_untyped_surface_read_message(
2719      p, insn, bind_table_index, msg_length, response_length,
2720      insn->header.access_mode == BRW_ALIGN_1);
2721}
2722
2723/**
2724 * This instruction is generated as a single-channel align1 instruction by
2725 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2726 *
2727 * We can't use the typed atomic op in the FS because that has the execution
2728 * mask ANDed with the pixel mask, but we just want to write the one dword for
2729 * all the pixels.
2730 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to write just
2732 * one u32.  So we use the same untyped atomic write message as the pixel
2733 * shader.
2734 *
2735 * The untyped atomic operation requires a BUFFER surface type with RAW
2736 * format, and is only accessible through the legacy DATA_CACHE dataport
2737 * messages.
2738 */
2739void brw_shader_time_add(struct brw_compile *p,
2740                         struct brw_reg payload,
2741                         uint32_t surf_index)
2742{
2743   struct brw_context *brw = p->brw;
2744   assert(brw->gen >= 7);
2745
2746   brw_push_insn_state(p);
2747   brw_set_access_mode(p, BRW_ALIGN_1);
2748   brw_set_mask_control(p, BRW_MASK_DISABLE);
2749   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2750   brw_pop_insn_state(p);
2751
2752   /* We use brw_vec1_reg and unmasked because we want to increment the given
2753    * offset only once.
2754    */
2755   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2756                                      BRW_ARF_NULL, 0));
2757   brw_set_src0(p, send, brw_vec1_reg(payload.file,
2758                                      payload.nr, 0));
2759   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
2760                                     2 /* message length */,
2761                                     0 /* response length */,
2762                                     false /* header present */);
2763}
2764