brw_eu_emit.c revision 8873120f9fb0c82cfd46cd15c39e66c38076cb0d
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
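/* For example: an uncompressed instruction whose destination has width
 * BRW_WIDTH_8 ends up with execution_size BRW_EXECUTE_8 (the width and
 * execution-size encodings line up, per the note above), while the same
 * destination with p->compressed set yields BRW_EXECUTE_16.
 */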
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
61void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  unsigned msg_reg_nr)
65{
66   struct brw_context *brw = p->brw;
67   if (brw->gen < 6)
68      return;
69
70   if (src->file == BRW_MESSAGE_REGISTER_FILE)
71      return;
72
73   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
74      brw_push_insn_state(p);
75      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
76      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
77      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78	      retype(*src, BRW_REGISTER_TYPE_UD));
79      brw_pop_insn_state(p);
80   }
81   *src = brw_message_reg(msg_reg_nr);
82}
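/* Usage sketch (illustrative, not code from this file): a helper emitting a
 * SEND whose payload currently lives in a GRF would resolve the implied
 * move first, e.g.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);   // arbitrary GRF
 *    gen6_resolve_implied_move(p, &payload, 1);     // msg_reg_nr 1 assumed
 *    // On gen6+, payload has been copied into m1 and now refers to it;
 *    // on earlier gens the register is left untouched, since the hardware
 *    // performs the implied move itself.
 */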
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
88    * "The send with EOT should use register space R112-R127 for <src>. This is
89    *  to enable loading of a new thread into the same slot while the message
90    *  with EOT for current thread is pending dispatch."
91    *
92    * Since we're pretending to have 16 MRFs anyway, we may as well use the
93    * registers required for messages with EOT.
94    */
95   struct brw_context *brw = p->brw;
96   if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97      reg->file = BRW_GENERAL_REGISTER_FILE;
98      reg->nr += GEN7_MRF_HACK_START;
99   }
100}
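/* Sketch of the effect (assuming GEN7_MRF_HACK_START is the base of the
 * R112-R127 range quoted above): on gen7,
 *
 *    brw_message_reg(2)    // MRF file, nr = 2
 *
 * is rewritten in place to the GRF file with nr = GEN7_MRF_HACK_START + 2,
 * landing in the EOT-safe register range.  Other gens and other register
 * files pass through unchanged.
 */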
101
102/**
103 * Convert a brw_reg_type enumeration value into the hardware representation.
104 *
105 * The hardware encoding may depend on whether the value is an immediate.
106 */
107unsigned
108brw_reg_type_to_hw_type(const struct brw_context *brw,
109                        enum brw_reg_type type, unsigned file)
110{
111   if (file == BRW_IMMEDIATE_VALUE) {
112      const static int imm_hw_types[] = {
113         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
114         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
115         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
116         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
117         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
118         [BRW_REGISTER_TYPE_UB] = -1,
119         [BRW_REGISTER_TYPE_B]  = -1,
120         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
121         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
122         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
123         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
124         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
125         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
126         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
127      };
128      assert(type < ARRAY_SIZE(imm_hw_types));
129      assert(imm_hw_types[type] != -1);
130      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
131      return imm_hw_types[type];
132   } else {
133      /* Non-immediate registers */
134      const static int hw_types[] = {
135         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
136         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
137         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
138         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
139         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
140         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
141         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
142         [BRW_REGISTER_TYPE_UV] = -1,
143         [BRW_REGISTER_TYPE_VF] = -1,
144         [BRW_REGISTER_TYPE_V]  = -1,
145         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
146         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
147         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
148         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
149      };
150      assert(type < ARRAY_SIZE(hw_types));
151      assert(hw_types[type] != -1);
152      assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
153      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
154      return hw_types[type];
155   }
156}
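/* Example of why the register file matters (per the tables above): the
 * packed-vector types UV, V and VF exist only as immediates, while UB and B
 * exist only as non-immediates, so
 *
 *    brw_reg_type_to_hw_type(brw, BRW_REGISTER_TYPE_VF, BRW_IMMEDIATE_VALUE)
 *
 * is valid, but the same type with a GRF file would hit the
 * hw_types[type] != -1 assertion.
 */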
157
158void
159brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
160	     struct brw_reg dest)
161{
162   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
163       dest.file != BRW_MESSAGE_REGISTER_FILE)
164      assert(dest.nr < 128);
165
166   gen7_convert_mrf_to_grf(p, &dest);
167
168   insn->bits1.da1.dest_reg_file = dest.file;
169   insn->bits1.da1.dest_reg_type =
170      brw_reg_type_to_hw_type(p->brw, dest.type, dest.file);
171   insn->bits1.da1.dest_address_mode = dest.address_mode;
172
173   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
174      insn->bits1.da1.dest_reg_nr = dest.nr;
175
176      if (insn->header.access_mode == BRW_ALIGN_1) {
177	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
178	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
179	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
180	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
181      } else {
182	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
183	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
184         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
185             dest.file == BRW_MESSAGE_REGISTER_FILE) {
186            assert(dest.dw1.bits.writemask != 0);
187         }
188	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
189	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
190	  *    this to be programmed as "01".
191	  */
192	 insn->bits1.da16.dest_horiz_stride = 1;
193      }
194   } else {
195      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
196
197      /* These are different sizes in align1 vs align16:
198       */
199      if (insn->header.access_mode == BRW_ALIGN_1) {
200	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
201	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
202	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
203	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
204      } else {
205	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
206	 /* Even though it is ignored in Align16, this still needs to be programmed as '01'. */
207	 insn->bits1.ia16.dest_horiz_stride = 1;
208      }
209   }
210
211   /* NEW: Set the execution size based on dest.width and
212    * insn->compression_control:
213    */
214   guess_execution_size(p, insn, dest);
215}
216
217extern int reg_type_size[];
218
219static void
220validate_reg(struct brw_instruction *insn, struct brw_reg reg)
221{
222   int hstride_for_reg[] = {0, 1, 2, 4};
223   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
224   int width_for_reg[] = {1, 2, 4, 8, 16};
225   int execsize_for_reg[] = {1, 2, 4, 8, 16};
226   int width, hstride, vstride, execsize;
227
228   if (reg.file == BRW_IMMEDIATE_VALUE) {
229      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
230       * mean the destination has to be 128-bit aligned and the
231       * destination horiz stride has to be a word.
232       */
233      if (reg.type == BRW_REGISTER_TYPE_V) {
234	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
235		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
236      }
237
238      return;
239   }
240
241   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
242       reg.nr == BRW_ARF_NULL)
243      return;
244
245   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
246   hstride = hstride_for_reg[reg.hstride];
247
248   if (reg.vstride == 0xf) {
249      vstride = -1;
250   } else {
251      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
252      vstride = vstride_for_reg[reg.vstride];
253   }
254
255   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
256   width = width_for_reg[reg.width];
257
258   assert(insn->header.execution_size >= 0 &&
259	  insn->header.execution_size < Elements(execsize_for_reg));
260   execsize = execsize_for_reg[insn->header.execution_size];
261
262   /* Restrictions from 3.3.10: Register Region Restrictions. */
263   /* 3. */
264   assert(execsize >= width);
265
266   /* 4. */
267   if (execsize == width && hstride != 0) {
268      assert(vstride == -1 || vstride == width * hstride);
269   }
270
271   /* 5. */
272   if (execsize == width && hstride == 0) {
273      /* no restriction on vstride. */
274   }
275
276   /* 6. */
277   if (width == 1) {
278      assert(hstride == 0);
279   }
280
281   /* 7. */
282   if (execsize == 1 && width == 1) {
283      assert(hstride == 0);
284      assert(vstride == 0);
285   }
286
287   /* 8. */
288   if (vstride == 0 && hstride == 0) {
289      assert(width == 1);
290   }
291
292   /* 10. Check destination issues. */
293}
294
295static bool
296is_compactable_immediate(unsigned imm)
297{
298   /* We get the low 12 bits as-is. */
299   imm &= ~0xfff;
300
301   /* We get one bit replicated through the top 20 bits. */
302   return imm == 0 || imm == 0xfffff000;
303}
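/* A few concrete cases of the rule above (low 12 bits arbitrary, top 20
 * bits must be all zeros or all ones):
 *
 *    is_compactable_immediate(0x00000fff)   -> true
 *    is_compactable_immediate(0xfffff800)   -> true    (-2048)
 *    is_compactable_immediate(0x00001000)   -> false   (bit 12 set)
 *    is_compactable_immediate(0x80000000)   -> false
 */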
304
305void
306brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
307	     struct brw_reg reg)
308{
309   struct brw_context *brw = p->brw;
310
311   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
312      assert(reg.nr < 128);
313
314   gen7_convert_mrf_to_grf(p, &reg);
315
316   if (brw->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
317                           insn->header.opcode == BRW_OPCODE_SENDC)) {
318      /* Any source modifiers or regions will be ignored, since this just
319       * identifies the MRF/GRF to start reading the message contents from.
320       * Check for some likely failures.
321       */
322      assert(!reg.negate);
323      assert(!reg.abs);
324      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
325   }
326
327   validate_reg(insn, reg);
328
329   insn->bits1.da1.src0_reg_file = reg.file;
330   insn->bits1.da1.src0_reg_type =
331      brw_reg_type_to_hw_type(brw, reg.type, reg.file);
332   insn->bits2.da1.src0_abs = reg.abs;
333   insn->bits2.da1.src0_negate = reg.negate;
334   insn->bits2.da1.src0_address_mode = reg.address_mode;
335
336   if (reg.file == BRW_IMMEDIATE_VALUE) {
337      insn->bits3.ud = reg.dw1.ud;
338
339      /* The Bspec's section titled "Non-present Operands" claims that if src0
340       * is an immediate that src1's type must be the same as that of src0.
341       *
342       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
343       * that do not follow this rule. E.g., from the IVB/HSW table:
344       *
345       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
346       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
347       *
348       * And from the SNB table:
349       *
350       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
351       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
352       *
353       * Neither of these cause warnings from the simulator when used,
354       * compacted or otherwise. In fact, all compaction mappings that have an
355       * immediate in src0 use a:ud for src1.
356       *
357       * The GM45 instruction compaction tables do not contain mapped meanings
358       * so it's not clear whether it has the restriction. We'll assume it was
359       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
360       */
361      insn->bits1.da1.src1_reg_file = 0; /* arf */
362      if (brw->gen < 6) {
363         insn->bits1.da1.src1_reg_type = insn->bits1.da1.src0_reg_type;
364      } else {
365         insn->bits1.da1.src1_reg_type = BRW_HW_REG_TYPE_UD;
366      }
367
368      /* Compacted instructions only have 12 bits (plus 1 for the other 20)
369       * for immediate values. Presumably the hardware engineers realized
370       * that the only useful floating-point value that could be represented
371       * in this format is 0.0, which can also be represented as a VF-typed
372       * immediate, so they gave us the previously mentioned mapping on IVB+.
373       *
374       * Strangely, we do have a mapping for imm:f in src1, so we don't need
375       * to do this there.
376       *
377       * If we see a 0.0:F, change the type to VF so that it can be compacted.
378       */
379      if (insn->bits3.ud == 0x0 &&
380          insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_F) {
381         insn->bits1.da1.src0_reg_type = BRW_HW_REG_IMM_TYPE_VF;
382      }
383
384      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
385       * set the types to :UD so the instruction can be compacted.
386       */
387      if (is_compactable_immediate(insn->bits3.ud) &&
388          insn->header.destreg__conditionalmod == BRW_CONDITIONAL_NONE &&
389          insn->bits1.da1.src0_reg_type == BRW_HW_REG_TYPE_D &&
390          insn->bits1.da1.dest_reg_type == BRW_HW_REG_TYPE_D) {
391         insn->bits1.da1.src0_reg_type = BRW_HW_REG_TYPE_UD;
392         insn->bits1.da1.dest_reg_type = BRW_HW_REG_TYPE_UD;
393      }
394   } else {
395      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
396	 if (insn->header.access_mode == BRW_ALIGN_1) {
397	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
398	    insn->bits2.da1.src0_reg_nr = reg.nr;
399	 } else {
400	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
401	    insn->bits2.da16.src0_reg_nr = reg.nr;
402	 }
403      } else {
404	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
405
406	 if (insn->header.access_mode == BRW_ALIGN_1) {
407	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
408	 } else {
409	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
410	 }
411      }
412
413      if (insn->header.access_mode == BRW_ALIGN_1) {
414	 if (reg.width == BRW_WIDTH_1 &&
415	     insn->header.execution_size == BRW_EXECUTE_1) {
416	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
417	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
418	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
419	 } else {
420	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
421	    insn->bits2.da1.src0_width = reg.width;
422	    insn->bits2.da1.src0_vert_stride = reg.vstride;
423	 }
424      } else {
425	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
426	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
427	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
428	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
429
430	 /* This is an oddity of the fact that we use the same register
431	  * descriptions for align_16 as for align_1:
432	  */
433	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
434	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
435	 else
436	    insn->bits2.da16.src0_vert_stride = reg.vstride;
437      }
438   }
439}
440
441
442void
443brw_set_src1(struct brw_compile *p,
444             struct brw_instruction *insn,
445             struct brw_reg reg)
446{
447   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
448
449   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
450      assert(reg.nr < 128);
451
452   gen7_convert_mrf_to_grf(p, &reg);
453
454   validate_reg(insn, reg);
455
456   insn->bits1.da1.src1_reg_file = reg.file;
457   insn->bits1.da1.src1_reg_type =
458      brw_reg_type_to_hw_type(p->brw, reg.type, reg.file);
459   insn->bits3.da1.src1_abs = reg.abs;
460   insn->bits3.da1.src1_negate = reg.negate;
461
462   /* Only src1 can be immediate in two-argument instructions.
463    */
464   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
465
466   if (reg.file == BRW_IMMEDIATE_VALUE) {
467      insn->bits3.ud = reg.dw1.ud;
468   } else {
469      /* This is a hardware restriction, which may or may not be lifted
470       * in the future:
471       */
472      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
473      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
474
475      if (insn->header.access_mode == BRW_ALIGN_1) {
476	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
477	 insn->bits3.da1.src1_reg_nr = reg.nr;
478      } else {
479	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
480	 insn->bits3.da16.src1_reg_nr = reg.nr;
481      }
482
483      if (insn->header.access_mode == BRW_ALIGN_1) {
484	 if (reg.width == BRW_WIDTH_1 &&
485	     insn->header.execution_size == BRW_EXECUTE_1) {
486	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
487	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
488	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
489	 } else {
490	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
491	    insn->bits3.da1.src1_width = reg.width;
492	    insn->bits3.da1.src1_vert_stride = reg.vstride;
493	 }
494      } else {
495	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
496	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
497	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
498	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
499
500	 /* This is an oddity of the fact that we use the same register
501	  * descriptions for align_16 as for align_1:
502	  */
503	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
504	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
505	 else
506	    insn->bits3.da16.src1_vert_stride = reg.vstride;
507      }
508   }
509}
510
511/**
512 * Set the Message Descriptor and Extended Message Descriptor fields
513 * for SEND messages.
514 *
515 * \note This zeroes out the Function Control bits, so it must be called
516 *       \b before filling out any message-specific data.  Callers can
517 *       choose not to fill in irrelevant bits; they will be zero.
518 */
519static void
520brw_set_message_descriptor(struct brw_compile *p,
521			   struct brw_instruction *inst,
522			   enum brw_message_target sfid,
523			   unsigned msg_length,
524			   unsigned response_length,
525			   bool header_present,
526			   bool end_of_thread)
527{
528   struct brw_context *brw = p->brw;
529
530   brw_set_src1(p, inst, brw_imm_d(0));
531
532   if (brw->gen >= 5) {
533      inst->bits3.generic_gen5.header_present = header_present;
534      inst->bits3.generic_gen5.response_length = response_length;
535      inst->bits3.generic_gen5.msg_length = msg_length;
536      inst->bits3.generic_gen5.end_of_thread = end_of_thread;
537
538      if (brw->gen >= 6) {
539	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
540	 inst->header.destreg__conditionalmod = sfid;
541      } else {
542	 /* Set Extended Message Descriptor (ex_desc) */
543	 inst->bits2.send_gen5.sfid = sfid;
544	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
545      }
546   } else {
547      inst->bits3.generic.response_length = response_length;
548      inst->bits3.generic.msg_length = msg_length;
549      inst->bits3.generic.msg_target = sfid;
550      inst->bits3.generic.end_of_thread = end_of_thread;
551   }
552}
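/* The ordering requirement from the comment above shows up in every caller
 * below; the pattern is roughly
 *
 *    brw_set_message_descriptor(p, insn, sfid, mlen, rlen, header, eot);
 *    insn->bits3.<unit>.field = ...;   // message-specific bits afterwards
 *
 * because this helper rewrites src1 (bits3) wholesale via
 * brw_set_src1(p, insn, brw_imm_d(0)) before any per-unit fields are set.
 * (<unit> is a placeholder for math/urb/dp/sampler and so on.)
 */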
553
554static void brw_set_math_message( struct brw_compile *p,
555				  struct brw_instruction *insn,
556				  unsigned function,
557				  unsigned integer_type,
558				  bool low_precision,
559				  unsigned dataType )
560{
561   struct brw_context *brw = p->brw;
562   unsigned msg_length;
563   unsigned response_length;
564
565   /* Infer message length from the function */
566   switch (function) {
567   case BRW_MATH_FUNCTION_POW:
568   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
569   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
570   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
571      msg_length = 2;
572      break;
573   default:
574      msg_length = 1;
575      break;
576   }
577
578   /* Infer response length from the function */
579   switch (function) {
580   case BRW_MATH_FUNCTION_SINCOS:
581   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
582      response_length = 2;
583      break;
584   default:
585      response_length = 1;
586      break;
587   }
588
589
590   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
591			      msg_length, response_length, false, false);
592   if (brw->gen == 5) {
593      insn->bits3.math_gen5.function = function;
594      insn->bits3.math_gen5.int_type = integer_type;
595      insn->bits3.math_gen5.precision = low_precision;
596      insn->bits3.math_gen5.saturate = insn->header.saturate;
597      insn->bits3.math_gen5.data_type = dataType;
598      insn->bits3.math_gen5.snapshot = 0;
599   } else {
600      insn->bits3.math.function = function;
601      insn->bits3.math.int_type = integer_type;
602      insn->bits3.math.precision = low_precision;
603      insn->bits3.math.saturate = insn->header.saturate;
604      insn->bits3.math.data_type = dataType;
605   }
606   insn->header.saturate = 0;
607}
608
609
610static void brw_set_ff_sync_message(struct brw_compile *p,
611				    struct brw_instruction *insn,
612				    bool allocate,
613				    unsigned response_length,
614				    bool end_of_thread)
615{
616   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
617			      1, response_length, true, end_of_thread);
618   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
619   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
620   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
621   insn->bits3.urb_gen5.allocate = allocate;
622   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
623   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
624}
625
626static void brw_set_urb_message( struct brw_compile *p,
627				 struct brw_instruction *insn,
628                                 enum brw_urb_write_flags flags,
629				 unsigned msg_length,
630				 unsigned response_length,
631				 unsigned offset,
632				 unsigned swizzle_control )
633{
634   struct brw_context *brw = p->brw;
635
636   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
637			      msg_length, response_length, true,
638                              flags & BRW_URB_WRITE_EOT);
639   if (brw->gen == 7) {
640      if (flags & BRW_URB_WRITE_OWORD) {
641         assert(msg_length == 2); /* header + one OWORD of data */
642         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_OWORD;
643      } else {
644         insn->bits3.urb_gen7.opcode = BRW_URB_OPCODE_WRITE_HWORD;
645      }
646      insn->bits3.urb_gen7.offset = offset;
647      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
648      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
649      insn->bits3.urb_gen7.per_slot_offset =
650         flags & BRW_URB_WRITE_PER_SLOT_OFFSET ? 1 : 0;
651      insn->bits3.urb_gen7.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
652   } else if (brw->gen >= 5) {
653      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
654      insn->bits3.urb_gen5.offset = offset;
655      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
656      insn->bits3.urb_gen5.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
657      insn->bits3.urb_gen5.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
658      insn->bits3.urb_gen5.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
659   } else {
660      insn->bits3.urb.opcode = 0;	/* ? */
661      insn->bits3.urb.offset = offset;
662      insn->bits3.urb.swizzle_control = swizzle_control;
663      insn->bits3.urb.allocate = flags & BRW_URB_WRITE_ALLOCATE ? 1 : 0;
664      insn->bits3.urb.used = flags & BRW_URB_WRITE_UNUSED ? 0 : 1;
665      insn->bits3.urb.complete = flags & BRW_URB_WRITE_COMPLETE ? 1 : 0;
666   }
667}
668
669void
670brw_set_dp_write_message(struct brw_compile *p,
671			 struct brw_instruction *insn,
672			 unsigned binding_table_index,
673			 unsigned msg_control,
674			 unsigned msg_type,
675			 unsigned msg_length,
676			 bool header_present,
677			 unsigned last_render_target,
678			 unsigned response_length,
679			 unsigned end_of_thread,
680			 unsigned send_commit_msg)
681{
682   struct brw_context *brw = p->brw;
683   unsigned sfid;
684
685   if (brw->gen >= 7) {
686      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
687      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
688	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
689      else
690	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
691   } else if (brw->gen == 6) {
692      /* Use the render cache for all write messages. */
693      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
694   } else {
695      sfid = BRW_SFID_DATAPORT_WRITE;
696   }
697
698   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
699			      header_present, end_of_thread);
700
701   if (brw->gen >= 7) {
702      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
703      insn->bits3.gen7_dp.msg_control = msg_control;
704      insn->bits3.gen7_dp.last_render_target = last_render_target;
705      insn->bits3.gen7_dp.msg_type = msg_type;
706   } else if (brw->gen == 6) {
707      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
708      insn->bits3.gen6_dp.msg_control = msg_control;
709      insn->bits3.gen6_dp.last_render_target = last_render_target;
710      insn->bits3.gen6_dp.msg_type = msg_type;
711      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
712   } else if (brw->gen == 5) {
713      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
714      insn->bits3.dp_write_gen5.msg_control = msg_control;
715      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
716      insn->bits3.dp_write_gen5.msg_type = msg_type;
717      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
718   } else {
719      insn->bits3.dp_write.binding_table_index = binding_table_index;
720      insn->bits3.dp_write.msg_control = msg_control;
721      insn->bits3.dp_write.last_render_target = last_render_target;
722      insn->bits3.dp_write.msg_type = msg_type;
723      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
724   }
725}
726
727void
728brw_set_dp_read_message(struct brw_compile *p,
729			struct brw_instruction *insn,
730			unsigned binding_table_index,
731			unsigned msg_control,
732			unsigned msg_type,
733			unsigned target_cache,
734			unsigned msg_length,
735                        bool header_present,
736			unsigned response_length)
737{
738   struct brw_context *brw = p->brw;
739   unsigned sfid;
740
741   if (brw->gen >= 7) {
742      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
743   } else if (brw->gen == 6) {
744      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
745	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
746      else
747	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
748   } else {
749      sfid = BRW_SFID_DATAPORT_READ;
750   }
751
752   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
753			      header_present, false);
754
755   if (brw->gen >= 7) {
756      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
757      insn->bits3.gen7_dp.msg_control = msg_control;
758      insn->bits3.gen7_dp.last_render_target = 0;
759      insn->bits3.gen7_dp.msg_type = msg_type;
760   } else if (brw->gen == 6) {
761      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
762      insn->bits3.gen6_dp.msg_control = msg_control;
763      insn->bits3.gen6_dp.last_render_target = 0;
764      insn->bits3.gen6_dp.msg_type = msg_type;
765      insn->bits3.gen6_dp.send_commit_msg = 0;
766   } else if (brw->gen == 5) {
767      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
768      insn->bits3.dp_read_gen5.msg_control = msg_control;
769      insn->bits3.dp_read_gen5.msg_type = msg_type;
770      insn->bits3.dp_read_gen5.target_cache = target_cache;
771   } else if (brw->is_g4x) {
772      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
773      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
774      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
775      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
776   } else {
777      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
778      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
779      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
780      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
781   }
782}
783
784void
785brw_set_sampler_message(struct brw_compile *p,
786                        struct brw_instruction *insn,
787                        unsigned binding_table_index,
788                        unsigned sampler,
789                        unsigned msg_type,
790                        unsigned response_length,
791                        unsigned msg_length,
792                        unsigned header_present,
793                        unsigned simd_mode,
794                        unsigned return_format)
795{
796   struct brw_context *brw = p->brw;
797
798   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
799			      response_length, header_present, false);
800
801   if (brw->gen >= 7) {
802      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
803      insn->bits3.sampler_gen7.sampler = sampler;
804      insn->bits3.sampler_gen7.msg_type = msg_type;
805      insn->bits3.sampler_gen7.simd_mode = simd_mode;
806   } else if (brw->gen >= 5) {
807      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
808      insn->bits3.sampler_gen5.sampler = sampler;
809      insn->bits3.sampler_gen5.msg_type = msg_type;
810      insn->bits3.sampler_gen5.simd_mode = simd_mode;
811   } else if (brw->is_g4x) {
812      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
813      insn->bits3.sampler_g4x.sampler = sampler;
814      insn->bits3.sampler_g4x.msg_type = msg_type;
815   } else {
816      insn->bits3.sampler.binding_table_index = binding_table_index;
817      insn->bits3.sampler.sampler = sampler;
818      insn->bits3.sampler.msg_type = msg_type;
819      insn->bits3.sampler.return_format = return_format;
820   }
821}
822
823
824#define next_insn brw_next_insn
825struct brw_instruction *
826brw_next_insn(struct brw_compile *p, unsigned opcode)
827{
828   struct brw_instruction *insn;
829
830   if (p->nr_insn + 1 > p->store_size) {
831      p->store_size <<= 1;
832      p->store = reralloc(p->mem_ctx, p->store,
833                          struct brw_instruction, p->store_size);
834   }
835
836   p->next_insn_offset += 16;
837   insn = &p->store[p->nr_insn++];
838   memcpy(insn, p->current, sizeof(*insn));
839
840   insn->header.opcode = opcode;
841   return insn;
842}
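/* Note: next_insn_offset advances by 16 because a native brw_instruction is
 * 128 bits (16 bytes).  p->current holds the default instruction state that
 * brw_set_default_* and brw_push/pop_insn_state manipulate; it is copied
 * into every newly allocated slot before the opcode is set.
 */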
843
844static struct brw_instruction *brw_alu1( struct brw_compile *p,
845					 unsigned opcode,
846					 struct brw_reg dest,
847					 struct brw_reg src )
848{
849   struct brw_instruction *insn = next_insn(p, opcode);
850   brw_set_dest(p, insn, dest);
851   brw_set_src0(p, insn, src);
852   return insn;
853}
854
855static struct brw_instruction *brw_alu2(struct brw_compile *p,
856					unsigned opcode,
857					struct brw_reg dest,
858					struct brw_reg src0,
859					struct brw_reg src1 )
860{
861   struct brw_instruction *insn = next_insn(p, opcode);
862   brw_set_dest(p, insn, dest);
863   brw_set_src0(p, insn, src0);
864   brw_set_src1(p, insn, src1);
865   return insn;
866}
867
868static int
869get_3src_subreg_nr(struct brw_reg reg)
870{
871   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
872      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
873      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
874   } else {
875      return reg.subnr / 4;
876   }
877}
878
879static struct brw_instruction *brw_alu3(struct brw_compile *p,
880					unsigned opcode,
881					struct brw_reg dest,
882					struct brw_reg src0,
883					struct brw_reg src1,
884					struct brw_reg src2)
885{
886   struct brw_context *brw = p->brw;
887   struct brw_instruction *insn = next_insn(p, opcode);
888
889   gen7_convert_mrf_to_grf(p, &dest);
890
891   assert(insn->header.access_mode == BRW_ALIGN_16);
892
893   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
894	  dest.file == BRW_MESSAGE_REGISTER_FILE);
895   assert(dest.nr < 128);
896   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
897   assert(dest.type == BRW_REGISTER_TYPE_F ||
898          dest.type == BRW_REGISTER_TYPE_D ||
899          dest.type == BRW_REGISTER_TYPE_UD);
900   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
901   insn->bits1.da3src.dest_reg_nr = dest.nr;
902   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
903   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
904   guess_execution_size(p, insn, dest);
905
906   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
907   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
908   assert(src0.nr < 128);
909   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
910   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
911   insn->bits2.da3src.src0_reg_nr = src0.nr;
912   insn->bits1.da3src.src0_abs = src0.abs;
913   insn->bits1.da3src.src0_negate = src0.negate;
914   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
915
916   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
917   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
918   assert(src1.nr < 128);
919   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
920   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
921   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
922   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
923   insn->bits3.da3src.src1_reg_nr = src1.nr;
924   insn->bits1.da3src.src1_abs = src1.abs;
925   insn->bits1.da3src.src1_negate = src1.negate;
926
927   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
928   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
929   assert(src2.nr < 128);
930   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
931   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
932   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
933   insn->bits3.da3src.src2_reg_nr = src2.nr;
934   insn->bits1.da3src.src2_abs = src2.abs;
935   insn->bits1.da3src.src2_negate = src2.negate;
936
937   if (brw->gen >= 7) {
938      /* Set both the source and destination types based on dest.type,
939       * ignoring the source register types.  The MAD and LRP emitters ensure
940       * that all four types are float.  The BFE and BFI2 emitters, however,
941       * may send us mixed D and UD types and want us to ignore that and use
942       * the destination type.
943       */
944      switch (dest.type) {
945      case BRW_REGISTER_TYPE_F:
946         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_F;
947         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_F;
948         break;
949      case BRW_REGISTER_TYPE_D:
950         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_D;
951         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_D;
952         break;
953      case BRW_REGISTER_TYPE_UD:
954         insn->bits1.da3src.src_type = BRW_3SRC_TYPE_UD;
955         insn->bits1.da3src.dst_type = BRW_3SRC_TYPE_UD;
956         break;
957      }
958   }
959
960   return insn;
961}
962
963
964/***********************************************************************
965 * Convenience routines.
966 */
967#define ALU1(OP)					\
968struct brw_instruction *brw_##OP(struct brw_compile *p,	\
969	      struct brw_reg dest,			\
970	      struct brw_reg src0)   			\
971{							\
972   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
973}
974
975#define ALU2(OP)					\
976struct brw_instruction *brw_##OP(struct brw_compile *p,	\
977	      struct brw_reg dest,			\
978	      struct brw_reg src0,			\
979	      struct brw_reg src1)   			\
980{							\
981   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
982}
983
984#define ALU3(OP)					\
985struct brw_instruction *brw_##OP(struct brw_compile *p,	\
986	      struct brw_reg dest,			\
987	      struct brw_reg src0,			\
988	      struct brw_reg src1,			\
989	      struct brw_reg src2)   			\
990{							\
991   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
992}
993
994#define ALU3F(OP)                                               \
995struct brw_instruction *brw_##OP(struct brw_compile *p,         \
996                                 struct brw_reg dest,           \
997                                 struct brw_reg src0,           \
998                                 struct brw_reg src1,           \
999                                 struct brw_reg src2)           \
1000{                                                               \
1001   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
1002   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
1003   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
1004   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
1005   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
1006}
1007
1008/* Rounding operations (other than RNDD) require two instructions - the first
1009 * stores a rounded value (possibly the wrong way) in the dest register, but
1010 * also sets a per-channel "increment bit" in the flag register.  A predicated
1011 * add of 1.0 fixes dest to contain the desired result.
1012 *
1013 * Sandybridge and later appear to round correctly without an ADD.
1014 */
1015#define ROUND(OP)							      \
1016void brw_##OP(struct brw_compile *p,					      \
1017	      struct brw_reg dest,					      \
1018	      struct brw_reg src)					      \
1019{									      \
1020   struct brw_instruction *rnd, *add;					      \
1021   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
1022   brw_set_dest(p, rnd, dest);						      \
1023   brw_set_src0(p, rnd, src);						      \
1024									      \
1025   if (p->brw->gen < 6) {						      \
1026      /* turn on round-increments */					      \
1027      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
1028      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
1029      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
1030   }									      \
1031}
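/* For example, ROUND(RNDZ) defines brw_RNDZ(), which emits a single RNDZ on
 * gen6+; on earlier gens it emits RNDZ with the R conditional modifier
 * followed by a predicated ADD of 1.0f, as described in the comment above.
 */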
1032
1033
1034ALU1(MOV)
1035ALU2(SEL)
1036ALU1(NOT)
1037ALU2(AND)
1038ALU2(OR)
1039ALU2(XOR)
1040ALU2(SHR)
1041ALU2(SHL)
1042ALU2(ASR)
1043ALU1(F32TO16)
1044ALU1(F16TO32)
1045ALU1(FRC)
1046ALU1(RNDD)
1047ALU2(MAC)
1048ALU2(MACH)
1049ALU1(LZD)
1050ALU2(DP4)
1051ALU2(DPH)
1052ALU2(DP3)
1053ALU2(DP2)
1054ALU2(LINE)
1055ALU2(PLN)
1056ALU3F(MAD)
1057ALU3F(LRP)
1058ALU1(BFREV)
1059ALU3(BFE)
1060ALU2(BFI1)
1061ALU3(BFI2)
1062ALU1(FBH)
1063ALU1(FBL)
1064ALU1(CBIT)
1065ALU2(ADDC)
1066ALU2(SUBB)
1067
1068ROUND(RNDZ)
1069ROUND(RNDE)
1070
1071
1072struct brw_instruction *brw_ADD(struct brw_compile *p,
1073				struct brw_reg dest,
1074				struct brw_reg src0,
1075				struct brw_reg src1)
1076{
1077   /* 6.2.2: add */
1078   if (src0.type == BRW_REGISTER_TYPE_F ||
1079       (src0.file == BRW_IMMEDIATE_VALUE &&
1080	src0.type == BRW_REGISTER_TYPE_VF)) {
1081      assert(src1.type != BRW_REGISTER_TYPE_UD);
1082      assert(src1.type != BRW_REGISTER_TYPE_D);
1083   }
1084
1085   if (src1.type == BRW_REGISTER_TYPE_F ||
1086       (src1.file == BRW_IMMEDIATE_VALUE &&
1087	src1.type == BRW_REGISTER_TYPE_VF)) {
1088      assert(src0.type != BRW_REGISTER_TYPE_UD);
1089      assert(src0.type != BRW_REGISTER_TYPE_D);
1090   }
1091
1092   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1093}
1094
1095struct brw_instruction *brw_AVG(struct brw_compile *p,
1096                                struct brw_reg dest,
1097                                struct brw_reg src0,
1098                                struct brw_reg src1)
1099{
1100   assert(dest.type == src0.type);
1101   assert(src0.type == src1.type);
1102   switch (src0.type) {
1103   case BRW_REGISTER_TYPE_B:
1104   case BRW_REGISTER_TYPE_UB:
1105   case BRW_REGISTER_TYPE_W:
1106   case BRW_REGISTER_TYPE_UW:
1107   case BRW_REGISTER_TYPE_D:
1108   case BRW_REGISTER_TYPE_UD:
1109      break;
1110   default:
1111      assert(!"Bad type for brw_AVG");
1112   }
1113
1114   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1115}
1116
1117struct brw_instruction *brw_MUL(struct brw_compile *p,
1118				struct brw_reg dest,
1119				struct brw_reg src0,
1120				struct brw_reg src1)
1121{
1122   /* 6.32.38: mul */
1123   if (src0.type == BRW_REGISTER_TYPE_D ||
1124       src0.type == BRW_REGISTER_TYPE_UD ||
1125       src1.type == BRW_REGISTER_TYPE_D ||
1126       src1.type == BRW_REGISTER_TYPE_UD) {
1127      assert(dest.type != BRW_REGISTER_TYPE_F);
1128   }
1129
1130   if (src0.type == BRW_REGISTER_TYPE_F ||
1131       (src0.file == BRW_IMMEDIATE_VALUE &&
1132	src0.type == BRW_REGISTER_TYPE_VF)) {
1133      assert(src1.type != BRW_REGISTER_TYPE_UD);
1134      assert(src1.type != BRW_REGISTER_TYPE_D);
1135   }
1136
1137   if (src1.type == BRW_REGISTER_TYPE_F ||
1138       (src1.file == BRW_IMMEDIATE_VALUE &&
1139	src1.type == BRW_REGISTER_TYPE_VF)) {
1140      assert(src0.type != BRW_REGISTER_TYPE_UD);
1141      assert(src0.type != BRW_REGISTER_TYPE_D);
1142   }
1143
1144   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1145	  src0.nr != BRW_ARF_ACCUMULATOR);
1146   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1147	  src1.nr != BRW_ARF_ACCUMULATOR);
1148
1149   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1150}
1151
1152
1153void brw_NOP(struct brw_compile *p)
1154{
1155   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1156   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1157   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1158   brw_set_src1(p, insn, brw_imm_ud(0x0));
1159}
1160
1161
1162
1163
1164
1165/***********************************************************************
1166 * Comparisons, if/else/endif
1167 */
1168
1169struct brw_instruction *brw_JMPI(struct brw_compile *p,
1170                                 struct brw_reg index,
1171                                 unsigned predicate_control)
1172{
1173   struct brw_reg ip = brw_ip_reg();
1174   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1175
1176   insn->header.execution_size = 1;
1177   insn->header.compression_control = BRW_COMPRESSION_NONE;
1178   insn->header.mask_control = BRW_MASK_DISABLE;
1179   insn->header.predicate_control = predicate_control;
1180
1181   return insn;
1182}
1183
1184static void
1185push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1186{
1187   p->if_stack[p->if_stack_depth] = inst - p->store;
1188
1189   p->if_stack_depth++;
1190   if (p->if_stack_array_size <= p->if_stack_depth) {
1191      p->if_stack_array_size *= 2;
1192      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1193			     p->if_stack_array_size);
1194   }
1195}
1196
1197static struct brw_instruction *
1198pop_if_stack(struct brw_compile *p)
1199{
1200   p->if_stack_depth--;
1201   return &p->store[p->if_stack[p->if_stack_depth]];
1202}
1203
1204static void
1205push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1206{
1207   if (p->loop_stack_array_size < p->loop_stack_depth) {
1208      p->loop_stack_array_size *= 2;
1209      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1210			       p->loop_stack_array_size);
1211      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1212				     p->loop_stack_array_size);
1213   }
1214
1215   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1216   p->loop_stack_depth++;
1217   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1218}
1219
1220static struct brw_instruction *
1221get_inner_do_insn(struct brw_compile *p)
1222{
1223   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1224}
1225
1226/* EU takes the value from the flag register and pushes it onto some
1227 * sort of a stack (presumably merging with any flag value already on
1228 * the stack).  Within an if block, the flags at the top of the stack
1229 * control execution on each channel of the unit, e.g. on each of the
1230 * 16 pixel values in our wm programs.
1231 *
1232 * When the matching 'else' instruction is reached (presumably by
1233 * countdown of the instruction count patched in by our ELSE/ENDIF
1234 * functions), the relevant flags are inverted.
1235 *
1236 * When the matching 'endif' instruction is reached, the flags are
1237 * popped off.  If the stack is now empty, normal execution resumes.
1238 */
1239struct brw_instruction *
1240brw_IF(struct brw_compile *p, unsigned execute_size)
1241{
1242   struct brw_context *brw = p->brw;
1243   struct brw_instruction *insn;
1244
1245   insn = next_insn(p, BRW_OPCODE_IF);
1246
1247   /* Override the defaults for this instruction:
1248    */
1249   if (brw->gen < 6) {
1250      brw_set_dest(p, insn, brw_ip_reg());
1251      brw_set_src0(p, insn, brw_ip_reg());
1252      brw_set_src1(p, insn, brw_imm_d(0x0));
1253   } else if (brw->gen == 6) {
1254      brw_set_dest(p, insn, brw_imm_w(0));
1255      insn->bits1.branch_gen6.jump_count = 0;
1256      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1257      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1258   } else {
1259      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1260      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1261      brw_set_src1(p, insn, brw_imm_ud(0));
1262      insn->bits3.break_cont.jip = 0;
1263      insn->bits3.break_cont.uip = 0;
1264   }
1265
1266   insn->header.execution_size = execute_size;
1267   insn->header.compression_control = BRW_COMPRESSION_NONE;
1268   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
1269   insn->header.mask_control = BRW_MASK_ENABLE;
1270   if (!p->single_program_flow)
1271      insn->header.thread_control = BRW_THREAD_SWITCH;
1272
1273   push_if_stack(p, insn);
1274   p->if_depth_in_loop[p->loop_stack_depth]++;
1275   return insn;
1276}
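/* Typical emitter-side usage of the structured control-flow helpers (an
 * illustrative sketch; the compare and the execution size are assumptions,
 * not code from this file):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, brw_imm_f(0.0f));
 *    brw_IF(p, BRW_EXECUTE_8);
 *       ...then-block instructions...
 *    brw_ELSE(p);
 *       ...else-block instructions...
 *    brw_ENDIF(p);   // pops the if-stack and patches the jump targets
 */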
1277
1278/* This function is only used for gen6-style IF instructions with an
1279 * embedded comparison (conditional modifier).  It is not used on gen7.
1280 */
1281struct brw_instruction *
1282gen6_IF(struct brw_compile *p, uint32_t conditional,
1283	struct brw_reg src0, struct brw_reg src1)
1284{
1285   struct brw_instruction *insn;
1286
1287   insn = next_insn(p, BRW_OPCODE_IF);
1288
1289   brw_set_dest(p, insn, brw_imm_w(0));
1290   if (p->compressed) {
1291      insn->header.execution_size = BRW_EXECUTE_16;
1292   } else {
1293      insn->header.execution_size = BRW_EXECUTE_8;
1294   }
1295   insn->bits1.branch_gen6.jump_count = 0;
1296   brw_set_src0(p, insn, src0);
1297   brw_set_src1(p, insn, src1);
1298
1299   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1300   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1301   insn->header.destreg__conditionalmod = conditional;
1302
1303   if (!p->single_program_flow)
1304      insn->header.thread_control = BRW_THREAD_SWITCH;
1305
1306   push_if_stack(p, insn);
1307   return insn;
1308}
1309
1310/**
1311 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1312 */
1313static void
1314convert_IF_ELSE_to_ADD(struct brw_compile *p,
1315		       struct brw_instruction *if_inst,
1316		       struct brw_instruction *else_inst)
1317{
1318   /* The next instruction (where the ENDIF would be, if it existed) */
1319   struct brw_instruction *next_inst = &p->store[p->nr_insn];
1320
1321   assert(p->single_program_flow);
1322   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1323   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1324   assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1325
1326   /* Convert IF to an ADD instruction that moves the instruction pointer
1327    * to the first instruction of the ELSE block.  If there is no ELSE
1328    * block, point to where ENDIF would be.  Reverse the predicate.
1329    *
1330    * There's no need to execute an ENDIF since we don't need to do any
1331    * stack operations, and if we're currently executing, we just want to
1332    * continue normally.
1333    */
1334   if_inst->header.opcode = BRW_OPCODE_ADD;
1335   if_inst->header.predicate_inverse = 1;
1336
1337   if (else_inst != NULL) {
1338      /* Convert ELSE to an ADD instruction that points where the ENDIF
1339       * would be.
1340       */
1341      else_inst->header.opcode = BRW_OPCODE_ADD;
1342
1343      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1344      else_inst->bits3.ud = (next_inst - else_inst) * 16;
1345   } else {
1346      if_inst->bits3.ud = (next_inst - if_inst) * 16;
1347   }
1348}
1349
1350/**
1351 * Patch IF and ELSE instructions with appropriate jump targets.
1352 */
1353static void
1354patch_IF_ELSE(struct brw_compile *p,
1355	      struct brw_instruction *if_inst,
1356	      struct brw_instruction *else_inst,
1357	      struct brw_instruction *endif_inst)
1358{
1359   struct brw_context *brw = p->brw;
1360
1361   /* We shouldn't be patching IF and ELSE instructions in single program flow
1362    * mode when gen < 6, because in single program flow mode on those
1363    * platforms, we convert flow control instructions to conditional ADDs that
1364    * operate on IP (see brw_ENDIF).
1365    *
1366    * However, on Gen6, writing to IP doesn't work in single program flow mode
1367    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1368    * not be updated by non-flow control instructions.").  And on later
1369    * platforms, there is no significant benefit to converting control flow
1370    * instructions to conditional ADDs.  So we do patch IF and ELSE
1371    * instructions in single program flow mode on those platforms.
1372    */
1373   if (brw->gen < 6)
1374      assert(!p->single_program_flow);
1375
1376   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1377   assert(endif_inst != NULL);
1378   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1379
1380   unsigned br = 1;
1381   /* Jump count is in units of 64-bit data chunks, so one 128-bit instruction
1382    * requires 2 chunks.
1383    */
1384   if (brw->gen >= 5)
1385      br = 2;
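   /* E.g. on gen6, with br == 2, an ENDIF sitting three instructions after
    * its IF gets jump_count = 2 * 3 = 6: distances are counted in 64-bit
    * halves of the 128-bit instructions.
    */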
1386
1387   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1388   endif_inst->header.execution_size = if_inst->header.execution_size;
1389
1390   if (else_inst == NULL) {
1391      /* Patch IF -> ENDIF */
1392      if (brw->gen < 6) {
1393	 /* Turn it into an IFF, which means no mask stack operations for
1394	  * all-false and jumping past the ENDIF.
1395	  */
1396	 if_inst->header.opcode = BRW_OPCODE_IFF;
1397	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1398	 if_inst->bits3.if_else.pop_count = 0;
1399	 if_inst->bits3.if_else.pad0 = 0;
1400      } else if (brw->gen == 6) {
1401	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1402	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1403      } else {
1404	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1405	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
1406      }
1407   } else {
1408      else_inst->header.execution_size = if_inst->header.execution_size;
1409
1410      /* Patch IF -> ELSE */
1411      if (brw->gen < 6) {
1412	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1413	 if_inst->bits3.if_else.pop_count = 0;
1414	 if_inst->bits3.if_else.pad0 = 0;
1415      } else if (brw->gen == 6) {
1416	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1417      }
1418
1419      /* Patch ELSE -> ENDIF */
1420      if (brw->gen < 6) {
1421	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1422	  * matching ENDIF.
1423	  */
1424	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1425	 else_inst->bits3.if_else.pop_count = 1;
1426	 else_inst->bits3.if_else.pad0 = 0;
1427      } else if (brw->gen == 6) {
1428	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1429	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1430      } else {
1431	 /* The IF instruction's JIP should point just past the ELSE */
1432	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
1433	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1434	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
1435	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
1436      }
1437   }
1438}
1439
1440void
1441brw_ELSE(struct brw_compile *p)
1442{
1443   struct brw_context *brw = p->brw;
1444   struct brw_instruction *insn;
1445
1446   insn = next_insn(p, BRW_OPCODE_ELSE);
1447
1448   if (brw->gen < 6) {
1449      brw_set_dest(p, insn, brw_ip_reg());
1450      brw_set_src0(p, insn, brw_ip_reg());
1451      brw_set_src1(p, insn, brw_imm_d(0x0));
1452   } else if (brw->gen == 6) {
1453      brw_set_dest(p, insn, brw_imm_w(0));
1454      insn->bits1.branch_gen6.jump_count = 0;
1455      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1456      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1457   } else {
1458      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1459      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1460      brw_set_src1(p, insn, brw_imm_ud(0));
1461      insn->bits3.break_cont.jip = 0;
1462      insn->bits3.break_cont.uip = 0;
1463   }
1464
1465   insn->header.compression_control = BRW_COMPRESSION_NONE;
1466   insn->header.mask_control = BRW_MASK_ENABLE;
1467   if (!p->single_program_flow)
1468      insn->header.thread_control = BRW_THREAD_SWITCH;
1469
1470   push_if_stack(p, insn);
1471}
1472
1473void
1474brw_ENDIF(struct brw_compile *p)
1475{
1476   struct brw_context *brw = p->brw;
1477   struct brw_instruction *insn = NULL;
1478   struct brw_instruction *else_inst = NULL;
1479   struct brw_instruction *if_inst = NULL;
1480   struct brw_instruction *tmp;
1481   bool emit_endif = true;
1482
1483   /* In single program flow mode, we can express IF and ELSE instructions
1484    * equivalently as ADD instructions that operate on IP.  On platforms prior
1485    * to Gen6, flow control instructions cause an implied thread switch, so
1486    * this is a significant savings.
1487    *
1488    * However, on Gen6, writing to IP doesn't work in single program flow mode
1489    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1490    * not be updated by non-flow control instructions.").  And on later
1491    * platforms, there is no significant benefit to converting control flow
1492    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
1493    * Gen5.
1494    */
1495   if (brw->gen < 6 && p->single_program_flow)
1496      emit_endif = false;
1497
1498   /*
1499    * A single next_insn() may change the base address of the instruction store
1500    * memory (p->store), so call it first, before referencing the instruction
1501    * store pointer from an index.
1502    */
1503   if (emit_endif)
1504      insn = next_insn(p, BRW_OPCODE_ENDIF);
1505
1506   /* Pop the IF and (optional) ELSE instructions from the stack */
1507   p->if_depth_in_loop[p->loop_stack_depth]--;
1508   tmp = pop_if_stack(p);
1509   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
1510      else_inst = tmp;
1511      tmp = pop_if_stack(p);
1512   }
1513   if_inst = tmp;
1514
1515   if (!emit_endif) {
1516      /* ENDIF is useless; don't bother emitting it. */
1517      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1518      return;
1519   }
1520
1521   if (brw->gen < 6) {
1522      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1523      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1524      brw_set_src1(p, insn, brw_imm_d(0x0));
1525   } else if (brw->gen == 6) {
1526      brw_set_dest(p, insn, brw_imm_w(0));
1527      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1528      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1529   } else {
1530      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1531      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1532      brw_set_src1(p, insn, brw_imm_ud(0));
1533   }
1534
1535   insn->header.compression_control = BRW_COMPRESSION_NONE;
1536   insn->header.mask_control = BRW_MASK_ENABLE;
1537   insn->header.thread_control = BRW_THREAD_SWITCH;
1538
1539   /* Also pop item off the stack in the endif instruction: */
1540   if (brw->gen < 6) {
1541      insn->bits3.if_else.jump_count = 0;
1542      insn->bits3.if_else.pop_count = 1;
1543      insn->bits3.if_else.pad0 = 0;
1544   } else if (brw->gen == 6) {
1545      insn->bits1.branch_gen6.jump_count = 2;
1546   } else {
1547      insn->bits3.break_cont.jip = 2;
1548   }
1549   patch_IF_ELSE(p, if_inst, else_inst, insn);
1550}
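
/* A minimal usage sketch (illustrative, not emitted anywhere in this file):
 * callers bracket divergent code with brw_IF()/brw_ELSE()/brw_ENDIF(), and
 * the branch offsets are only filled in when the ENDIF is emitted, via
 * patch_IF_ELSE() above.  Assuming brw_IF() as declared in brw_eu.h:
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *    ... emit the "then" block ...
 *    brw_ELSE(p);
 *    ... emit the "else" block ...
 *    brw_ENDIF(p);      // pops the if-stack and patches the IF/ELSE jumps
 */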
1551
1552struct brw_instruction *brw_BREAK(struct brw_compile *p)
1553{
1554   struct brw_context *brw = p->brw;
1555   struct brw_instruction *insn;
1556
1557   insn = next_insn(p, BRW_OPCODE_BREAK);
1558   if (brw->gen >= 6) {
1559      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1560      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1561      brw_set_src1(p, insn, brw_imm_d(0x0));
1562   } else {
1563      brw_set_dest(p, insn, brw_ip_reg());
1564      brw_set_src0(p, insn, brw_ip_reg());
1565      brw_set_src1(p, insn, brw_imm_d(0x0));
1566      insn->bits3.if_else.pad0 = 0;
1567      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1568   }
1569   insn->header.compression_control = BRW_COMPRESSION_NONE;
1570   insn->header.execution_size = BRW_EXECUTE_8;
1571
1572   return insn;
1573}
1574
1575struct brw_instruction *gen6_CONT(struct brw_compile *p)
1576{
1577   struct brw_instruction *insn;
1578
1579   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1582   brw_set_dest(p, insn, brw_ip_reg());
1583   brw_set_src0(p, insn, brw_ip_reg());
1584   brw_set_src1(p, insn, brw_imm_d(0x0));
1585
1586   insn->header.compression_control = BRW_COMPRESSION_NONE;
1587   insn->header.execution_size = BRW_EXECUTE_8;
1588   return insn;
1589}
1590
1591struct brw_instruction *brw_CONT(struct brw_compile *p)
1592{
1593   struct brw_instruction *insn;
1594   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1595   brw_set_dest(p, insn, brw_ip_reg());
1596   brw_set_src0(p, insn, brw_ip_reg());
1597   brw_set_src1(p, insn, brw_imm_d(0x0));
1598   insn->header.compression_control = BRW_COMPRESSION_NONE;
1599   insn->header.execution_size = BRW_EXECUTE_8;
1600   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1601   insn->bits3.if_else.pad0 = 0;
1602   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1603   return insn;
1604}
1605
1606struct brw_instruction *gen6_HALT(struct brw_compile *p)
1607{
1608   struct brw_instruction *insn;
1609
1610   insn = next_insn(p, BRW_OPCODE_HALT);
1611   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1612   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1613   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1614
1615   if (p->compressed) {
1616      insn->header.execution_size = BRW_EXECUTE_16;
1617   } else {
1618      insn->header.compression_control = BRW_COMPRESSION_NONE;
1619      insn->header.execution_size = BRW_EXECUTE_8;
1620   }
1621   return insn;
1622}
1623
1624/* DO/WHILE loop:
1625 *
1626 * The DO/WHILE is just an unterminated loop -- break or continue are
1627 * used for control within the loop.  We have a few ways they can be
1628 * done.
1629 *
1630 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1631 * jip and no DO instruction.
1632 *
1633 * For non-uniform control flow pre-gen6, there's a DO instruction to
1634 * push the mask, and a WHILE to jump back, and BREAK to get out and
1635 * pop the mask.
1636 *
1637 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1638 * just points back to the first instruction of the loop.
1639 */
1640struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
1641{
1642   struct brw_context *brw = p->brw;
1643
1644   if (brw->gen >= 6 || p->single_program_flow) {
1645      push_loop_stack(p, &p->store[p->nr_insn]);
1646      return &p->store[p->nr_insn];
1647   } else {
1648      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1649
1650      push_loop_stack(p, insn);
1651
1652      /* Override the defaults for this instruction:
1653       */
1654      brw_set_dest(p, insn, brw_null_reg());
1655      brw_set_src0(p, insn, brw_null_reg());
1656      brw_set_src1(p, insn, brw_null_reg());
1657
1658      insn->header.compression_control = BRW_COMPRESSION_NONE;
1659      insn->header.execution_size = execute_size;
1660      insn->header.predicate_control = BRW_PREDICATE_NONE;
1661      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1662      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1663
1664      return insn;
1665   }
1666}
1667
1668/**
1669 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1670 * instruction here.
1671 *
1672 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1673 * nesting, since it can always just point to the end of the block/current loop.
1674 */
1675static void
1676brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
1677{
1678   struct brw_context *brw = p->brw;
1679   struct brw_instruction *do_inst = get_inner_do_insn(p);
1680   struct brw_instruction *inst;
1681   int br = (brw->gen == 5) ? 2 : 1;
1682
1683   for (inst = while_inst - 1; inst != do_inst; inst--) {
1684      /* If the jump count is != 0, that means that this instruction has already
1685       * been patched because it's part of a loop inside of the one we're
1686       * patching.
1687       */
1688      if (inst->header.opcode == BRW_OPCODE_BREAK &&
1689	  inst->bits3.if_else.jump_count == 0) {
1690	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
1691      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
1692		 inst->bits3.if_else.jump_count == 0) {
1693	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
1694      }
1695   }
1696}
1697
1698struct brw_instruction *brw_WHILE(struct brw_compile *p)
1699{
1700   struct brw_context *brw = p->brw;
1701   struct brw_instruction *insn, *do_insn;
1702   unsigned br = 1;
1703
1704   if (brw->gen >= 5)
1705      br = 2;
1706
1707   if (brw->gen >= 7) {
1708      insn = next_insn(p, BRW_OPCODE_WHILE);
1709      do_insn = get_inner_do_insn(p);
1710
1711      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1712      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1713      brw_set_src1(p, insn, brw_imm_ud(0));
1714      insn->bits3.break_cont.jip = br * (do_insn - insn);
1715
1716      insn->header.execution_size = BRW_EXECUTE_8;
1717   } else if (brw->gen == 6) {
1718      insn = next_insn(p, BRW_OPCODE_WHILE);
1719      do_insn = get_inner_do_insn(p);
1720
1721      brw_set_dest(p, insn, brw_imm_w(0));
1722      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1723      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1724      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1725
1726      insn->header.execution_size = BRW_EXECUTE_8;
1727   } else {
1728      if (p->single_program_flow) {
1729	 insn = next_insn(p, BRW_OPCODE_ADD);
1730         do_insn = get_inner_do_insn(p);
1731
1732	 brw_set_dest(p, insn, brw_ip_reg());
1733	 brw_set_src0(p, insn, brw_ip_reg());
1734	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1735	 insn->header.execution_size = BRW_EXECUTE_1;
1736      } else {
1737	 insn = next_insn(p, BRW_OPCODE_WHILE);
1738         do_insn = get_inner_do_insn(p);
1739
1740	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1741
1742	 brw_set_dest(p, insn, brw_ip_reg());
1743	 brw_set_src0(p, insn, brw_ip_reg());
1744	 brw_set_src1(p, insn, brw_imm_d(0));
1745
1746	 insn->header.execution_size = do_insn->header.execution_size;
1747	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1748	 insn->bits3.if_else.pop_count = 0;
1749	 insn->bits3.if_else.pad0 = 0;
1750
1751	 brw_patch_break_cont(p, insn);
1752      }
1753   }
1754   insn->header.compression_control = BRW_COMPRESSION_NONE;
1755
1756   p->loop_stack_depth--;
1757
1758   return insn;
1759}
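
/* A loop-emission sketch using the helpers above (illustrative only):
 *
 *    brw_DO(p, BRW_EXECUTE_8);      // emits no instruction on gen6+
 *    ... emit the loop body ...
 *    brw_BREAK(p);                  // normally emitted under a predicate
 *    ... emit more of the body ...
 *    brw_WHILE(p);                  // jumps back to the top of the loop
 *
 * Pre-gen6, brw_WHILE() patches the BREAK/CONTINUE jump counts right away via
 * brw_patch_break_cont().  On gen6+ the caller must run brw_set_uip_jip()
 * (see below) after the whole program has been emitted so that BREAK,
 * CONTINUE and HALT get their JIP/UIP values filled in.
 */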
1760
1761/* FORWARD JUMPS:
1762 */
1763void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1764{
1765   struct brw_context *brw = p->brw;
1766   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1767   unsigned jmpi = 1;
1768
1769   if (brw->gen >= 5)
1770      jmpi = 2;
1771
1772   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1773   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1774
1775   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1776}
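
/* Typical usage, as a sketch (the JMPI itself is emitted elsewhere, e.g. with
 * brw_JMPI()): record the instruction index before emitting the jump, emit
 * the code to be skipped, then land the jump just past it.
 *
 *    int jmp_idx = p->nr_insn;
 *    ... emit the JMPI with an immediate src1 ...
 *    ... emit the instructions the jump should skip ...
 *    brw_land_fwd_jump(p, jmp_idx);
 */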
1777
1778/* To integrate with the above, it makes sense that the comparison
1779 * instruction should populate the flag register.  It might be simpler
1780 * just to use the flag reg for most WM tasks?
1781 */
1782void brw_CMP(struct brw_compile *p,
1783	     struct brw_reg dest,
1784	     unsigned conditional,
1785	     struct brw_reg src0,
1786	     struct brw_reg src1)
1787{
1788   struct brw_context *brw = p->brw;
1789   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1790
1791   insn->header.destreg__conditionalmod = conditional;
1792   brw_set_dest(p, insn, dest);
1793   brw_set_src0(p, insn, src0);
1794   brw_set_src1(p, insn, src1);
1795
1796   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1797    * page says:
1798    *    "Any CMP instruction with a null destination must use a {switch}."
1799    *
1800    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1801    * mentioned on their work-arounds pages.
1802    */
1803   if (brw->gen == 7) {
1804      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1805          dest.nr == BRW_ARF_NULL) {
1806         insn->header.thread_control = BRW_THREAD_SWITCH;
1807      }
1808   }
1809}
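
/* A common idiom, sketched: compare into the flag register with a null
 * destination, then predicate the following instruction on the result
 * (register setup elided; dst, a and b are placeholders).
 *
 *    brw_CMP(p, retype(brw_null_reg(), a.type), BRW_CONDITIONAL_GE, a, b);
 *    brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
 *    brw_MOV(p, dst, a);            // runs only in channels where a >= b
 *    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 */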
1810
1811/* Issue a 'wait' instruction on notification register n1; the host can
1812   program MMIO to wake up the waiting thread. */
1813void brw_WAIT (struct brw_compile *p)
1814{
1815   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1816   struct brw_reg src = brw_notification_1_reg();
1817
1818   brw_set_dest(p, insn, src);
1819   brw_set_src0(p, insn, src);
1820   brw_set_src1(p, insn, brw_null_reg());
1821   insn->header.execution_size = 0; /* must be 0 (BRW_EXECUTE_1) */
1822   insn->header.predicate_control = 0;
1823   insn->header.compression_control = 0;
1824}
1825
1826
1827/***********************************************************************
1828 * Helpers for the various SEND message types:
1829 */
1830
1831/** Extended math function, float[8].
1832 */
1833void brw_math( struct brw_compile *p,
1834	       struct brw_reg dest,
1835	       unsigned function,
1836	       unsigned msg_reg_nr,
1837	       struct brw_reg src,
1838	       unsigned data_type,
1839	       unsigned precision )
1840{
1841   struct brw_context *brw = p->brw;
1842
1843   if (brw->gen >= 6) {
1844      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1845
1846      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1847             (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1848      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1849
1850      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1851      if (brw->gen == 6)
1852	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1853
1854      /* Source modifiers are ignored for extended math instructions on Gen6. */
1855      if (brw->gen == 6) {
1856	 assert(!src.negate);
1857	 assert(!src.abs);
1858      }
1859
1860      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1861	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1862	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1863	 assert(src.type != BRW_REGISTER_TYPE_F);
1864      } else {
1865	 assert(src.type == BRW_REGISTER_TYPE_F);
1866      }
1867
1868      /* Math is the same ISA format as other opcodes, except that CondModifier
1869       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1870       */
1871      insn->header.destreg__conditionalmod = function;
1872
1873      brw_set_dest(p, insn, dest);
1874      brw_set_src0(p, insn, src);
1875      brw_set_src1(p, insn, brw_null_reg());
1876   } else {
1877      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1878
1879      /* Example code doesn't set predicate_control for send
1880       * instructions.
1881       */
1882      insn->header.predicate_control = 0;
1883      insn->header.destreg__conditionalmod = msg_reg_nr;
1884
1885      brw_set_dest(p, insn, dest);
1886      brw_set_src0(p, insn, src);
1887      brw_set_math_message(p,
1888			   insn,
1889			   function,
1890			   src.type == BRW_REGISTER_TYPE_D,
1891			   precision,
1892			   data_type);
1893   }
1894}
1895
1896/** Extended math function, float[8].
1897 */
1898void brw_math2(struct brw_compile *p,
1899	       struct brw_reg dest,
1900	       unsigned function,
1901	       struct brw_reg src0,
1902	       struct brw_reg src1)
1903{
1904   struct brw_context *brw = p->brw;
1905   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1906
1907   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1908          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1909   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1910   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1911
1912   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1913   if (brw->gen == 6) {
1914      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1915      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1916   }
1917
1918   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1919       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1920       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1921      assert(src0.type != BRW_REGISTER_TYPE_F);
1922      assert(src1.type != BRW_REGISTER_TYPE_F);
1923   } else {
1924      assert(src0.type == BRW_REGISTER_TYPE_F);
1925      assert(src1.type == BRW_REGISTER_TYPE_F);
1926   }
1927
1928   /* Source modifiers are ignored for extended math instructions on Gen6. */
1929   if (brw->gen == 6) {
1930      assert(!src0.negate);
1931      assert(!src0.abs);
1932      assert(!src1.negate);
1933      assert(!src1.abs);
1934   }
1935
1936   /* Math is the same ISA format as other opcodes, except that CondModifier
1937    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1938    */
1939   insn->header.destreg__conditionalmod = function;
1940
1941   brw_set_dest(p, insn, dest);
1942   brw_set_src0(p, insn, src0);
1943   brw_set_src1(p, insn, src1);
1944}
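
/* Sketch: on gen6+ a full-precision POW is simply a two-source MATH
 * instruction (dst, x and y are placeholder float GRFs):
 *
 *    brw_math2(p, dst, BRW_MATH_FUNCTION_POW, x, y);
 *
 * Pre-gen6 the equivalent goes through brw_math() above and is encoded as a
 * SEND to the extended math shared function.
 */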
1945
1946
1947/**
1948 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1949 * using a constant offset per channel.
1950 *
1951 * The offset must be aligned to oword size (16 bytes).  Used for
1952 * register spilling.
1953 */
1954void brw_oword_block_write_scratch(struct brw_compile *p,
1955				   struct brw_reg mrf,
1956				   int num_regs,
1957				   unsigned offset)
1958{
1959   struct brw_context *brw = p->brw;
1960   uint32_t msg_control, msg_type;
1961   int mlen;
1962
1963   if (brw->gen >= 6)
1964      offset /= 16;
1965
1966   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1967
1968   if (num_regs == 1) {
1969      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1970      mlen = 2;
1971   } else {
1972      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1973      mlen = 3;
1974   }
1975
1976   /* Set up the message header.  This is g0, with g0.2 filled with
1977    * the offset.  We don't want to leave our offset around in g0 or
1978    * it'll screw up texture samples, so set it up inside the message
1979    * reg.
1980    */
1981   {
1982      brw_push_insn_state(p);
1983      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1984      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1985
1986      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1987
1988      /* set message header global offset field (reg 0, element 2) */
1989      brw_MOV(p,
1990	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1991				  mrf.nr,
1992				  2), BRW_REGISTER_TYPE_UD),
1993	      brw_imm_ud(offset));
1994
1995      brw_pop_insn_state(p);
1996   }
1997
1998   {
1999      struct brw_reg dest;
2000      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2001      int send_commit_msg;
2002      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2003					 BRW_REGISTER_TYPE_UW);
2004
2005      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
2006	 insn->header.compression_control = BRW_COMPRESSION_NONE;
2007	 src_header = vec16(src_header);
2008      }
2009      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
2010      insn->header.destreg__conditionalmod = mrf.nr;
2011
2012      /* Until gen6, writes followed by reads from the same location
2013       * are not guaranteed to be ordered unless write_commit is set.
2014       * If set, then a no-op write is issued to the destination
2015       * register to set a dependency, and a read from the destination
2016       * can be used to ensure the ordering.
2017       *
2018       * For gen6, only writes between different threads need ordering
2019       * protection.  Our use of DP writes is all about register
2020       * spilling within a thread.
2021       */
2022      if (brw->gen >= 6) {
2023	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2024	 send_commit_msg = 0;
2025      } else {
2026	 dest = src_header;
2027	 send_commit_msg = 1;
2028      }
2029
2030      brw_set_dest(p, insn, dest);
2031      if (brw->gen >= 6) {
2032	 brw_set_src0(p, insn, mrf);
2033      } else {
2034	 brw_set_src0(p, insn, brw_null_reg());
2035      }
2036
2037      if (brw->gen >= 6)
2038	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2039      else
2040	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2041
2042      brw_set_dp_write_message(p,
2043			       insn,
2044			       255, /* binding table index (255=stateless) */
2045			       msg_control,
2046			       msg_type,
2047			       mlen,
2048			       true, /* header_present */
2049			       0, /* not a render target */
2050			       send_commit_msg, /* response_length */
2051			       0, /* eot */
2052			       send_commit_msg);
2053   }
2054}
2055
2056
2057/**
2058 * Read a block of owords (half a GRF each) from the scratch buffer
2059 * using a constant index per channel.
2060 *
2061 * Offset must be aligned to oword size (16 bytes).  Used for register
2062 * spilling.
2063 */
2064void
2065brw_oword_block_read_scratch(struct brw_compile *p,
2066			     struct brw_reg dest,
2067			     struct brw_reg mrf,
2068			     int num_regs,
2069			     unsigned offset)
2070{
2071   struct brw_context *brw = p->brw;
2072   uint32_t msg_control;
2073   int rlen;
2074
2075   if (brw->gen >= 6)
2076      offset /= 16;
2077
2078   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2079   dest = retype(dest, BRW_REGISTER_TYPE_UW);
2080
2081   if (num_regs == 1) {
2082      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2083      rlen = 1;
2084   } else {
2085      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2086      rlen = 2;
2087   }
2088
2089   {
2090      brw_push_insn_state(p);
2091      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2092      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2093
2094      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2095
2096      /* set message header global offset field (reg 0, element 2) */
2097      brw_MOV(p,
2098	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2099				  mrf.nr,
2100				  2), BRW_REGISTER_TYPE_UD),
2101	      brw_imm_ud(offset));
2102
2103      brw_pop_insn_state(p);
2104   }
2105
2106   {
2107      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2108
2109      assert(insn->header.predicate_control == 0);
2110      insn->header.compression_control = BRW_COMPRESSION_NONE;
2111      insn->header.destreg__conditionalmod = mrf.nr;
2112
2113      brw_set_dest(p, insn, dest);	/* UW? */
2114      if (brw->gen >= 6) {
2115	 brw_set_src0(p, insn, mrf);
2116      } else {
2117	 brw_set_src0(p, insn, brw_null_reg());
2118      }
2119
2120      brw_set_dp_read_message(p,
2121			      insn,
2122			      255, /* binding table index (255=stateless) */
2123			      msg_control,
2124			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2125			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2126			      1, /* msg_length */
2127                              true, /* header_present */
2128			      rlen);
2129   }
2130}
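
/* Spill/fill sketch (register numbers are illustrative): the data for the
 * write lives in the MRF(s) immediately after the header register that is
 * passed in, and offsets must be multiples of 16 bytes.
 *
 *    // spill one GRF
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD),
 *            retype(brw_vec8_grf(src_nr, 0), BRW_REGISTER_TYPE_UD));
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, spill_offset);
 *
 *    // fill it back later
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(dst_nr, 0),
 *                                 brw_message_reg(1), 1, spill_offset);
 */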
2131
2132void
2133gen7_block_read_scratch(struct brw_compile *p,
2134                        struct brw_reg dest,
2135                        int num_regs,
2136                        unsigned offset)
2137{
2138   dest = retype(dest, BRW_REGISTER_TYPE_UW);
2139
2140   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2141
2142   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
2143   insn->header.compression_control = BRW_COMPRESSION_NONE;
2144
2145   brw_set_dest(p, insn, dest);
2146
2147   /* The HW requires that the header is present; this is to get the g0.5
2148    * scratch offset.
2149    */
2150   bool header_present = true;
2151   brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2152
2153   brw_set_message_descriptor(p, insn,
2154                              GEN7_SFID_DATAPORT_DATA_CACHE,
2155                              1, /* mlen: just g0 */
2156                              num_regs,
2157                              header_present,
2158                              false);
2159
2160   insn->bits3.ud |= GEN7_DATAPORT_SCRATCH_READ;
2161
2162   assert(num_regs == 1 || num_regs == 2 || num_regs == 4);
2163   insn->bits3.ud |= (num_regs - 1) << GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT;
2164
2165   /* According to the docs, offset is "A 12-bit HWord offset into the memory
2166    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2167    * is 32 bytes, which happens to be the size of a register.
2168    */
2169   offset /= REG_SIZE;
2170   assert(offset < (1 << 12));
2171   insn->bits3.ud |= offset;
2172}
2173
2174/**
2175 * Read a float[4] vector from the data port Data Cache (const buffer).
2176 * Location (in buffer) should be a multiple of 16.
2177 * Used for fetching shader constants.
2178 */
2179void brw_oword_block_read(struct brw_compile *p,
2180			  struct brw_reg dest,
2181			  struct brw_reg mrf,
2182			  uint32_t offset,
2183			  uint32_t bind_table_index)
2184{
2185   struct brw_context *brw = p->brw;
2186
2187   /* On newer hardware, offset is in units of owords. */
2188   if (brw->gen >= 6)
2189      offset /= 16;
2190
2191   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2192
2193   brw_push_insn_state(p);
2194   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2195   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2196   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2197
2198   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2199
2200   /* set message header global offset field (reg 0, element 2) */
2201   brw_MOV(p,
2202	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2203			       mrf.nr,
2204			       2), BRW_REGISTER_TYPE_UD),
2205	   brw_imm_ud(offset));
2206
2207   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2208   insn->header.destreg__conditionalmod = mrf.nr;
2209
2210   /* cast dest to a uword[8] vector */
2211   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2212
2213   brw_set_dest(p, insn, dest);
2214   if (brw->gen >= 6) {
2215      brw_set_src0(p, insn, mrf);
2216   } else {
2217      brw_set_src0(p, insn, brw_null_reg());
2218   }
2219
2220   brw_set_dp_read_message(p,
2221			   insn,
2222			   bind_table_index,
2223			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2224			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2225			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2226			   1, /* msg_length */
2227                           true, /* header_present */
2228			   1); /* response_length (1 reg, 2 owords!) */
2229
2230   brw_pop_insn_state(p);
2231}
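
/* E.g. pulling one float[4] constant into a destination register (a sketch;
 * const_offset is a byte offset that must be a multiple of 16 and surf_index
 * is the constant buffer's binding table slot):
 *
 *    brw_oword_block_read(p, brw_vec4_grf(dst_nr, 0), brw_message_reg(1),
 *                         const_offset, surf_index);
 */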
2232
2233
2234void brw_fb_WRITE(struct brw_compile *p,
2235		  int dispatch_width,
2236                  unsigned msg_reg_nr,
2237                  struct brw_reg src0,
2238                  unsigned msg_control,
2239                  unsigned binding_table_index,
2240                  unsigned msg_length,
2241                  unsigned response_length,
2242                  bool eot,
2243                  bool header_present)
2244{
2245   struct brw_context *brw = p->brw;
2246   struct brw_instruction *insn;
2247   unsigned msg_type;
2248   struct brw_reg dest;
2249
2250   if (dispatch_width == 16)
2251      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2252   else
2253      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2254
2255   if (brw->gen >= 6) {
2256      insn = next_insn(p, BRW_OPCODE_SENDC);
2257   } else {
2258      insn = next_insn(p, BRW_OPCODE_SEND);
2259   }
2260   insn->header.compression_control = BRW_COMPRESSION_NONE;
2261
2262   if (brw->gen >= 6) {
2263      /* headerless version, just submit color payload */
2264      src0 = brw_message_reg(msg_reg_nr);
2265
2266      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2267   } else {
2268      insn->header.destreg__conditionalmod = msg_reg_nr;
2269
2270      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2271   }
2272
2273   brw_set_dest(p, insn, dest);
2274   brw_set_src0(p, insn, src0);
2275   brw_set_dp_write_message(p,
2276			    insn,
2277			    binding_table_index,
2278			    msg_control,
2279			    msg_type,
2280			    msg_length,
2281			    header_present,
2282			    eot, /* last render target write */
2283			    response_length,
2284			    eot,
2285			    0 /* send_commit_msg */);
2286}
2287
2288
2289/**
2290 * Texture sample instruction.
2291 * Note: the msg_type plus msg_length values determine exactly what kind
2292 * of sampling operation is performed.  See volume 4, page 161 of docs.
2293 */
2294void brw_SAMPLE(struct brw_compile *p,
2295		struct brw_reg dest,
2296		unsigned msg_reg_nr,
2297		struct brw_reg src0,
2298		unsigned binding_table_index,
2299		unsigned sampler,
2300		unsigned msg_type,
2301		unsigned response_length,
2302		unsigned msg_length,
2303		unsigned header_present,
2304		unsigned simd_mode,
2305		unsigned return_format)
2306{
2307   struct brw_context *brw = p->brw;
2308   struct brw_instruction *insn;
2309
2310   if (msg_reg_nr != -1)
2311      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2312
2313   insn = next_insn(p, BRW_OPCODE_SEND);
2314   insn->header.predicate_control = 0; /* XXX */
2315
2316   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2317    *
2318    *    "Instruction compression is not allowed for this instruction (that
2319    *     is, send). The hardware behavior is undefined if this instruction is
2320    *     set as compressed. However, compress control can be set to "SecHalf"
2321    *     to affect the EMask generation."
2322    *
2323    * No similar wording is found in later PRMs, but there are examples
2324    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2325    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2326    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2327    */
2328   if (insn->header.compression_control != BRW_COMPRESSION_2NDHALF)
2329      insn->header.compression_control = BRW_COMPRESSION_NONE;
2330
2331   if (brw->gen < 6)
2332      insn->header.destreg__conditionalmod = msg_reg_nr;
2333
2334   brw_set_dest(p, insn, dest);
2335   brw_set_src0(p, insn, src0);
2336   brw_set_sampler_message(p, insn,
2337                           binding_table_index,
2338                           sampler,
2339                           msg_type,
2340                           response_length,
2341                           msg_length,
2342                           header_present,
2343                           simd_mode,
2344                           return_format);
2345}
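
/* Sketch of a call site (all names are illustrative; msg_type, simd_mode and
 * return_format take the sampler-message #defines from brw_defines.h, and the
 * coordinate payload in the message registers is set up by the caller
 * beforehand):
 *
 *    brw_SAMPLE(p, dst, msg_reg_nr, src0, surf_index, sampler, msg_type,
 *               response_length, msg_length,
 *               true,                     // header_present
 *               simd_mode, return_format);
 */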
2346
2347/* All these variables are pretty confusing - we might be better off
2348 * using bitmasks and macros for this, in the old style.  Or perhaps
2349 * just having the caller instantiate the fields in dword3 itself.
2350 */
2351void brw_urb_WRITE(struct brw_compile *p,
2352		   struct brw_reg dest,
2353		   unsigned msg_reg_nr,
2354		   struct brw_reg src0,
2355                   enum brw_urb_write_flags flags,
2356		   unsigned msg_length,
2357		   unsigned response_length,
2358		   unsigned offset,
2359		   unsigned swizzle)
2360{
2361   struct brw_context *brw = p->brw;
2362   struct brw_instruction *insn;
2363
2364   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2365
2366   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2367      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2368      brw_push_insn_state(p);
2369      brw_set_default_access_mode(p, BRW_ALIGN_1);
2370      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2371      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2372		       BRW_REGISTER_TYPE_UD),
2373	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2374		brw_imm_ud(0xff00));
2375      brw_pop_insn_state(p);
2376   }
2377
2378   insn = next_insn(p, BRW_OPCODE_SEND);
2379
2380   assert(msg_length < BRW_MAX_MRF);
2381
2382   brw_set_dest(p, insn, dest);
2383   brw_set_src0(p, insn, src0);
2384   brw_set_src1(p, insn, brw_imm_d(0));
2385
2386   if (brw->gen < 6)
2387      insn->header.destreg__conditionalmod = msg_reg_nr;
2388
2389   brw_set_urb_message(p,
2390		       insn,
2391		       flags,
2392		       msg_length,
2393		       response_length,
2394		       offset,
2395		       swizzle);
2396}
2397
2398static int
2399brw_find_next_block_end(struct brw_compile *p, int start_offset)
2400{
2401   int offset;
2402   void *store = p->store;
2403
2404   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2405        offset = next_offset(store, offset)) {
2406      struct brw_instruction *insn = store + offset;
2407
2408      switch (insn->header.opcode) {
2409      case BRW_OPCODE_ENDIF:
2410      case BRW_OPCODE_ELSE:
2411      case BRW_OPCODE_WHILE:
2412      case BRW_OPCODE_HALT:
2413	 return offset;
2414      }
2415   }
2416
2417   return 0;
2418}
2419
2420/* There is no DO instruction on gen6, so to find the end of the loop
2421 * we have to see if the loop is jumping back before our start
2422 * instruction.
2423 */
2424static int
2425brw_find_loop_end(struct brw_compile *p, int start_offset)
2426{
2427   struct brw_context *brw = p->brw;
2428   int offset;
2429   int scale = 8;
2430   void *store = p->store;
2431
2432   /* Always start after the instruction (such as a WHILE) we're trying to fix
2433    * up.
2434    */
2435   for (offset = next_offset(store, start_offset); offset < p->next_insn_offset;
2436        offset = next_offset(store, offset)) {
2437      struct brw_instruction *insn = store + offset;
2438
2439      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2440	 int jip = brw->gen == 6 ? insn->bits1.branch_gen6.jump_count
2441				   : insn->bits3.break_cont.jip;
2442	 if (offset + jip * scale <= start_offset)
2443	    return offset;
2444      }
2445   }
2446   assert(!"not reached");
2447   return start_offset;
2448}
2449
2450/* After program generation, go back and update the UIP and JIP of
2451 * BREAK, CONT, and HALT instructions to their correct locations.
2452 */
2453void
2454brw_set_uip_jip(struct brw_compile *p)
2455{
2456   struct brw_context *brw = p->brw;
2457   int offset;
2458   int scale = 8;
2459   void *store = p->store;
2460
2461   if (brw->gen < 6)
2462      return;
2463
2464   for (offset = 0; offset < p->next_insn_offset;
2465        offset = next_offset(store, offset)) {
2466      struct brw_instruction *insn = store + offset;
2467
2468      if (insn->header.cmpt_control) {
2469	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2470	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
2471		insn->header.opcode != BRW_OPCODE_CONTINUE &&
2472		insn->header.opcode != BRW_OPCODE_HALT);
2473	 continue;
2474      }
2475
2476      int block_end_offset = brw_find_next_block_end(p, offset);
2477      switch (insn->header.opcode) {
2478      case BRW_OPCODE_BREAK:
2479         assert(block_end_offset != 0);
2480	 insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
2481	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2482	 insn->bits3.break_cont.uip =
2483	    (brw_find_loop_end(p, offset) - offset +
2484             (brw->gen == 6 ? 16 : 0)) / scale;
2485	 break;
2486      case BRW_OPCODE_CONTINUE:
2487         assert(block_end_offset != 0);
2488	 insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
2489	 insn->bits3.break_cont.uip =
2490            (brw_find_loop_end(p, offset) - offset) / scale;
2491
2492	 assert(insn->bits3.break_cont.uip != 0);
2493	 assert(insn->bits3.break_cont.jip != 0);
2494	 break;
2495
2496      case BRW_OPCODE_ENDIF:
2497         if (block_end_offset == 0)
2498            insn->bits3.break_cont.jip = 2;
2499         else
2500            insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
2501	 break;
2502
2503      case BRW_OPCODE_HALT:
2504	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2505	  *
2506	  *    "In case of the halt instruction not inside any conditional
2507	  *     code block, the value of <JIP> and <UIP> should be the
2508	  *     same. In case of the halt instruction inside conditional code
2509	  *     block, the <UIP> should be the end of the program, and the
2510	  *     <JIP> should be end of the most inner conditional code block."
2511	  *
2512	  * The uip will have already been set by whoever set up the
2513	  * instruction.
2514	  */
2515	 if (block_end_offset == 0) {
2516	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
2517	 } else {
2518	    insn->bits3.break_cont.jip = (block_end_offset - offset) / scale;
2519	 }
2520	 assert(insn->bits3.break_cont.uip != 0);
2521	 assert(insn->bits3.break_cont.jip != 0);
2522	 break;
2523      }
2524   }
2525}
2526
2527void brw_ff_sync(struct brw_compile *p,
2528		   struct brw_reg dest,
2529		   unsigned msg_reg_nr,
2530		   struct brw_reg src0,
2531		   bool allocate,
2532		   unsigned response_length,
2533		   bool eot)
2534{
2535   struct brw_context *brw = p->brw;
2536   struct brw_instruction *insn;
2537
2538   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2539
2540   insn = next_insn(p, BRW_OPCODE_SEND);
2541   brw_set_dest(p, insn, dest);
2542   brw_set_src0(p, insn, src0);
2543   brw_set_src1(p, insn, brw_imm_d(0));
2544
2545   if (brw->gen < 6)
2546      insn->header.destreg__conditionalmod = msg_reg_nr;
2547
2548   brw_set_ff_sync_message(p,
2549			   insn,
2550			   allocate,
2551			   response_length,
2552			   eot);
2553}
2554
2555/**
2556 * Emit the SEND instruction necessary to generate stream output data on Gen6
2557 * (for transform feedback).
2558 *
2559 * If send_commit_msg is true, this is the last piece of stream output data
2560 * from this thread, so send the data as a committed write.  According to the
2561 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2562 *
2563 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2564 *   writes are complete by sending the final write as a committed write."
2565 */
2566void
2567brw_svb_write(struct brw_compile *p,
2568              struct brw_reg dest,
2569              unsigned msg_reg_nr,
2570              struct brw_reg src0,
2571              unsigned binding_table_index,
2572              bool   send_commit_msg)
2573{
2574   struct brw_instruction *insn;
2575
2576   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2577
2578   insn = next_insn(p, BRW_OPCODE_SEND);
2579   brw_set_dest(p, insn, dest);
2580   brw_set_src0(p, insn, src0);
2581   brw_set_src1(p, insn, brw_imm_d(0));
2582   brw_set_dp_write_message(p, insn,
2583                            binding_table_index,
2584                            0, /* msg_control: ignored */
2585                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2586                            1, /* msg_length */
2587                            true, /* header_present */
2588                            0, /* last_render_target: ignored */
2589                            send_commit_msg, /* response_length */
2590                            0, /* end_of_thread */
2591                            send_commit_msg); /* send_commit_msg */
2592}
2593
2594static void
2595brw_set_dp_untyped_atomic_message(struct brw_compile *p,
2596                                  struct brw_instruction *insn,
2597                                  unsigned atomic_op,
2598                                  unsigned bind_table_index,
2599                                  unsigned msg_length,
2600                                  unsigned response_length,
2601                                  bool header_present)
2602{
2603   if (p->brw->is_haswell) {
2604      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2605                                 msg_length, response_length,
2606                                 header_present, false);
2607
2608
2609      if (insn->header.access_mode == BRW_ALIGN_1) {
2610         if (insn->header.execution_size != BRW_EXECUTE_16)
2611            insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
2612
2613         insn->bits3.gen7_dp.msg_type =
2614            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2615      } else {
2616         insn->bits3.gen7_dp.msg_type =
2617            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
2618      }
2619   } else {
2620      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2621                                 msg_length, response_length,
2622                                 header_present, false);
2623
2624      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2625
2626      if (insn->header.execution_size != BRW_EXECUTE_16)
2627         insn->bits3.ud |= 1 << 12; /* SIMD8 mode */
2628   }
2629
2630   if (response_length)
2631      insn->bits3.ud |= 1 << 13; /* Return data expected */
2632
2633   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
2634   insn->bits3.ud |= atomic_op << 8;
2635}
2636
2637void
2638brw_untyped_atomic(struct brw_compile *p,
2639                   struct brw_reg dest,
2640                   struct brw_reg mrf,
2641                   unsigned atomic_op,
2642                   unsigned bind_table_index,
2643                   unsigned msg_length,
2644                   unsigned response_length) {
2645   struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2646
2647   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2648   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2649   brw_set_src1(p, insn, brw_imm_d(0));
2650   brw_set_dp_untyped_atomic_message(
2651      p, insn, atomic_op, bind_table_index, msg_length, response_length,
2652      insn->header.access_mode == BRW_ALIGN_1);
2653}
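
/* Sketch: an untyped atomic add to a RAW buffer surface that returns the old
 * value (the payload, per-channel offsets followed by the source operand, is
 * assembled by the caller; surf_index is the buffer's binding table slot):
 *
 *    brw_untyped_atomic(p, dst, payload, BRW_AOP_ADD, surf_index,
 *                       2,    // msg_length
 *                       1);   // response_length: the old value comes back
 */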
2654
2655static void
2656brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
2657                                        struct brw_instruction *insn,
2658                                        unsigned bind_table_index,
2659                                        unsigned msg_length,
2660                                        unsigned response_length,
2661                                        bool header_present)
2662{
2663   const unsigned dispatch_width =
2664      (insn->header.execution_size == BRW_EXECUTE_16 ? 16 : 8);
2665   const unsigned num_channels = response_length / (dispatch_width / 8);
2666
2667   if (p->brw->is_haswell) {
2668      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2669                                 msg_length, response_length,
2670                                 header_present, false);
2671
2672      insn->bits3.gen7_dp.msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ;
2673   } else {
2674      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2675                                 msg_length, response_length,
2676                                 header_present, false);
2677
2678      insn->bits3.gen7_dp.msg_type = GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
2679   }
2680
2681   if (insn->header.access_mode == BRW_ALIGN_1) {
2682      if (dispatch_width == 16)
2683         insn->bits3.ud |= 1 << 12; /* SIMD16 mode */
2684      else
2685         insn->bits3.ud |= 2 << 12; /* SIMD8 mode */
2686   }
2687
2688   insn->bits3.gen7_dp.binding_table_index = bind_table_index;
2689
2690   /* Set mask of 32-bit channels to drop. */
2691   insn->bits3.ud |= (0xf & (0xf << num_channels)) << 8;
2692}
2693
2694void
2695brw_untyped_surface_read(struct brw_compile *p,
2696                         struct brw_reg dest,
2697                         struct brw_reg mrf,
2698                         unsigned bind_table_index,
2699                         unsigned msg_length,
2700                         unsigned response_length)
2701{
2702   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2703
2704   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2705   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2706   brw_set_dp_untyped_surface_read_message(
2707      p, insn, bind_table_index, msg_length, response_length,
2708      insn->header.access_mode == BRW_ALIGN_1);
2709}
2710
2711/**
2712 * This instruction is generated as a single-channel align1 instruction by
2713 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2714 *
2715 * We can't use the typed atomic op in the FS because that has the execution
2716 * mask ANDed with the pixel mask, but we just want to write the one dword for
2717 * all the pixels.
2718 *
2719 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
2720 * one u32.  So we use the same untyped atomic write message as the pixel
2721 * shader.
2722 *
2723 * The untyped atomic operation requires a BUFFER surface type with RAW
2724 * format, and is only accessible through the legacy DATA_CACHE dataport
2725 * messages.
2726 */
2727void brw_shader_time_add(struct brw_compile *p,
2728                         struct brw_reg payload,
2729                         uint32_t surf_index)
2730{
2731   struct brw_context *brw = p->brw;
2732   assert(brw->gen >= 7);
2733
2734   brw_push_insn_state(p);
2735   brw_set_default_access_mode(p, BRW_ALIGN_1);
2736   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2737   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2738   brw_pop_insn_state(p);
2739
2740   /* We use brw_vec1_reg and unmasked because we want to increment the given
2741    * offset only once.
2742    */
2743   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2744                                      BRW_ARF_NULL, 0));
2745   brw_set_src0(p, send, brw_vec1_reg(payload.file,
2746                                      payload.nr, 0));
2747   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
2748                                     2 /* message length */,
2749                                     0 /* response length */,
2750                                     false /* header present */);
2751}
2752