brw_eu_emit.c revision 829aac4b6783a6e7667293a60d97947d277cfa39
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   const struct brw_context *brw = p->brw;
48
49   if (reg.width == BRW_WIDTH_8 && p->compressed) {
50      brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_16);
51   } else {
52      /* Register width definitions are compatible with BRW_EXECUTE_* enums. */
53      brw_inst_set_exec_size(brw, insn, reg.width);
54   }
55}
56
57
58/**
59 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
60 * registers, implicitly moving the operand to a message register.
61 *
62 * On Sandybridge, this is no longer the case.  This function performs the
63 * explicit move; it should be called before emitting a SEND instruction.
64 */
65void
66gen6_resolve_implied_move(struct brw_compile *p,
67			  struct brw_reg *src,
68			  unsigned msg_reg_nr)
69{
70   struct brw_context *brw = p->brw;
71   if (brw->gen < 6)
72      return;
73
74   if (src->file == BRW_MESSAGE_REGISTER_FILE)
75      return;
76
77   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
78      brw_push_insn_state(p);
79      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
80      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
81      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
82	      retype(*src, BRW_REGISTER_TYPE_UD));
83      brw_pop_insn_state(p);
84   }
85   *src = brw_message_reg(msg_reg_nr);
86}
87
88static void
89gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
90{
91   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
92    * "The send with EOT should use register space R112-R127 for <src>. This is
93    *  to enable loading of a new thread into the same slot while the message
94    *  with EOT for current thread is pending dispatch."
95    *
96    * Since we're pretending to have 16 MRFs anyway, we may as well use the
97    * registers required for messages with EOT.
98    */
99   struct brw_context *brw = p->brw;
100   if (brw->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
101      reg->file = BRW_GENERAL_REGISTER_FILE;
102      reg->nr += GEN7_MRF_HACK_START;
103   }
104}
105
106/**
107 * Convert a brw_reg_type enumeration value into the hardware representation.
108 *
109 * The hardware encoding may depend on whether the value is an immediate.
110 */
111unsigned
112brw_reg_type_to_hw_type(const struct brw_context *brw,
113                        enum brw_reg_type type, unsigned file)
114{
115   if (file == BRW_IMMEDIATE_VALUE) {
116      const static int imm_hw_types[] = {
117         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
118         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
119         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
120         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
121         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
122         [BRW_REGISTER_TYPE_UB] = -1,
123         [BRW_REGISTER_TYPE_B]  = -1,
124         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
125         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
126         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
127         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
128         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
129         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
130         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
131      };
132      assert(type < ARRAY_SIZE(imm_hw_types));
133      assert(imm_hw_types[type] != -1);
134      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
135      return imm_hw_types[type];
136   } else {
137      /* Non-immediate registers */
138      const static int hw_types[] = {
139         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
140         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
141         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
142         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
143         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
144         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
145         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
146         [BRW_REGISTER_TYPE_UV] = -1,
147         [BRW_REGISTER_TYPE_VF] = -1,
148         [BRW_REGISTER_TYPE_V]  = -1,
149         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
150         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
151         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
152         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
153      };
154      assert(type < ARRAY_SIZE(hw_types));
155      assert(hw_types[type] != -1);
156      assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
157      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
158      return hw_types[type];
159   }
160}
161
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Handles direct and register-indirect addressing in both Align1 and
 * Align16 access modes, and finally derives the instruction's execution
 * size from the destination width (see guess_execution_size()).
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *inst,
	     struct brw_reg dest)
{
   const struct brw_context *brw = p->brw;

   /* GRF destinations must name one of the 128 registers; ARF and MRF
    * numbers use their own encodings, so they are exempt from the check.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On Gen7, MRFs are emulated using the high GRF space. */
   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(brw, inst, dest.file);
   brw_inst_set_dst_reg_type(brw, inst, brw_reg_type_to_hw_type(brw, dest.type,
                                                                dest.file));
   brw_inst_set_dst_address_mode(brw, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(brw, inst, dest.nr);

      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(brw, inst, dest.subnr);
         /* A horizontal stride of 0 makes no sense for a destination;
          * encode it as 1.
          */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(brw, inst, dest.hstride);
      } else {
         /* Align16 subregister numbers are in 16-byte units. */
         brw_inst_set_dst_da16_subreg_nr(brw, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(brw, inst, dest.dw1.bits.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            /* A zero writemask would write nothing at all. */
            assert(dest.dw1.bits.writemask != 0);
         }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
         brw_inst_set_dst_hstride(brw, inst, 1);
      }
   } else {
      /* Register-indirect addressing. */
      brw_inst_set_dst_ia_subreg_nr(brw, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(brw, inst,
                                       dest.dw1.bits.indirect_offset);
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(brw, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(brw, inst,
                                        dest.dw1.bits.indirect_offset);
	 /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(brw, inst, 1);
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * inst->compression_control:
    */
   guess_execution_size(p, inst, dest);
}
224
225extern int reg_type_size[];
226
227static void
228validate_reg(const struct brw_context *brw,
229             struct brw_instruction *inst, struct brw_reg reg)
230{
231   int hstride_for_reg[] = {0, 1, 2, 4};
232   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
233   int width_for_reg[] = {1, 2, 4, 8, 16};
234   int execsize_for_reg[] = {1, 2, 4, 8, 16};
235   int width, hstride, vstride, execsize;
236
237   if (reg.file == BRW_IMMEDIATE_VALUE) {
238      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
239       * mean the destination has to be 128-bit aligned and the
240       * destination horiz stride has to be a word.
241       */
242      if (reg.type == BRW_REGISTER_TYPE_V) {
243         assert(hstride_for_reg[brw_inst_dst_hstride(brw, inst)] *
244                reg_type_size[brw_inst_dst_reg_type(brw, inst)] == 2);
245      }
246
247      return;
248   }
249
250   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
251       reg.file == BRW_ARF_NULL)
252      return;
253
254   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
255   hstride = hstride_for_reg[reg.hstride];
256
257   if (reg.vstride == 0xf) {
258      vstride = -1;
259   } else {
260      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
261      vstride = vstride_for_reg[reg.vstride];
262   }
263
264   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
265   width = width_for_reg[reg.width];
266
267   assert(brw_inst_exec_size(brw, inst) >= 0 &&
268          brw_inst_exec_size(brw, inst) < Elements(execsize_for_reg));
269   execsize = execsize_for_reg[brw_inst_exec_size(brw, inst)];
270
271   /* Restrictions from 3.3.10: Register Region Restrictions. */
272   /* 3. */
273   assert(execsize >= width);
274
275   /* 4. */
276   if (execsize == width && hstride != 0) {
277      assert(vstride == -1 || vstride == width * hstride);
278   }
279
280   /* 5. */
281   if (execsize == width && hstride == 0) {
282      /* no restriction on vstride. */
283   }
284
285   /* 6. */
286   if (width == 1) {
287      assert(hstride == 0);
288   }
289
290   /* 7. */
291   if (execsize == 1 && width == 1) {
292      assert(hstride == 0);
293      assert(vstride == 0);
294   }
295
296   /* 8. */
297   if (vstride == 0 && hstride == 0) {
298      assert(width == 1);
299   }
300
301   /* 10. Check destination issues. */
302}
303
/* Return true if \p imm fits in a compacted instruction's immediate
 * field: the low 12 bits are stored verbatim and a single bit is
 * replicated across the top 20 bits, so the value is representable
 * iff those top 20 bits are all zeros or all ones.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned high_bits = imm >> 12;

   return high_bits == 0 || high_bits == 0xfffff;
}
313
314void
315brw_set_src0(struct brw_compile *p, struct brw_instruction *inst,
316	     struct brw_reg reg)
317{
318   struct brw_context *brw = p->brw;
319
320   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
321      assert(reg.nr < 128);
322
323   gen7_convert_mrf_to_grf(p, &reg);
324
325   if (brw->gen >= 6 && (brw_inst_opcode(brw, inst) == BRW_OPCODE_SEND ||
326                         brw_inst_opcode(brw, inst) == BRW_OPCODE_SENDC)) {
327      /* Any source modifiers or regions will be ignored, since this just
328       * identifies the MRF/GRF to start reading the message contents from.
329       * Check for some likely failures.
330       */
331      assert(!reg.negate);
332      assert(!reg.abs);
333      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
334   }
335
336   validate_reg(brw, inst, reg);
337
338   brw_inst_set_src0_reg_file(brw, inst, reg.file);
339   brw_inst_set_src0_reg_type(brw, inst,
340                              brw_reg_type_to_hw_type(brw, reg.type, reg.file));
341   brw_inst_set_src0_abs(brw, inst, reg.abs);
342   brw_inst_set_src0_negate(brw, inst, reg.negate);
343   brw_inst_set_src0_address_mode(brw, inst, reg.address_mode);
344
345   if (reg.file == BRW_IMMEDIATE_VALUE) {
346      brw_inst_set_imm_ud(brw, inst, reg.dw1.ud);
347
348      /* The Bspec's section titled "Non-present Operands" claims that if src0
349       * is an immediate that src1's type must be the same as that of src0.
350       *
351       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
352       * that do not follow this rule. E.g., from the IVB/HSW table:
353       *
354       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
355       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
356       *
357       * And from the SNB table:
358       *
359       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
360       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
361       *
362       * Neither of these cause warnings from the simulator when used,
363       * compacted or otherwise. In fact, all compaction mappings that have an
364       * immediate in src0 use a:ud for src1.
365       *
366       * The GM45 instruction compaction tables do not contain mapped meanings
367       * so it's not clear whether it has the restriction. We'll assume it was
368       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
369       */
370      brw_inst_set_src1_reg_file(brw, inst, BRW_ARCHITECTURE_REGISTER_FILE);
371      if (brw->gen < 6) {
372         brw_inst_set_src1_reg_type(brw, inst,
373                                    brw_inst_src0_reg_type(brw, inst));
374      } else {
375         brw_inst_set_src1_reg_type(brw, inst, BRW_HW_REG_TYPE_UD);
376      }
377
378      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
379       * for immediate values. Presumably the hardware engineers realized
380       * that the only useful floating-point value that could be represented
381       * in this format is 0.0, which can also be represented as a VF-typed
382       * immediate, so they gave us the previously mentioned mapping on IVB+.
383       *
384       * Strangely, we do have a mapping for imm:f in src1, so we don't need
385       * to do this there.
386       *
387       * If we see a 0.0:F, change the type to VF so that it can be compacted.
388       */
389      if (brw_inst_imm_ud(brw, inst) == 0x0 &&
390          brw_inst_src0_reg_type(brw, inst) == BRW_HW_REG_TYPE_F) {
391         brw_inst_set_src0_reg_type(brw, inst, BRW_HW_REG_IMM_TYPE_VF);
392      }
393
394      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
395       * set the types to :UD so the instruction can be compacted.
396       */
397      if (is_compactable_immediate(brw_inst_imm_ud(brw, inst)) &&
398          brw_inst_cond_modifier(brw, inst) == BRW_CONDITIONAL_NONE &&
399          brw_inst_src0_reg_type(brw, inst) == BRW_HW_REG_TYPE_D &&
400          brw_inst_dst_reg_type(brw, inst) == BRW_HW_REG_TYPE_D) {
401         brw_inst_set_src0_reg_type(brw, inst, BRW_HW_REG_TYPE_UD);
402         brw_inst_set_dst_reg_type(brw, inst, BRW_HW_REG_TYPE_UD);
403      }
404   } else {
405      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
406         brw_inst_set_src0_da_reg_nr(brw, inst, reg.nr);
407         if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
408             brw_inst_set_src0_da1_subreg_nr(brw, inst, reg.subnr);
409	 } else {
410            brw_inst_set_src0_da16_subreg_nr(brw, inst, reg.subnr / 16);
411	 }
412      } else {
413         brw_inst_set_src0_ia_subreg_nr(brw, inst, reg.subnr);
414
415         if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
416            brw_inst_set_src0_ia1_addr_imm(brw, inst, reg.dw1.bits.indirect_offset);
417	 } else {
418            brw_inst_set_src0_ia_subreg_nr(brw, inst, reg.dw1.bits.indirect_offset);
419	 }
420      }
421
422      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
423	 if (reg.width == BRW_WIDTH_1 &&
424             brw_inst_exec_size(brw, inst) == BRW_EXECUTE_1) {
425            brw_inst_set_src0_hstride(brw, inst, BRW_HORIZONTAL_STRIDE_0);
426            brw_inst_set_src0_width(brw, inst, BRW_WIDTH_1);
427            brw_inst_set_src0_vstride(brw, inst, BRW_VERTICAL_STRIDE_0);
428	 } else {
429            brw_inst_set_src0_hstride(brw, inst, reg.hstride);
430            brw_inst_set_src0_width(brw, inst, reg.width);
431            brw_inst_set_src0_vstride(brw, inst, reg.vstride);
432	 }
433      } else {
434         brw_inst_set_src0_da16_swiz_x(brw, inst,
435            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
436         brw_inst_set_src0_da16_swiz_y(brw, inst,
437            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
438         brw_inst_set_src0_da16_swiz_z(brw, inst,
439            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
440         brw_inst_set_src0_da16_swiz_w(brw, inst,
441            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));
442
443	 /* This is an oddity of the fact we're using the same
444	  * descriptions for registers in align_16 as align_1:
445	  */
446	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
447            brw_inst_set_src0_vstride(brw, inst, BRW_VERTICAL_STRIDE_4);
448	 else
449            brw_inst_set_src0_vstride(brw, inst, reg.vstride);
450      }
451   }
452}
453
454
455void
456brw_set_src1(struct brw_compile *p,
457             struct brw_instruction *inst,
458             struct brw_reg reg)
459{
460   const struct brw_context *brw = p->brw;
461   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
462
463   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
464      assert(reg.nr < 128);
465
466   gen7_convert_mrf_to_grf(p, &reg);
467
468   validate_reg(brw, inst, reg);
469
470   brw_inst_set_src1_reg_file(brw, inst, reg.file);
471   brw_inst_set_src1_reg_type(brw, inst,
472                              brw_reg_type_to_hw_type(brw, reg.type, reg.file));
473   brw_inst_set_src1_abs(brw, inst, reg.abs);
474   brw_inst_set_src1_negate(brw, inst, reg.negate);
475
476   /* Only src1 can be immediate in two-argument instructions.
477    */
478   assert(brw_inst_src0_reg_file(brw, inst) != BRW_IMMEDIATE_VALUE);
479
480   if (reg.file == BRW_IMMEDIATE_VALUE) {
481      brw_inst_set_imm_ud(brw, inst, reg.dw1.ud);
482   } else {
483      /* This is a hardware restriction, which may or may not be lifted
484       * in the future:
485       */
486      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
487      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
488
489      brw_inst_set_src1_da_reg_nr(brw, inst, reg.nr);
490      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
491         brw_inst_set_src1_da1_subreg_nr(brw, inst, reg.subnr);
492      } else {
493         brw_inst_set_src1_da16_subreg_nr(brw, inst, reg.subnr / 16);
494      }
495
496      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
497	 if (reg.width == BRW_WIDTH_1 &&
498             brw_inst_exec_size(brw, inst) == BRW_EXECUTE_1) {
499            brw_inst_set_src1_hstride(brw, inst, BRW_HORIZONTAL_STRIDE_0);
500            brw_inst_set_src1_width(brw, inst, BRW_WIDTH_1);
501            brw_inst_set_src1_vstride(brw, inst, BRW_VERTICAL_STRIDE_0);
502	 } else {
503            brw_inst_set_src1_hstride(brw, inst, reg.hstride);
504            brw_inst_set_src1_width(brw, inst, reg.width);
505            brw_inst_set_src1_vstride(brw, inst, reg.vstride);
506	 }
507      } else {
508         brw_inst_set_src1_da16_swiz_x(brw, inst,
509            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
510         brw_inst_set_src1_da16_swiz_y(brw, inst,
511            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
512         brw_inst_set_src1_da16_swiz_z(brw, inst,
513            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
514         brw_inst_set_src1_da16_swiz_w(brw, inst,
515            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));
516
517	 /* This is an oddity of the fact we're using the same
518	  * descriptions for registers in align_16 as align_1:
519	  */
520	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
521            brw_inst_set_src1_vstride(brw, inst, BRW_VERTICAL_STRIDE_4);
522	 else
523            brw_inst_set_src1_vstride(brw, inst, reg.vstride);
524      }
525   }
526}
527
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct brw_context *brw = p->brw;

   /* Setting src1 to an immediate zero clears the whole descriptor,
    * including the function control bits, before the fields below are set.
    */
   brw_set_src1(p, inst, brw_imm_d(0));
   brw_inst_set_sfid(brw, inst, sfid);
   brw_inst_set_mlen(brw, inst, msg_length);
   brw_inst_set_rlen(brw, inst, response_length);
   brw_inst_set_eot(brw, inst, end_of_thread);

   /* The header-present bit only exists on Gen5+. */
   if (brw->gen >= 5) {
      brw_inst_set_header_present(brw, inst, header_present);
   }
}
557
558static void brw_set_math_message( struct brw_compile *p,
559				  struct brw_instruction *inst,
560				  unsigned function,
561				  unsigned integer_type,
562				  bool low_precision,
563				  unsigned dataType )
564{
565   struct brw_context *brw = p->brw;
566   unsigned msg_length;
567   unsigned response_length;
568
569   /* Infer message length from the function */
570   switch (function) {
571   case BRW_MATH_FUNCTION_POW:
572   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
573   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
574   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
575      msg_length = 2;
576      break;
577   default:
578      msg_length = 1;
579      break;
580   }
581
582   /* Infer response length from the function */
583   switch (function) {
584   case BRW_MATH_FUNCTION_SINCOS:
585   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
586      response_length = 2;
587      break;
588   default:
589      response_length = 1;
590      break;
591   }
592
593
594   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
595			      msg_length, response_length, false, false);
596   brw_inst_set_math_msg_function(brw, inst, function);
597   brw_inst_set_math_msg_signed_int(brw, inst, integer_type);
598   brw_inst_set_math_msg_precision(brw, inst, low_precision);
599   brw_inst_set_math_msg_saturate(brw, inst, brw_inst_saturate(brw, inst));
600   brw_inst_set_math_msg_data_type(brw, inst, dataType);
601   brw_inst_set_saturate(brw, inst, 0);
602}
603
604
605static void brw_set_ff_sync_message(struct brw_compile *p,
606				    struct brw_instruction *insn,
607				    bool allocate,
608				    unsigned response_length,
609				    bool end_of_thread)
610{
611   const struct brw_context *brw = p->brw;
612
613   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
614			      1, response_length, true, end_of_thread);
615   brw_inst_set_urb_opcode(brw, insn, 1); /* FF_SYNC */
616   brw_inst_set_urb_allocate(brw, insn, allocate);
617   /* The following fields are not used by FF_SYNC: */
618   brw_inst_set_urb_global_offset(brw, insn, 0);
619   brw_inst_set_urb_swizzle_control(brw, insn, 0);
620   brw_inst_set_urb_used(brw, insn, 0);
621   brw_inst_set_urb_complete(brw, insn, 0);
622}
623
/**
 * Program the URB write message descriptor for \p insn.
 *
 * \param flags           bitmask of BRW_URB_WRITE_* behaviors
 * \param msg_length      number of payload registers (including header)
 * \param response_length number of destination registers written back
 * \param offset          global offset into the URB entry
 * \param swizzle_control interleaving mode for the written data
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
                                 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   struct brw_context *brw = p->brw;

   /* Reject flags/modes that don't exist on this generation. */
   assert(brw->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(brw->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(brw->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   /* Must come first: this zeroes the function control bits. */
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(brw, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(brw, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(brw, insn, offset);
   brw_inst_set_urb_swizzle_control(brw, insn, swizzle_control);

   /* The "complete" bit went away on Gen8. */
   if (brw->gen < 8) {
      brw_inst_set_urb_complete(brw, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (brw->gen < 7) {
      /* allocate/used bits exist only before Gen7... */
      brw_inst_set_urb_allocate(brw, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(brw, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      /* ...replaced by the per-slot-offset bit on Gen7+. */
      brw_inst_set_urb_per_slot_offset(brw, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
664
665void
666brw_set_dp_write_message(struct brw_compile *p,
667			 struct brw_instruction *insn,
668			 unsigned binding_table_index,
669			 unsigned msg_control,
670			 unsigned msg_type,
671			 unsigned msg_length,
672			 bool header_present,
673			 unsigned last_render_target,
674			 unsigned response_length,
675			 unsigned end_of_thread,
676			 unsigned send_commit_msg)
677{
678   struct brw_context *brw = p->brw;
679   unsigned sfid;
680
681   if (brw->gen >= 7) {
682      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
683      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
684	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
685      else
686	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
687   } else if (brw->gen == 6) {
688      /* Use the render cache for all write messages. */
689      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
690   } else {
691      sfid = BRW_SFID_DATAPORT_WRITE;
692   }
693
694   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
695			      header_present, end_of_thread);
696
697   brw_inst_set_binding_table_index(brw, insn, binding_table_index);
698   brw_inst_set_dp_write_msg_type(brw, insn, msg_type);
699   brw_inst_set_dp_write_msg_control(brw, insn, msg_control);
700   brw_inst_set_rt_last(brw, insn, last_render_target);
701   if (brw->gen < 7) {
702      brw_inst_set_dp_write_commit(brw, insn, send_commit_msg);
703   }
704}
705
706void
707brw_set_dp_read_message(struct brw_compile *p,
708			struct brw_instruction *insn,
709			unsigned binding_table_index,
710			unsigned msg_control,
711			unsigned msg_type,
712			unsigned target_cache,
713			unsigned msg_length,
714                        bool header_present,
715			unsigned response_length)
716{
717   struct brw_context *brw = p->brw;
718   unsigned sfid;
719
720   if (brw->gen >= 7) {
721      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
722   } else if (brw->gen == 6) {
723      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
724	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
725      else
726	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
727   } else {
728      sfid = BRW_SFID_DATAPORT_READ;
729   }
730
731   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
732			      header_present, false);
733
734   brw_inst_set_binding_table_index(brw, insn, binding_table_index);
735   brw_inst_set_dp_read_msg_type(brw, insn, msg_type);
736   brw_inst_set_dp_read_msg_control(brw, insn, msg_control);
737   if (brw->gen < 6)
738      brw_inst_set_dp_read_target_cache(brw, insn, target_cache);
739}
740
/**
 * Program the sampler message descriptor for \p inst.
 *
 * \param simd_mode     sampler SIMD mode; only encoded on Gen5+
 * \param return_format sampler result format; only encoded on original
 *                      Gen4 (not G4x), where it is part of the descriptor
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   struct brw_context *brw = p->brw;

   /* Must come first: this zeroes the function control bits. */
   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   brw_inst_set_binding_table_index(brw, inst, binding_table_index);
   brw_inst_set_sampler(brw, inst, sampler);
   brw_inst_set_sampler_msg_type(brw, inst, msg_type);
   if (brw->gen >= 5) {
      brw_inst_set_sampler_simd_mode(brw, inst, simd_mode);
   } else if (brw->gen == 4 && !brw->is_g4x) {
      /* Gen4 (but not G4x) encodes the return format instead. */
      brw_inst_set_sampler_return_format(brw, inst, return_format);
   }
}
767
/**
 * Program the Gen7+ scratch-space block read/write message descriptor
 * (data cache SFID, "scratch" message category).
 *
 * \param write                  true for a scratch write, false for a read
 * \param dword                  selects the scratch data-type bit
 *                               (presumably dword-scattered vs. OWord
 *                               block access — confirm against brw_inst.h)
 * \param invalidate_after_read  invalidate-after-read descriptor bit
 * \param num_regs               registers transferred; must be a power of
 *                               two (1/2/4, plus 8 on Gen8+)
 */
static void
gen7_set_dp_scratch_message(struct brw_compile *p,
                            struct brw_instruction *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct brw_context *brw = p->brw;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (brw->gen >= 8 && num_regs == 8));
   /* Must come first: this zeroes the function control bits. */
   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(brw, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(brw, inst, write);
   brw_inst_set_scratch_type(brw, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(brw, inst, invalidate_after_read);
   /* num_regs is a power of two (see assert), so ffs()-1 is its log2. */
   brw_inst_set_scratch_block_size(brw, inst, ffs(num_regs) - 1);
   brw_inst_set_scratch_addr_offset(brw, inst, addr_offset);
}
792
793#define next_insn brw_next_insn
794struct brw_instruction *
795brw_next_insn(struct brw_compile *p, unsigned opcode)
796{
797   const struct brw_context *brw = p->brw;
798   struct brw_instruction *insn;
799
800   if (p->nr_insn + 1 > p->store_size) {
801      p->store_size <<= 1;
802      p->store = reralloc(p->mem_ctx, p->store,
803                          struct brw_instruction, p->store_size);
804   }
805
806   p->next_insn_offset += 16;
807   insn = &p->store[p->nr_insn++];
808   memcpy(insn, p->current, sizeof(*insn));
809
810   brw_inst_set_opcode(brw, insn, opcode);
811   return insn;
812}
813
814static struct brw_instruction *brw_alu1( struct brw_compile *p,
815					 unsigned opcode,
816					 struct brw_reg dest,
817					 struct brw_reg src )
818{
819   struct brw_instruction *insn = next_insn(p, opcode);
820   brw_set_dest(p, insn, dest);
821   brw_set_src0(p, insn, src);
822   return insn;
823}
824
825static struct brw_instruction *brw_alu2(struct brw_compile *p,
826					unsigned opcode,
827					struct brw_reg dest,
828					struct brw_reg src0,
829					struct brw_reg src1 )
830{
831   struct brw_instruction *insn = next_insn(p, opcode);
832   brw_set_dest(p, insn, dest);
833   brw_set_src0(p, insn, src0);
834   brw_set_src1(p, insn, src1);
835   return insn;
836}
837
838static int
839get_3src_subreg_nr(struct brw_reg reg)
840{
841   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
842      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
843      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
844   } else {
845      return reg.subnr / 4;
846   }
847}
848
/* Emit a three-source instruction (MAD, LRP, BFE, BFI2).  These use a
 * distinct, more restrictive encoding than one/two-source instructions:
 * align16 access mode only, direct addressing only, GRF-only sources,
 * and a single shared type field for all three sources.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					unsigned opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(brw, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (brw->gen == 6) {
      /* Gen6 has a dst file bit for MRF destinations; later gens do not
       * (MRFs were converted to GRFs above).
       */
      brw_inst_set_3src_dst_reg_file(brw, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(brw, inst, dest.nr);
   /* The destination subregister field is encoded as subnr / 16. */
   brw_inst_set_3src_dst_subreg_nr(brw, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(brw, inst, dest.dw1.bits.writemask);
   guess_execution_size(p, inst, dest);

   /* rep_ctrl is set for scalar (vstride 0) sources, replicating the
    * swizzle-selected channel picked in get_3src_subreg_nr().
    */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(brw, inst, src0.dw1.bits.swizzle);
   brw_inst_set_3src_src0_subreg_nr(brw, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(brw, inst, src0.nr);
   brw_inst_set_3src_src0_abs(brw, inst, src0.abs);
   brw_inst_set_3src_src0_negate(brw, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(brw, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(brw, inst, src1.dw1.bits.swizzle);
   brw_inst_set_3src_src1_subreg_nr(brw, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(brw, inst, src1.nr);
   brw_inst_set_3src_src1_abs(brw, inst, src1.abs);
   brw_inst_set_3src_src1_negate(brw, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(brw, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(brw, inst, src2.dw1.bits.swizzle);
   brw_inst_set_3src_src2_subreg_nr(brw, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(brw, inst, src2.nr);
   brw_inst_set_3src_src2_abs(brw, inst, src2.abs);
   brw_inst_set_3src_src2_negate(brw, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(brw, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (brw->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(brw, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(brw, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(brw, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(brw, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(brw, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(brw, inst, BRW_3SRC_TYPE_UD);
         break;
      }
   }

   return inst;
}
937
938
939/***********************************************************************
940 * Convenience routines.
941 */
/* ALUn(OP) defines a public emitter brw_OP() that forwards to brw_alun()
 * with opcode BRW_OPCODE_OP.  ALU3F is ALU3 plus assertions that all
 * four operands are floats (required by MAD and LRP; see brw_alu3).
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

#define ALU3F(OP)                                               \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
982
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 * (The 'add' local below is therefore only used on gen < 6.)
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_context *brw = p->brw;					      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (brw->gen < 6) {							      \
      /* turn on round-increments */					      \
      brw_inst_set_cond_modifier(brw, rnd, BRW_CONDITIONAL_R);                \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      brw_inst_set_pred_control(brw, add, BRW_PREDICATE_NORMAL);              \
   }									      \
}
1008
1009
/* Instantiate the public emitters for the simple 1-, 2- and 3-source
 * opcodes and the two flag-based rounding modes.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1046
1047
1048struct brw_instruction *brw_ADD(struct brw_compile *p,
1049				struct brw_reg dest,
1050				struct brw_reg src0,
1051				struct brw_reg src1)
1052{
1053   /* 6.2.2: add */
1054   if (src0.type == BRW_REGISTER_TYPE_F ||
1055       (src0.file == BRW_IMMEDIATE_VALUE &&
1056	src0.type == BRW_REGISTER_TYPE_VF)) {
1057      assert(src1.type != BRW_REGISTER_TYPE_UD);
1058      assert(src1.type != BRW_REGISTER_TYPE_D);
1059   }
1060
1061   if (src1.type == BRW_REGISTER_TYPE_F ||
1062       (src1.file == BRW_IMMEDIATE_VALUE &&
1063	src1.type == BRW_REGISTER_TYPE_VF)) {
1064      assert(src0.type != BRW_REGISTER_TYPE_UD);
1065      assert(src0.type != BRW_REGISTER_TYPE_D);
1066   }
1067
1068   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1069}
1070
1071struct brw_instruction *brw_AVG(struct brw_compile *p,
1072                                struct brw_reg dest,
1073                                struct brw_reg src0,
1074                                struct brw_reg src1)
1075{
1076   assert(dest.type == src0.type);
1077   assert(src0.type == src1.type);
1078   switch (src0.type) {
1079   case BRW_REGISTER_TYPE_B:
1080   case BRW_REGISTER_TYPE_UB:
1081   case BRW_REGISTER_TYPE_W:
1082   case BRW_REGISTER_TYPE_UW:
1083   case BRW_REGISTER_TYPE_D:
1084   case BRW_REGISTER_TYPE_UD:
1085      break;
1086   default:
1087      assert(!"Bad type for brw_AVG");
1088   }
1089
1090   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1091}
1092
1093struct brw_instruction *brw_MUL(struct brw_compile *p,
1094				struct brw_reg dest,
1095				struct brw_reg src0,
1096				struct brw_reg src1)
1097{
1098   /* 6.32.38: mul */
1099   if (src0.type == BRW_REGISTER_TYPE_D ||
1100       src0.type == BRW_REGISTER_TYPE_UD ||
1101       src1.type == BRW_REGISTER_TYPE_D ||
1102       src1.type == BRW_REGISTER_TYPE_UD) {
1103      assert(dest.type != BRW_REGISTER_TYPE_F);
1104   }
1105
1106   if (src0.type == BRW_REGISTER_TYPE_F ||
1107       (src0.file == BRW_IMMEDIATE_VALUE &&
1108	src0.type == BRW_REGISTER_TYPE_VF)) {
1109      assert(src1.type != BRW_REGISTER_TYPE_UD);
1110      assert(src1.type != BRW_REGISTER_TYPE_D);
1111   }
1112
1113   if (src1.type == BRW_REGISTER_TYPE_F ||
1114       (src1.file == BRW_IMMEDIATE_VALUE &&
1115	src1.type == BRW_REGISTER_TYPE_VF)) {
1116      assert(src0.type != BRW_REGISTER_TYPE_UD);
1117      assert(src0.type != BRW_REGISTER_TYPE_D);
1118   }
1119
1120   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1121	  src0.nr != BRW_ARF_ACCUMULATOR);
1122   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1123	  src1.nr != BRW_ARF_ACCUMULATOR);
1124
1125   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1126}
1127
1128
1129void brw_NOP(struct brw_compile *p)
1130{
1131   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1132   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1133   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1134   brw_set_src1(p, insn, brw_imm_ud(0x0));
1135}
1136
1137
1138
1139
1140
1141/***********************************************************************
1142 * Comparisons, if/else/endif
1143 */
1144
1145struct brw_instruction *brw_JMPI(struct brw_compile *p,
1146                                 struct brw_reg index,
1147                                 unsigned predicate_control)
1148{
1149   const struct brw_context *brw = p->brw;
1150   struct brw_reg ip = brw_ip_reg();
1151   struct brw_instruction *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1152
1153   brw_inst_set_exec_size(brw, inst, BRW_EXECUTE_2);
1154   brw_inst_set_qtr_control(brw, inst, BRW_COMPRESSION_NONE);
1155   brw_inst_set_mask_control(brw, inst, BRW_MASK_DISABLE);
1156   brw_inst_set_pred_control(brw, inst, predicate_control);
1157
1158   return inst;
1159}
1160
1161static void
1162push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1163{
1164   p->if_stack[p->if_stack_depth] = inst - p->store;
1165
1166   p->if_stack_depth++;
1167   if (p->if_stack_array_size <= p->if_stack_depth) {
1168      p->if_stack_array_size *= 2;
1169      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1170			     p->if_stack_array_size);
1171   }
1172}
1173
1174static struct brw_instruction *
1175pop_if_stack(struct brw_compile *p)
1176{
1177   p->if_stack_depth--;
1178   return &p->store[p->if_stack[p->if_stack_depth]];
1179}
1180
1181static void
1182push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1183{
1184   if (p->loop_stack_array_size < p->loop_stack_depth) {
1185      p->loop_stack_array_size *= 2;
1186      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1187			       p->loop_stack_array_size);
1188      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1189				     p->loop_stack_array_size);
1190   }
1191
1192   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1193   p->loop_stack_depth++;
1194   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1195}
1196
1197static struct brw_instruction *
1198get_inner_do_insn(struct brw_compile *p)
1199{
1200   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1201}
1202
1203/* EU takes the value from the flag register and pushes it onto some
1204 * sort of a stack (presumably merging with any flag value already on
1205 * the stack).  Within an if block, the flags at the top of the stack
1206 * control execution on each channel of the unit, eg. on each of the
1207 * 16 pixel values in our wm programs.
1208 *
1209 * When the matching 'else' instruction is reached (presumably by
1210 * countdown of the instruction count patched in by our ELSE/ENDIF
1211 * functions), the relevent flags are inverted.
1212 *
1213 * When the matching 'endif' instruction is reached, the flags are
1214 * popped off.  If the stack is now empty, normal execution resumes.
1215 */
/* Emit an IF instruction with zeroed jump targets; the real targets are
 * patched in later by brw_ENDIF (via patch_IF_ELSE).  The instruction is
 * pushed on the if stack so that the matching ELSE/ENDIF can find it.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (brw->gen < 6) {
      /* Pre-gen6: IF operates on IP; the jump count lives in the src1
       * immediate and is patched later.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      /* Gen6: the jump count lives in a dedicated field; operands are
       * null.
       */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(brw, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+: jump targets are the JIP/UIP fields, zeroed for now. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      brw_inst_set_jip(brw, insn, 0);
      brw_inst_set_uip(brw, insn, 0);
   }

   brw_inst_set_exec_size(brw, insn, execute_size);
   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(brw, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(brw, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && brw->gen < 6)
      brw_inst_set_thread_control(brw, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1254
1255/* This function is only used for gen6-style IF instructions with an
1256 * embedded comparison (conditional modifier).  It is not used on gen7.
1257 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   const struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The jump count is patched later by brw_ENDIF/patch_IF_ELSE. */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(brw, insn, p->compressed ? BRW_EXECUTE_16
                                                   : BRW_EXECUTE_8);
   brw_inst_set_gen6_jump_count(brw, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The comparison is embedded in the IF itself, so there must be no
    * predication; the condition is applied via the conditional modifier.
    */
   assert(brw_inst_qtr_control(brw, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(brw, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(brw, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1281
1282/**
1283 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1284 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   const struct brw_context *brw = p->brw;

   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(brw, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(brw, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(brw, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(brw, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(brw, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(brw, else_inst, BRW_OPCODE_ELSE == BRW_OPCODE_ELSE ? BRW_OPCODE_ADD : BRW_OPCODE_ADD);

      /* Jump offsets are byte distances; each instruction is 16 bytes. */
      brw_inst_set_imm_ud(brw, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(brw, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(brw, if_inst, (next_inst - if_inst) * 16);
   }
}
1323
1324/**
1325 * Patch IF and ELSE instructions with appropriate jump targets.
1326 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct brw_context *brw = p->brw;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (brw->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(brw, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(brw, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (brw->gen >= 5)
      br = 2;

   assert(brw_inst_opcode(brw, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(brw, endif_inst, brw_inst_exec_size(brw, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (brw->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(brw, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(brw, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(brw, if_inst, 0);
      } else if (brw->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(brw, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gen7+: both the jump target (JIP) and the all-channels-false
          * target (UIP) are the ENDIF.
          */
         brw_inst_set_uip(brw, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(brw, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(brw, else_inst, brw_inst_exec_size(brw, if_inst));

      /* Patch IF -> ELSE */
      if (brw->gen < 6) {
         brw_inst_set_gen4_jump_count(brw, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(brw, if_inst, 0);
      } else if (brw->gen == 6) {
         brw_inst_set_gen6_jump_count(brw, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (brw->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gen4_jump_count(brw, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(brw, else_inst, 1);
      } else if (brw->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(brw, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(brw, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(brw, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(brw, else_inst, br * (endif_inst - else_inst));
      }
   }
}
1415
/* Emit an ELSE with zeroed jump targets (patched by brw_ENDIF) and push
 * it on the if stack on top of the matching IF.
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (brw->gen < 6) {
      /* Pre-gen6: ELSE operates on IP like IF does. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(brw, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      brw_inst_set_jip(brw, insn, 0);
      brw_inst_set_uip(brw, insn, 0);
   }

   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(brw, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && brw->gen < 6)
      brw_inst_set_thread_control(brw, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1448
/* Close an IF/ELSE block: pop the matching IF (and optional ELSE) off the
 * if stack, emit the ENDIF (unless converting to ADDs in SPF mode on
 * gen4/5), and patch the jump targets of the whole construct.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (brw->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory (p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(brw, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (brw->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (brw->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(brw, insn, BRW_MASK_ENABLE);
   if (brw->gen < 6)
      brw_inst_set_thread_control(brw, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (brw->gen < 6) {
      brw_inst_set_gen4_jump_count(brw, insn, 0);
      brw_inst_set_gen4_pop_count(brw, insn, 1);
   } else if (brw->gen == 6) {
      brw_inst_set_gen6_jump_count(brw, insn, 2);
   } else {
      brw_inst_set_jip(brw, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1527
/* Emit a BREAK.  Pre-gen6 it operates on IP and records how many if-stack
 * entries must be popped when leaving the loop; the jump count itself is
 * patched later by brw_patch_break_cont.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (brw->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pop every IF opened inside the current loop. */
      brw_inst_set_gen4_pop_count(brw, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_8);

   return insn;
}
1550
1551struct brw_instruction *gen6_CONT(struct brw_compile *p)
1552{
1553   const struct brw_context *brw = p->brw;
1554   struct brw_instruction *insn;
1555
1556   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1557   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1558   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1559   brw_set_dest(p, insn, brw_ip_reg());
1560   brw_set_src0(p, insn, brw_ip_reg());
1561   brw_set_src1(p, insn, brw_imm_d(0x0));
1562
1563   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1564   brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_8);
1565   return insn;
1566}
1567
/* Emit a pre-gen6 CONTINUE.  Like BREAK, it operates on IP, records the
 * number of if-stack entries to pop, and has its jump count patched later
 * by brw_patch_break_cont.
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   const struct brw_context *brw = p->brw;
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_8);
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   brw_inst_set_gen4_pop_count(brw, insn,
                               p->if_depth_in_loop[p->loop_stack_depth]);
   return insn;
}
1583
/* Emit a gen6+ HALT with zeroed jump targets; UIP and JIP are filled in
 * later by the caller (src1 holds both on these gens).
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   const struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_16);
   } else {
      brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_8);
   }
   return insn;
}
1602
1603/* DO/WHILE loop:
1604 *
1605 * The DO/WHILE is just an unterminated loop -- break or continue are
1606 * used for control within the loop.  We have a few ways they can be
1607 * done.
1608 *
1609 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1610 * jip and no DO instruction.
1611 *
1612 * For non-uniform control flow pre-gen6, there's a DO instruction to
1613 * push the mask, and a WHILE to jump back, and BREAK to get out and
1614 * pop the mask.
1615 *
1616 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1617 * just points back to the first instruction of the loop.
1618 */
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
   struct brw_context *brw = p->brw;

   if (brw->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; record the position of the next
       * instruction as the loop start for WHILE to jump back to.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(brw, insn, execute_size);
      brw_inst_set_pred_control(brw, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1644
1645/**
1646 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1647 * instruction here.
1648 *
1649 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1650 * nesting, since it can always just point to the end of the block/current loop.
1651 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Jump counts are in 64-bit chunks on gen5 (2 per instruction). */
   int br = (brw->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to the matching DO, patching every
    * BREAK and CONTINUE that belongs to this loop.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(brw, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gen4_jump_count(brw, inst) == 0) {
         /* BREAK jumps just past the WHILE. */
         brw_inst_set_gen4_jump_count(brw, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(brw, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gen4_jump_count(brw, inst) == 0) {
         /* CONTINUE jumps to the WHILE itself. */
         brw_inst_set_gen4_jump_count(brw, inst, br * (while_inst - inst));
      }
   }
}
1674
/**
 * Emit the instruction that closes the innermost loop opened by brw_DO()
 * and pop one level off the loop stack.  Returns the emitted instruction.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn, *do_insn;
   unsigned br = 1;   /* jump-offset scale factor; 2 on gen5+ */

   if (brw->gen >= 5)
      br = 2;

   if (brw->gen >= 7) {
      /* Gen7+: the backward branch is encoded in the JIP field. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* do_insn precedes this WHILE, so the offset is negative (backward). */
      brw_inst_set_jip(brw, insn, br * (do_insn - insn));

      brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_8);
   } else if (brw->gen == 6) {
      /* Gen6: same idea, but the offset lives in the gen6 jump count. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(brw, insn, br * (do_insn - insn));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_8);
   } else {
      if (p->single_program_flow) {
	 /* Uniform control flow: a plain scalar ADD to the IP register
	  * jumps back to the loop top (instruction delta scaled by 16).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_1);
      } else {
	 /* Pre-gen6 non-uniform flow: a real WHILE paired with the DO
	  * emitted by brw_DO(), using the gen4 jump count / pop count.
	  */
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(brw, do_insn) == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

         /* The WHILE inherits the execution size of its DO. */
         brw_inst_set_exec_size(brw, insn, brw_inst_exec_size(brw, do_insn));
         brw_inst_set_gen4_jump_count(brw, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(brw, insn, 0);

	 /* The loop end is now known; fix up BREAK/CONTINUE jump counts. */
	 brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1736
1737/* FORWARD JUMPS:
1738 */
1739void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1740{
1741   struct brw_context *brw = p->brw;
1742   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1743   unsigned jmpi = 1;
1744
1745   if (brw->gen >= 5)
1746      jmpi = 2;
1747
1748   assert(brw_inst_opcode(brw, jmp_insn) == BRW_OPCODE_JMPI);
1749   assert(brw_inst_src1_reg_file(brw, jmp_insn) == BRW_IMMEDIATE_VALUE);
1750
1751   brw_inst_set_gen4_jump_count(brw, jmp_insn,
1752                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
1753}
1754
1755/* To integrate with the above, it makes sense that the comparison
1756 * instruction should populate the flag register.  It might be simpler
1757 * just to use the flag reg for most WM tasks?
1758 */
1759void brw_CMP(struct brw_compile *p,
1760	     struct brw_reg dest,
1761	     unsigned conditional,
1762	     struct brw_reg src0,
1763	     struct brw_reg src1)
1764{
1765   struct brw_context *brw = p->brw;
1766   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1767
1768   brw_inst_set_cond_modifier(brw, insn, conditional);
1769   brw_set_dest(p, insn, dest);
1770   brw_set_src0(p, insn, src0);
1771   brw_set_src1(p, insn, src1);
1772
1773   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1774    * page says:
1775    *    "Any CMP instruction with a null destination must use a {switch}."
1776    *
1777    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1778    * mentioned on their work-arounds pages.
1779    */
1780   if (brw->gen == 7) {
1781      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1782          dest.nr == BRW_ARF_NULL) {
1783         brw_inst_set_thread_control(brw, insn, BRW_THREAD_SWITCH);
1784      }
1785   }
1786}
1787
1788/***********************************************************************
1789 * Helpers for the various SEND message types:
1790 */
1791
1792/** Extended math function, float[8].
1793 */
1794void gen4_math(struct brw_compile *p,
1795	       struct brw_reg dest,
1796	       unsigned function,
1797	       unsigned msg_reg_nr,
1798	       struct brw_reg src,
1799	       unsigned data_type,
1800	       unsigned precision )
1801{
1802   struct brw_context *brw = p->brw;
1803   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1804
1805   assert(brw->gen < 6);
1806
1807   /* Example code doesn't set predicate_control for send
1808    * instructions.
1809    */
1810   brw_inst_set_pred_control(brw, insn, 0);
1811   brw_inst_set_base_mrf(brw, insn, msg_reg_nr);
1812
1813   brw_set_dest(p, insn, dest);
1814   brw_set_src0(p, insn, src);
1815   brw_set_math_message(p,
1816                        insn,
1817                        function,
1818                        src.type == BRW_REGISTER_TYPE_D,
1819                        precision,
1820                        data_type);
1821}
1822
1823void gen6_math(struct brw_compile *p,
1824	       struct brw_reg dest,
1825	       unsigned function,
1826	       struct brw_reg src0,
1827	       struct brw_reg src1)
1828{
1829   struct brw_context *brw = p->brw;
1830   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1831
1832   assert(brw->gen >= 6);
1833
1834   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1835          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1836   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1837
1838   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1839   if (brw->gen == 6) {
1840      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1841      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1842   }
1843
1844   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1845       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1846       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1847      assert(src0.type != BRW_REGISTER_TYPE_F);
1848      assert(src1.type != BRW_REGISTER_TYPE_F);
1849      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1850   } else {
1851      assert(src0.type == BRW_REGISTER_TYPE_F);
1852      assert(src1.type == BRW_REGISTER_TYPE_F);
1853      if (function == BRW_MATH_FUNCTION_POW) {
1854         assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1855      } else {
1856         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1857                src1.nr == BRW_ARF_NULL);
1858      }
1859   }
1860
1861   /* Source modifiers are ignored for extended math instructions on Gen6. */
1862   if (brw->gen == 6) {
1863      assert(!src0.negate);
1864      assert(!src0.abs);
1865      assert(!src1.negate);
1866      assert(!src1.abs);
1867   }
1868
1869   brw_inst_set_math_function(brw, insn, function);
1870
1871   brw_set_dest(p, insn, dest);
1872   brw_set_src0(p, insn, src0);
1873   brw_set_src1(p, insn, src1);
1874}
1875
1876
1877/**
1878 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1879 * using a constant offset per channel.
1880 *
1881 * The offset must be aligned to oword size (16 bytes).  Used for
1882 * register spilling.
1883 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control, msg_type;
   int mlen;

   /* Gen6+ expresses the scratch offset in owords (16 bytes), not bytes. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One GRF of payload is two owords; mlen also counts the header reg. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* SENDs may not be compressed; widen the header read instead. */
      if (brw_inst_qtr_control(brw, insn) != BRW_COMPRESSION_NONE) {
         brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
	 src_header = vec16(src_header);
      }
      assert(brw_inst_pred_control(brw, insn) == BRW_PREDICATE_NONE);
      if (brw->gen < 6)
         brw_inst_set_base_mrf(brw, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (brw->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (brw->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1986
1987
1988/**
1989 * Read a block of owords (half a GRF each) from the scratch buffer
1990 * using a constant index per channel.
1991 *
1992 * Offset must be aligned to oword size (16 bytes).  Used for register
1993 * spilling.
1994 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   struct brw_context *brw = p->brw;
   uint32_t msg_control;
   int rlen;

   /* Gen6+ expresses the scratch offset in owords (16 bytes), not bytes. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* One GRF of response is two owords; rlen counts response registers. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in the MRF: a copy of g0 with the scratch
    * offset written into element 2 (see the write-scratch variant above).
    */
   {
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(brw, insn) == 0);
      brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (brw->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(brw, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
			      insn,
			      255, /* binding table index (255=stateless) */
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
                              true, /* header_present */
			      rlen);
   }
}
2062
2063void
2064gen7_block_read_scratch(struct brw_compile *p,
2065                        struct brw_reg dest,
2066                        int num_regs,
2067                        unsigned offset)
2068{
2069   const struct brw_context *brw = p->brw;
2070   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2071   assert(brw_inst_pred_control(brw, insn) == BRW_PREDICATE_NONE);
2072
2073   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
2074   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2075
2076   /* The HW requires that the header is present; this is to get the g0.5
2077    * scratch offset.
2078    */
2079   brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2080
2081   /* According to the docs, offset is "A 12-bit HWord offset into the memory
2082    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2083    * is 32 bytes, which happens to be the size of a register.
2084    */
2085   offset /= REG_SIZE;
2086   assert(offset < (1 << 12));
2087
2088   gen7_set_dp_scratch_message(p, insn,
2089                               false, /* scratch read */
2090                               false, /* OWords */
2091                               false, /* invalidate after read */
2092                               num_regs,
2093                               offset,
2094                               1,        /* mlen: just g0 */
2095                               num_regs, /* rlen */
2096                               true);    /* header present */
2097}
2098
2099/**
2100 * Read a float[4] vector from the data port Data Cache (const buffer).
2101 * Location (in buffer) should be a multiple of 16.
2102 * Used for fetching shader constants.
2103 */
void brw_oword_block_read(struct brw_compile *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   struct brw_context *brw = p->brw;

   /* On newer hardware, offset is in units of owords. */
   if (brw->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* The header MOVs must not be predicated, compressed, or masked, so
    * save/restore the default emission state around them.
    */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (brw->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      /* Pre-gen6 SENDs take the message through base_mrf instead of src0. */
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(brw, insn, mrf.nr);
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
                           true, /* header_present */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2157
2158
2159void brw_fb_WRITE(struct brw_compile *p,
2160		  int dispatch_width,
2161                  unsigned msg_reg_nr,
2162                  struct brw_reg src0,
2163                  unsigned msg_control,
2164                  unsigned binding_table_index,
2165                  unsigned msg_length,
2166                  unsigned response_length,
2167                  bool eot,
2168                  bool header_present)
2169{
2170   struct brw_context *brw = p->brw;
2171   struct brw_instruction *insn;
2172   unsigned msg_type;
2173   struct brw_reg dest;
2174
2175   if (dispatch_width == 16)
2176      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2177   else
2178      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2179
2180   if (brw->gen >= 6) {
2181      insn = next_insn(p, BRW_OPCODE_SENDC);
2182   } else {
2183      insn = next_insn(p, BRW_OPCODE_SEND);
2184   }
2185   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
2186
2187   if (brw->gen >= 6) {
2188      /* headerless version, just submit color payload */
2189      src0 = brw_message_reg(msg_reg_nr);
2190
2191      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2192   } else {
2193      brw_inst_set_base_mrf(brw, insn, msg_reg_nr);
2194
2195      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2196   }
2197
2198   brw_set_dest(p, insn, dest);
2199   brw_set_src0(p, insn, src0);
2200   brw_set_dp_write_message(p,
2201			    insn,
2202			    binding_table_index,
2203			    msg_control,
2204			    msg_type,
2205			    msg_length,
2206			    header_present,
2207			    eot, /* last render target write */
2208			    response_length,
2209			    eot,
2210			    0 /* send_commit_msg */);
2211}
2212
2213
2214/**
2215 * Texture sample instruction.
2216 * Note: the msg_type plus msg_length values determine exactly what kind
2217 * of sampling operation is performed.  See volume 4, page 161 of docs.
2218 */
void brw_SAMPLE(struct brw_compile *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   /* msg_reg_nr is unsigned; -1 (i.e. UINT_MAX after conversion) is the
    * sentinel meaning "no implied move to the message registers".
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(brw, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (brw_inst_qtr_control(brw, insn) != BRW_COMPRESSION_2NDHALF)
      brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);

   /* Pre-gen6 SENDs take the message register through base_mrf. */
   if (brw->gen < 6)
      brw_inst_set_base_mrf(brw, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2271
2272/* All these variables are pretty confusing - we might be better off
2273 * using bitmasks and macros for this, in the old style.  Or perhaps
2274 * just having the caller instantiate the fields in dword3 itself.
2275 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   struct brw_context *brw = p->brw;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (brw->gen == 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 from g0.5 into dword 5 of the message header, with
       * masking disabled so it happens regardless of channel enables.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6 SENDs take the message register through base_mrf. */
   if (brw->gen < 6)
      brw_inst_set_base_mrf(brw, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2322
2323static int
2324brw_find_next_block_end(struct brw_compile *p, int start_offset)
2325{
2326   int offset;
2327   void *store = p->store;
2328   const struct brw_context *brw = p->brw;
2329
2330   for (offset = next_offset(brw, store, start_offset);
2331        offset < p->next_insn_offset;
2332        offset = next_offset(brw, store, offset)) {
2333      struct brw_instruction *insn = store + offset;
2334
2335      switch (brw_inst_opcode(brw, insn)) {
2336      case BRW_OPCODE_ENDIF:
2337      case BRW_OPCODE_ELSE:
2338      case BRW_OPCODE_WHILE:
2339      case BRW_OPCODE_HALT:
2340	 return offset;
2341      }
2342   }
2343
2344   return 0;
2345}
2346
2347/* There is no DO instruction on gen6, so to find the end of the loop
2348 * we have to see if the loop is jumping back before our start
2349 * instruction.
2350 */
2351static int
2352brw_find_loop_end(struct brw_compile *p, int start_offset)
2353{
2354   struct brw_context *brw = p->brw;
2355   int offset;
2356   int scale = 8;
2357   void *store = p->store;
2358
2359   /* Always start after the instruction (such as a WHILE) we're trying to fix
2360    * up.
2361    */
2362   for (offset = next_offset(brw, store, start_offset);
2363        offset < p->next_insn_offset;
2364        offset = next_offset(brw, store, offset)) {
2365      struct brw_instruction *insn = store + offset;
2366
2367      if (brw_inst_opcode(brw, insn) == BRW_OPCODE_WHILE) {
2368         int jip = brw->gen == 6 ? brw_inst_gen6_jump_count(brw, insn)
2369                                 : brw_inst_jip(brw, insn);
2370	 if (offset + jip * scale <= start_offset)
2371	    return offset;
2372      }
2373   }
2374   assert(!"not reached");
2375   return start_offset;
2376}
2377
2378/* After program generation, go back and update the UIP and JIP of
2379 * BREAK, CONT, and HALT instructions to their correct locations.
2380 */
2381void
2382brw_set_uip_jip(struct brw_compile *p)
2383{
2384   struct brw_context *brw = p->brw;
2385   int offset;
2386   int scale = 8;
2387   void *store = p->store;
2388
2389   if (brw->gen < 6)
2390      return;
2391
2392   for (offset = 0; offset < p->next_insn_offset;
2393        offset = next_offset(brw, store, offset)) {
2394      struct brw_instruction *insn = store + offset;
2395
2396      if (brw_inst_cmpt_control(brw, insn)) {
2397	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2398         assert(brw_inst_opcode(brw, insn) != BRW_OPCODE_BREAK &&
2399                brw_inst_opcode(brw, insn) != BRW_OPCODE_CONTINUE &&
2400                brw_inst_opcode(brw, insn) != BRW_OPCODE_HALT);
2401	 continue;
2402      }
2403
2404      int block_end_offset = brw_find_next_block_end(p, offset);
2405      switch (brw_inst_opcode(brw, insn)) {
2406      case BRW_OPCODE_BREAK:
2407         assert(block_end_offset != 0);
2408         brw_inst_set_jip(brw, insn, (block_end_offset - offset) / scale);
2409	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2410         brw_inst_set_uip(brw, insn,
2411	    (brw_find_loop_end(p, offset) - offset +
2412             (brw->gen == 6 ? 16 : 0)) / scale);
2413	 break;
2414      case BRW_OPCODE_CONTINUE:
2415         assert(block_end_offset != 0);
2416         brw_inst_set_jip(brw, insn, (block_end_offset - offset) / scale);
2417         brw_inst_set_uip(brw, insn,
2418            (brw_find_loop_end(p, offset) - offset) / scale);
2419
2420         assert(brw_inst_uip(brw, insn) != 0);
2421         assert(brw_inst_jip(brw, insn) != 0);
2422	 break;
2423
2424      case BRW_OPCODE_ENDIF:
2425         if (block_end_offset == 0)
2426            brw_inst_set_jip(brw, insn, 2);
2427         else
2428            brw_inst_set_jip(brw, insn, (block_end_offset - offset) / scale);
2429	 break;
2430
2431      case BRW_OPCODE_HALT:
2432	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2433	  *
2434	  *    "In case of the halt instruction not inside any conditional
2435	  *     code block, the value of <JIP> and <UIP> should be the
2436	  *     same. In case of the halt instruction inside conditional code
2437	  *     block, the <UIP> should be the end of the program, and the
2438	  *     <JIP> should be end of the most inner conditional code block."
2439	  *
2440	  * The uip will have already been set by whoever set up the
2441	  * instruction.
2442	  */
2443	 if (block_end_offset == 0) {
2444            brw_inst_set_jip(brw, insn, brw_inst_uip(brw, insn));
2445	 } else {
2446            brw_inst_set_jip(brw, insn, (block_end_offset - offset) / scale);
2447	 }
2448         assert(brw_inst_uip(brw, insn) != 0);
2449         assert(brw_inst_jip(brw, insn) != 0);
2450	 break;
2451      }
2452   }
2453}
2454
2455void brw_ff_sync(struct brw_compile *p,
2456		   struct brw_reg dest,
2457		   unsigned msg_reg_nr,
2458		   struct brw_reg src0,
2459		   bool allocate,
2460		   unsigned response_length,
2461		   bool eot)
2462{
2463   struct brw_context *brw = p->brw;
2464   struct brw_instruction *insn;
2465
2466   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2467
2468   insn = next_insn(p, BRW_OPCODE_SEND);
2469   brw_set_dest(p, insn, dest);
2470   brw_set_src0(p, insn, src0);
2471   brw_set_src1(p, insn, brw_imm_d(0));
2472
2473   if (brw->gen < 6)
2474      brw_inst_set_base_mrf(brw, insn, msg_reg_nr);
2475
2476   brw_set_ff_sync_message(p,
2477			   insn,
2478			   allocate,
2479			   response_length,
2480			   eot);
2481}
2482
2483/**
2484 * Emit the SEND instruction necessary to generate stream output data on Gen6
2485 * (for transform feedback).
2486 *
2487 * If send_commit_msg is true, this is the last piece of stream output data
2488 * from this thread, so send the data as a committed write.  According to the
2489 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2490 *
2491 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2492 *   writes are complete by sending the final write as a committed write."
2493 */
2494void
2495brw_svb_write(struct brw_compile *p,
2496              struct brw_reg dest,
2497              unsigned msg_reg_nr,
2498              struct brw_reg src0,
2499              unsigned binding_table_index,
2500              bool   send_commit_msg)
2501{
2502   struct brw_instruction *insn;
2503
2504   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2505
2506   insn = next_insn(p, BRW_OPCODE_SEND);
2507   brw_set_dest(p, insn, dest);
2508   brw_set_src0(p, insn, src0);
2509   brw_set_src1(p, insn, brw_imm_d(0));
2510   brw_set_dp_write_message(p, insn,
2511                            binding_table_index,
2512                            0, /* msg_control: ignored */
2513                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2514                            1, /* msg_length */
2515                            true, /* header_present */
2516                            0, /* last_render_target: ignored */
2517                            send_commit_msg, /* response_length */
2518                            0, /* end_of_thread */
2519                            send_commit_msg); /* send_commit_msg */
2520}
2521
/* Encode the message descriptor for an untyped atomic data-port operation.
 * Haswell routes these through data cache port 1 with Align1/Align16
 * variants; Ivybridge/Baytrail use the single data-cache SFID.
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  unsigned atomic_op,
                                  unsigned bind_table_index,
                                  unsigned msg_length,
                                  unsigned response_length,
                                  bool header_present)
{
   const struct brw_context *brw = p->brw;

   unsigned msg_control =
      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
      (response_length ? 1 << 5 : 0); /* Return data expected */

   if (brw->is_haswell) {
      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
                                 msg_length, response_length,
                                 header_present, false);


      if (brw_inst_access_mode(brw, insn) == BRW_ALIGN_1) {
         /* Anything narrower than SIMD16 executes as a SIMD8 message. */
         if (brw_inst_exec_size(brw, insn) != BRW_EXECUTE_16)
            msg_control |= 1 << 4; /* SIMD8 mode */

         brw_inst_set_dp_msg_type(brw, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
      } else {
         brw_inst_set_dp_msg_type(brw, insn,
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
      }
   } else {
      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                 msg_length, response_length,
                                 header_present, false);

      brw_inst_set_dp_msg_type(brw, insn, GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);

      if (brw_inst_exec_size(brw, insn) != BRW_EXECUTE_16)
         msg_control |= 1 << 4; /* SIMD8 mode */
   }

   brw_inst_set_binding_table_index(brw, insn, bind_table_index);
   brw_inst_set_dp_msg_control(brw, insn, msg_control);
}
2567
2568void
2569brw_untyped_atomic(struct brw_compile *p,
2570                   struct brw_reg dest,
2571                   struct brw_reg mrf,
2572                   unsigned atomic_op,
2573                   unsigned bind_table_index,
2574                   unsigned msg_length,
2575                   unsigned response_length) {
2576   const struct brw_context *brw = p->brw;
2577   struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2578
2579   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2580   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2581   brw_set_src1(p, insn, brw_imm_d(0));
2582   brw_set_dp_untyped_atomic_message(
2583      p, insn, atomic_op, bind_table_index, msg_length, response_length,
2584      brw_inst_access_mode(brw, insn) == BRW_ALIGN_1);
2585}
2586
2587static void
2588brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
2589                                        struct brw_instruction *insn,
2590                                        unsigned bind_table_index,
2591                                        unsigned msg_length,
2592                                        unsigned response_length,
2593                                        bool header_present)
2594{
2595   const struct brw_context *brw = p->brw;
2596   const unsigned dispatch_width =
2597      (brw_inst_exec_size(brw, insn) == BRW_EXECUTE_16 ? 16 : 8);
2598   const unsigned num_channels = response_length / (dispatch_width / 8);
2599
2600   if (brw->is_haswell) {
2601      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2602                                 msg_length, response_length,
2603                                 header_present, false);
2604
2605      brw_inst_set_dp_msg_type(brw, insn,
2606                               HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ);
2607   } else {
2608      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2609                                 msg_length, response_length,
2610                                 header_present, false);
2611
2612      brw_inst_set_dp_msg_type(brw, insn,
2613                               GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ);
2614   }
2615
2616   /* Set mask of 32-bit channels to drop. */
2617   unsigned msg_control = (0xf & (0xf << num_channels));
2618
2619   if (brw_inst_access_mode(brw, insn) == BRW_ALIGN_1) {
2620      if (dispatch_width == 16)
2621         msg_control |= 1 << 4; /* SIMD16 mode */
2622      else
2623         msg_control |= 2 << 4; /* SIMD8 mode */
2624   }
2625
2626   brw_inst_set_binding_table_index(brw, insn, bind_table_index);
2627   brw_inst_set_dp_msg_control(brw, insn, msg_control);
2628}
2629
2630void
2631brw_untyped_surface_read(struct brw_compile *p,
2632                         struct brw_reg dest,
2633                         struct brw_reg mrf,
2634                         unsigned bind_table_index,
2635                         unsigned msg_length,
2636                         unsigned response_length)
2637{
2638   const struct brw_context *brw = p->brw;
2639   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2640
2641   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2642   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2643   brw_set_dp_untyped_surface_read_message(
2644      p, insn, bind_table_index, msg_length, response_length,
2645      brw_inst_access_mode(brw, insn) == BRW_ALIGN_1);
2646}
2647
2648/**
2649 * This instruction is generated as a single-channel align1 instruction by
2650 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2651 *
2652 * We can't use the typed atomic op in the FS because that has the execution
2653 * mask ANDed with the pixel mask, but we just want to write the one dword for
2654 * all the pixels.
2655 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
2657 * one u32.  So we use the same untyped atomic write message as the pixel
2658 * shader.
2659 *
2660 * The untyped atomic operation requires a BUFFER surface type with RAW
2661 * format, and is only accessible through the legacy DATA_CACHE dataport
2662 * messages.
2663 */
2664void brw_shader_time_add(struct brw_compile *p,
2665                         struct brw_reg payload,
2666                         uint32_t surf_index)
2667{
2668   assert(p->brw->gen >= 7);
2669
2670   brw_push_insn_state(p);
2671   brw_set_default_access_mode(p, BRW_ALIGN_1);
2672   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2673   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2674   brw_pop_insn_state(p);
2675
2676   /* We use brw_vec1_reg and unmasked because we want to increment the given
2677    * offset only once.
2678    */
2679   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2680                                      BRW_ARF_NULL, 0));
2681   brw_set_src0(p, send, brw_vec1_reg(payload.file,
2682                                      payload.nr, 0));
2683   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
2684                                     2 /* message length */,
2685                                     0 /* response length */,
2686                                     false /* header present */);
2687}
2688