brw_eu_emit.c revision a7d319c00be425be219a101b5b4d48f1cbe4ec01
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "util/ralloc.h"
38
39/**
40 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
41 * registers, implicitly moving the operand to a message register.
42 *
43 * On Sandybridge, this is no longer the case.  This function performs the
44 * explicit move; it should be called before emitting a SEND instruction.
45 */
46void
47gen6_resolve_implied_move(struct brw_codegen *p,
48			  struct brw_reg *src,
49			  unsigned msg_reg_nr)
50{
51   const struct brw_device_info *devinfo = p->devinfo;
52   if (devinfo->gen < 6)
53      return;
54
55   if (src->file == BRW_MESSAGE_REGISTER_FILE)
56      return;
57
58   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
59      brw_push_insn_state(p);
60      brw_set_default_exec_size(p, BRW_EXECUTE_8);
61      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64	      retype(*src, BRW_REGISTER_TYPE_UD));
65      brw_pop_insn_state(p);
66   }
67   *src = brw_message_reg(msg_reg_nr);
68}
69
70static void
71gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72{
73   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74    * "The send with EOT should use register space R112-R127 for <src>. This is
75    *  to enable loading of a new thread into the same slot while the message
76    *  with EOT for current thread is pending dispatch."
77    *
78    * Since we're pretending to have 16 MRFs anyway, we may as well use the
79    * registers required for messages with EOT.
80    */
81   const struct brw_device_info *devinfo = p->devinfo;
82   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83      reg->file = BRW_GENERAL_REGISTER_FILE;
84      reg->nr += GEN7_MRF_HACK_START;
85   }
86}
87
/**
 * Convert a brw_reg_type enumeration value into the hardware representation.
 *
 * The hardware encoding may depend on whether the value is an immediate.
 *
 * \param devinfo  device info, used only to assert generation support
 * \param type     abstract register type to translate
 * \param file     register file; immediates use a separate encoding space
 * \return the hardware type encoding for \p type
 */
unsigned
brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
                        enum brw_reg_type type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      /* Immediate encodings.  -1 marks types (the byte types) that have no
       * immediate form; the assert below catches attempts to use them.
       */
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      /* DF/HF/Q/UQ immediates only exist on Gen8+. */
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers.  The vector-immediate types (UV/VF/V)
       * have no register-file encoding, marked -1 here.
       */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      /* DF registers require Gen7+; HF registers require Gen8+. */
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}
143
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Handles direct and indirect (address-register relative) destinations in
 * both Align1 and Align16 access modes, and shrinks the instruction's
 * execution size to match unusually narrow destinations.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Range-check the register number against its file. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* Gen7+ has no MRFs; rewrite MRF references to high GRFs. */
   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         /* A scalar (stride-0) destination is encoded with stride 1. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         /* Align16 addresses whole 16-byte units. */
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
         /* A scalar (stride-0) destination is encoded with stride 1. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
         /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    *
    * In platforms that support fp64 we can emit instructions with a width of
    * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
    * cases we need to make sure that these instructions have their exec sizes
    * set properly when they are emitted and we can't rely on this code to fix
    * it.
    */
   bool fix_exec_size;
   if (devinfo->gen >= 6)
      fix_exec_size = dest.width < BRW_EXECUTE_4;
   else
      fix_exec_size = dest.width < BRW_EXECUTE_8;

   if (fix_exec_size)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}
221
222extern int reg_type_size[];
223
224static void
225validate_reg(const struct brw_device_info *devinfo,
226             brw_inst *inst, struct brw_reg reg)
227{
228   const int hstride_for_reg[] = {0, 1, 2, 4};
229   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
230   const int width_for_reg[] = {1, 2, 4, 8, 16};
231   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
232   int width, hstride, vstride, execsize;
233
234   if (reg.file == BRW_IMMEDIATE_VALUE) {
235      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
236       * mean the destination has to be 128-bit aligned and the
237       * destination horiz stride has to be a word.
238       */
239      if (reg.type == BRW_REGISTER_TYPE_V) {
240         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
241                reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
242      }
243
244      return;
245   }
246
247   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
248       reg.file == BRW_ARF_NULL)
249      return;
250
251   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
252    *
253    *    "Swizzling is not allowed when an accumulator is used as an implicit
254    *    source or an explicit source in an instruction."
255    */
256   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
257       reg.nr == BRW_ARF_ACCUMULATOR)
258      assert(reg.swizzle == BRW_SWIZZLE_XYZW);
259
260   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
261   hstride = hstride_for_reg[reg.hstride];
262
263   if (reg.vstride == 0xf) {
264      vstride = -1;
265   } else {
266      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
267      vstride = vstride_for_reg[reg.vstride];
268   }
269
270   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
271   width = width_for_reg[reg.width];
272
273   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
274          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
275   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
276
277   /* Restrictions from 3.3.10: Register Region Restrictions. */
278   /* 3. */
279   assert(execsize >= width);
280
281   /* 4. */
282   if (execsize == width && hstride != 0) {
283      assert(vstride == -1 || vstride == width * hstride);
284   }
285
286   /* 5. */
287   if (execsize == width && hstride == 0) {
288      /* no restriction on vstride. */
289   }
290
291   /* 6. */
292   if (width == 1) {
293      assert(hstride == 0);
294   }
295
296   /* 7. */
297   if (execsize == 1 && width == 1) {
298      assert(hstride == 0);
299      assert(vstride == 0);
300   }
301
302   /* 8. */
303   if (vstride == 0 && hstride == 0) {
304      assert(width == 1);
305   }
306
307   /* 10. Check destination issues. */
308}
309
/* Report whether a 32-bit immediate survives instruction compaction.
 *
 * Compacted instructions store the low 12 bits of an immediate verbatim
 * and replicate bit 12 through the top 20 bits, so a value is
 * representable iff its upper 20 bits are all-zero or all-one.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned high_bits = imm & ~0xfffu;

   return high_bits == 0u || high_bits == 0xfffff000u;
}
319
/**
 * Encode \p reg as the first source operand of \p inst.
 *
 * Handles immediates (including the type tricks below that keep the
 * instruction compactable), direct registers, and indirect registers, in
 * both Align1 and Align16 access modes.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Range-check the register number against its file. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* Gen7+ has no MRFs; rewrite MRF references to high GRFs. */
   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* DF immediates occupy the 64-bit immediate field; everything else is
       * written as a 32-bit UD payload.
       */
      if (reg.type == BRW_REGISTER_TYPE_DF)
         brw_inst_set_imm_df(devinfo, inst, reg.df);
      else
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate that src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule. E.g., from the IVB/HSW table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these cause warnings from the simulator when used,
       * compacted or otherwise. In fact, all compaction mappings that have an
       * immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction. We'll assume it was
       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
       *
       * Don't do any of this for 64-bit immediates, since the src1 fields
       * overlap with the immediate and setting them would overwrite the
       * immediate we set.
       */
      if (type_sz(reg.type) < 8) {
         brw_inst_set_src1_reg_file(devinfo, inst,
                                    BRW_ARCHITECTURE_REGISTER_FILE);
         if (devinfo->gen < 6) {
            brw_inst_set_src1_reg_type(devinfo, inst,
                                       brw_inst_src0_reg_type(devinfo, inst));
         } else {
            brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         }
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values. Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
             brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 addresses whole 16-byte units. */
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         /* The address-immediate field differs between align1 and align16. */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
         } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A scalar source in a scalar instruction gets the canonical
          * <0;1,0> region regardless of what the caller passed in.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}
470
471
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * src1 is more restricted than src0: no MRFs, no accumulator, no indirect
 * addressing, and only 32-bit immediates.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   /* Gen7+ has no MRFs; after conversion no MRF may remain. */
   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* two-argument instructions can only use 32-bit immediates */
      assert(type_sz(reg.type) < 8);
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         /* Align16 addresses whole 16-byte units. */
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A scalar source in a scalar instruction gets the canonical
          * <0;1,0> region regardless of what the caller passed in.
          */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}
552
553/**
554 * Set the Message Descriptor and Extended Message Descriptor fields
555 * for SEND messages.
556 *
557 * \note This zeroes out the Function Control bits, so it must be called
558 *       \b before filling out any message-specific data.  Callers can
559 *       choose not to fill in irrelevant bits; they will be zero.
560 */
561void
562brw_set_message_descriptor(struct brw_codegen *p,
563			   brw_inst *inst,
564			   enum brw_message_target sfid,
565			   unsigned msg_length,
566			   unsigned response_length,
567			   bool header_present,
568			   bool end_of_thread)
569{
570   const struct brw_device_info *devinfo = p->devinfo;
571
572   brw_set_src1(p, inst, brw_imm_d(0));
573
574   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
575    * itself; instead, it will be a MOV/OR into the address register.
576    *
577    * In this case, we avoid setting the extended message descriptor bits,
578    * since they go on the later SEND/SENDC instead and if set here would
579    * instead clobber the conditionalmod bits.
580    */
581   unsigned opcode = brw_inst_opcode(devinfo, inst);
582   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
583      brw_inst_set_sfid(devinfo, inst, sfid);
584   }
585
586   brw_inst_set_mlen(devinfo, inst, msg_length);
587   brw_inst_set_rlen(devinfo, inst, response_length);
588   brw_inst_set_eot(devinfo, inst, end_of_thread);
589
590   if (devinfo->gen >= 5) {
591      brw_inst_set_header_present(devinfo, inst, header_present);
592   }
593}
594
595static void brw_set_math_message( struct brw_codegen *p,
596				  brw_inst *inst,
597				  unsigned function,
598				  unsigned integer_type,
599				  bool low_precision,
600				  unsigned dataType )
601{
602   const struct brw_device_info *devinfo = p->devinfo;
603   unsigned msg_length;
604   unsigned response_length;
605
606   /* Infer message length from the function */
607   switch (function) {
608   case BRW_MATH_FUNCTION_POW:
609   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
610   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
611   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
612      msg_length = 2;
613      break;
614   default:
615      msg_length = 1;
616      break;
617   }
618
619   /* Infer response length from the function */
620   switch (function) {
621   case BRW_MATH_FUNCTION_SINCOS:
622   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
623      response_length = 2;
624      break;
625   default:
626      response_length = 1;
627      break;
628   }
629
630
631   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
632			      msg_length, response_length, false, false);
633   brw_inst_set_math_msg_function(devinfo, inst, function);
634   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
635   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
636   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
637   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
638   brw_inst_set_saturate(devinfo, inst, 0);
639}
640
641
/* Fill in the descriptor for an FF_SYNC URB message (fixed-function
 * synchronization used by pre-Gen7 geometry stages).  The payload is always
 * a single header register.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
				    brw_inst *insn,
				    bool allocate,
				    unsigned response_length,
				    bool end_of_thread)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* mlen 1 (header only), header always present. */
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
660
661static void brw_set_urb_message( struct brw_codegen *p,
662				 brw_inst *insn,
663                                 enum brw_urb_write_flags flags,
664				 unsigned msg_length,
665				 unsigned response_length,
666				 unsigned offset,
667				 unsigned swizzle_control )
668{
669   const struct brw_device_info *devinfo = p->devinfo;
670
671   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
672   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
673   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
674
675   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
676			      msg_length, response_length, true,
677                              flags & BRW_URB_WRITE_EOT);
678
679   if (flags & BRW_URB_WRITE_OWORD) {
680      assert(msg_length == 2); /* header + one OWORD of data */
681      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
682   } else {
683      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
684   }
685
686   brw_inst_set_urb_global_offset(devinfo, insn, offset);
687   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
688
689   if (devinfo->gen < 8) {
690      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
691   }
692
693   if (devinfo->gen < 7) {
694      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
695      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
696   } else {
697      brw_inst_set_urb_per_slot_offset(devinfo, insn,
698         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
699   }
700}
701
702void
703brw_set_dp_write_message(struct brw_codegen *p,
704			 brw_inst *insn,
705			 unsigned binding_table_index,
706			 unsigned msg_control,
707			 unsigned msg_type,
708			 unsigned msg_length,
709			 bool header_present,
710			 unsigned last_render_target,
711			 unsigned response_length,
712			 unsigned end_of_thread,
713			 unsigned send_commit_msg)
714{
715   const struct brw_device_info *devinfo = p->devinfo;
716   unsigned sfid;
717
718   if (devinfo->gen >= 7) {
719      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
720      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
721	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
722      else
723	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
724   } else if (devinfo->gen == 6) {
725      /* Use the render cache for all write messages. */
726      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
727   } else {
728      sfid = BRW_SFID_DATAPORT_WRITE;
729   }
730
731   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
732			      header_present, end_of_thread);
733
734   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
735   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
736   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
737   brw_inst_set_rt_last(devinfo, insn, last_render_target);
738   if (devinfo->gen < 7) {
739      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
740   }
741}
742
743void
744brw_set_dp_read_message(struct brw_codegen *p,
745			brw_inst *insn,
746			unsigned binding_table_index,
747			unsigned msg_control,
748			unsigned msg_type,
749			unsigned target_cache,
750			unsigned msg_length,
751                        bool header_present,
752			unsigned response_length)
753{
754   const struct brw_device_info *devinfo = p->devinfo;
755   unsigned sfid;
756
757   if (devinfo->gen >= 7) {
758      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
759   } else if (devinfo->gen == 6) {
760      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
761	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
762      else
763	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
764   } else {
765      sfid = BRW_SFID_DATAPORT_READ;
766   }
767
768   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
769			      header_present, false);
770
771   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
772   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
773   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
774   if (devinfo->gen < 6)
775      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
776}
777
/**
 * Fill out the SEND message descriptor for a sampler shared-function
 * message: SFID, message/response lengths, header flag, binding table
 * entry, sampler index, message type, and the gen-dependent
 * SIMD-mode/return-format field.
 */
void
brw_set_sampler_message(struct brw_codegen *p,
                        brw_inst *inst,
                        unsigned binding_table_index,
                        unsigned sampler,
                        unsigned msg_type,
                        unsigned response_length,
                        unsigned msg_length,
                        unsigned header_present,
                        unsigned simd_mode,
                        unsigned return_format)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
   brw_inst_set_sampler(devinfo, inst, sampler);
   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
   if (devinfo->gen >= 5) {
      /* Gen5+ descriptors carry a SIMD mode field here. */
      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
      /* Original Gen4 (not G4X) encodes a return format instead. */
      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
   }
}
804
/**
 * Configure \p inst as a Gen7+ data-cache scratch block read/write
 * message.
 *
 * \p num_regs must be 1, 2, 4, or (Gen8+ only) 8.  The descriptor's
 * block-size field encoding changed on Gen8: log2(num_regs) there,
 * num_regs - 1 on Gen7.
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   /* Gen8+: log2 encoding; Gen7: count-minus-one encoding. */
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
832
#define next_insn brw_next_insn
/**
 * Reserve the next slot in the instruction store and initialize it from
 * the current default instruction state (p->current) with \p opcode.
 *
 * NOTE: growing the store may reallocate p->store, invalidating any
 * brw_inst pointers obtained earlier (callers that hold such pointers
 * store indices instead — see push_if_stack/push_loop_stack).
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Double the store when it is full. */
   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Advance by one 16-byte instruction. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}
852
853static brw_inst *
854brw_alu1(struct brw_codegen *p, unsigned opcode,
855         struct brw_reg dest, struct brw_reg src)
856{
857   brw_inst *insn = next_insn(p, opcode);
858   brw_set_dest(p, insn, dest);
859   brw_set_src0(p, insn, src);
860   return insn;
861}
862
863static brw_inst *
864brw_alu2(struct brw_codegen *p, unsigned opcode,
865         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
866{
867   /* 64-bit immediates are only supported on 1-src instructions */
868   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
869   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
870
871   brw_inst *insn = next_insn(p, opcode);
872   brw_set_dest(p, insn, dest);
873   brw_set_src0(p, insn, src0);
874   brw_set_src1(p, insn, src1);
875   return insn;
876}
877
878static int
879get_3src_subreg_nr(struct brw_reg reg)
880{
881   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
882    * use 32-bit units (components 0..7).  Since they only support F/D/UD
883    * types, this doesn't lose any flexibility, but uses fewer bits.
884    */
885   return reg.subnr / 4;
886}
887
/**
 * Emit a three-source instruction (MAD, LRP, BFE, BFI2).
 *
 * 3-src instructions use a distinct encoding: Align16 only, GRF/MRF
 * destination with direct addressing, GRF-only sources, and a single
 * shared source type derived from dest.type.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F  ||
          dest.type == BRW_REGISTER_TYPE_DF ||
          dest.type == BRW_REGISTER_TYPE_D  ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      /* Only Gen6 has a register-file bit for the 3-src destination. */
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   /* The 3-src destination subregister is encoded in 16-byte units. */
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   /* rep_ctrl replicates a scalar source (vstride 0) to all channels. */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_DF:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}
979
980
981/***********************************************************************
982 * Convenience routines.
983 */
/* Define a public emitter brw_<OP>() for a 1-source opcode. */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}
991
/* Define a public emitter brw_<OP>() for a 2-source opcode. */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
1000
/* Define a public emitter brw_<OP>() for a 3-source opcode. */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
1010
/* Like ALU3, but additionally asserts that all four operands are F (or,
 * for double precision, all DF), since brw_alu3() derives the shared
 * source type from dest.type and MAD/LRP require matching float types.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1031
1032/* Rounding operations (other than RNDD) require two instructions - the first
1033 * stores a rounded value (possibly the wrong way) in the dest register, but
1034 * also sets a per-channel "increment bit" in the flag register.  A predicated
1035 * add of 1.0 fixes dest to contain the desired result.
1036 *
1037 * Sandybridge and later appear to round correctly without an ADD.
1038 */
/* Emits OP and, on Gen4/5 only, the predicated fix-up ADD described in
 * the comment above.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_codegen *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   const struct brw_device_info *devinfo = p->devinfo;					      \
   brw_inst *rnd, *add;							      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (devinfo->gen < 6) {							      \
      /* turn on round-increments */					      \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
   }									      \
}
1057
1058
1059ALU1(MOV)
1060ALU2(SEL)
1061ALU1(NOT)
1062ALU2(AND)
1063ALU2(OR)
1064ALU2(XOR)
1065ALU2(SHR)
1066ALU2(SHL)
1067ALU2(ASR)
1068ALU1(FRC)
1069ALU1(RNDD)
1070ALU2(MAC)
1071ALU2(MACH)
1072ALU1(LZD)
1073ALU2(DP4)
1074ALU2(DPH)
1075ALU2(DP3)
1076ALU2(DP2)
1077ALU3F(MAD)
1078ALU3F(LRP)
1079ALU1(BFREV)
1080ALU3(BFE)
1081ALU2(BFI1)
1082ALU3(BFI2)
1083ALU1(FBH)
1084ALU1(FBL)
1085ALU1(CBIT)
1086ALU2(ADDC)
1087ALU2(SUBB)
1088
1089ROUND(RNDZ)
1090ROUND(RNDE)
1091
1092
1093brw_inst *
1094brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1095        struct brw_reg src0, struct brw_reg src1)
1096{
1097   /* 6.2.2: add */
1098   if (src0.type == BRW_REGISTER_TYPE_F ||
1099       (src0.file == BRW_IMMEDIATE_VALUE &&
1100	src0.type == BRW_REGISTER_TYPE_VF)) {
1101      assert(src1.type != BRW_REGISTER_TYPE_UD);
1102      assert(src1.type != BRW_REGISTER_TYPE_D);
1103   }
1104
1105   if (src1.type == BRW_REGISTER_TYPE_F ||
1106       (src1.file == BRW_IMMEDIATE_VALUE &&
1107	src1.type == BRW_REGISTER_TYPE_VF)) {
1108      assert(src0.type != BRW_REGISTER_TYPE_UD);
1109      assert(src0.type != BRW_REGISTER_TYPE_D);
1110   }
1111
1112   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1113}
1114
/**
 * Emit an AVG (integer average) instruction.  This emitter requires the
 * destination and both sources to share one type, and accepts only the
 * integer types listed in the switch below.
 */
brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}
1135
/**
 * Emit a MUL instruction, asserting the documented operand restrictions
 * (PRM "mul", 6.32.38): no float destination with D/UD sources, no
 * mixing of float and D/UD sources, and neither source may be the
 * accumulator.
 */
brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   /* A VF immediate counts as a float source for these checks. */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
	src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
	src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   /* MUL may not read the accumulator as a source. */
   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1169
1170brw_inst *
1171brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1172         struct brw_reg src0, struct brw_reg src1)
1173{
1174   src0.vstride = BRW_VERTICAL_STRIDE_0;
1175   src0.width = BRW_WIDTH_1;
1176   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1177   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1178}
1179
1180brw_inst *
1181brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1182        struct brw_reg src0, struct brw_reg src1)
1183{
1184   src0.vstride = BRW_VERTICAL_STRIDE_0;
1185   src0.width = BRW_WIDTH_1;
1186   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1187   src1.vstride = BRW_VERTICAL_STRIDE_8;
1188   src1.width = BRW_WIDTH_8;
1189   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1190   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1191}
1192
/**
 * Emit a float-to-half conversion of \p src into \p dst.
 *
 * Gen8+ uses a converting MOV to an HF destination; Gen7 uses the
 * dedicated F32TO16 instruction.  When a UD destination's high 16 bits
 * would not be zeroed by the hardware, a second MOV of 0 to the odd W
 * halves is emitted, with no_dd_clear/no_dd_check set on the pair.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Rewrite the UD destination as a strided W view so the low W of
       * each pair receives the converted value.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Zero the other W half of each pair. */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1239
/**
 * Emit a half-to-float conversion of \p src into \p dst.
 *
 * Gen8+ uses a converting MOV from an HF source; Gen7 uses the
 * dedicated F16TO32 instruction.  In Align1 mode a UD source is
 * reinterpreted as a strided view of its even W halves (see the PRM
 * quote below).
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1270
1271
1272void brw_NOP(struct brw_codegen *p)
1273{
1274   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1275   brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_1);
1276   brw_set_dest(p, insn, retype(brw_vec1_grf(0,0), BRW_REGISTER_TYPE_UD));
1277   brw_set_src0(p, insn, retype(brw_vec1_grf(0,0), BRW_REGISTER_TYPE_UD));
1278   brw_set_src1(p, insn, brw_imm_ud(0x0));
1279}
1280
1281
1282
1283
1284
1285/***********************************************************************
1286 * Comparisons, if/else/endif
1287 */
1288
/**
 * Emit a JMPI (jump indexed) instruction, which computes a new IP from
 * the current IP and \p index.  Compression is off and the execution
 * mask is disabled, so the jump is controlled only by the caller's
 * \p predicate_control.
 */
brw_inst *
brw_JMPI(struct brw_codegen *p, unsigned index,
         unsigned predicate_control)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}
1304
/* Remember an IF/ELSE instruction, as an index into p->store rather than
 * a pointer (the store may be reallocated by next_insn()), so the
 * matching ENDIF can find and patch it later.
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow after writing: the invariant is that there is always room for
    * the next push.
    */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1317
1318static brw_inst *
1319pop_if_stack(struct brw_codegen *p)
1320{
1321   p->if_stack_depth--;
1322   return &p->store[p->if_stack[p->if_stack_depth]];
1323}
1324
/* Record the DO instruction of a newly entered loop (as a store index,
 * since p->store may be reallocated) and reset the IF-nesting counter
 * for the new loop level.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Check with +1 headroom because if_depth_in_loop is written at the
    * post-increment depth below.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
			       p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
				     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1340
1341static brw_inst *
1342get_inner_do_insn(struct brw_codegen *p)
1343{
1344   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1345}
1346
1347/* EU takes the value from the flag register and pushes it onto some
1348 * sort of a stack (presumably merging with any flag value already on
1349 * the stack).  Within an if block, the flags at the top of the stack
1350 * control execution on each channel of the unit, eg. on each of the
1351 * 16 pixel values in our wm programs.
1352 *
1353 * When the matching 'else' instruction is reached (presumably by
1354 * countdown of the instruction count patched in by our ELSE/ENDIF
1355 * functions), the relevant flags are inverted.
1356 *
1357 * When the matching 'endif' instruction is reached, the flags are
1358 * popped off.  If the stack is now empty, normal execution resumes.
1359 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction.  The operand encoding of
    * IF changed on nearly every generation; the jump offsets (gen4 jump
    * count, gen6 jump count, or gen7+ JIP/UIP) are left zero here and
    * patched by patch_IF_ELSE() when the matching ENDIF is emitted.
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control causes an implied thread switch unless we're
    * in single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember this IF so ELSE/ENDIF can find and patch it. */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1403
1404/* This function is only used for gen6-style IF instructions with an
1405 * embedded comparison (conditional modifier).  It is not used on gen7.
1406 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The gen6 jump count is left zero and patched later by
    * patch_IF_ELSE() when the matching ENDIF is emitted.
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn,
                          brw_inst_exec_size(devinfo, p->current));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The comparison is embedded in the IF itself (conditional modifier
    * on src0/src1); the IF must not additionally be compressed or
    * predicated.
    */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1430
1431/**
1432 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1433 */
/**
 * In single-program-flow (SPF) mode, convert an already-emitted IF (and
 * optional ELSE) into predicated ADDs on IP.  Offsets are in bytes;
 * instructions are 16 bytes each (see brw_next_insn).
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* IF jumps just past the ELSE; ELSE jumps to the would-be ENDIF. */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1471
1472/**
1473 * Patch IF and ELSE instructions with appropriate jump targets.
1474 */
/**
 * Patch IF and ELSE instructions with appropriate jump targets once the
 * ENDIF is known.  Jump distances are computed in instructions and
 * scaled by brw_jump_scale(), which converts them into the generation's
 * jump-offset units.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gen7+: with no ELSE, both JIP and UIP point to the ENDIF. */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1562
/**
 * Emit an ELSE instruction for the innermost open IF.  Like brw_IF, the
 * operand encoding is per-generation, and the jump offsets are left zero
 * here to be patched by patch_IF_ELSE() when the ENDIF is emitted.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control causes an implied thread switch unless we're
    * in single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Push on top of the IF so brw_ENDIF can pop ELSE then IF. */
   push_if_stack(p, insn);
}
1600
/**
 * Close the innermost open IF/ELSE block with an ENDIF.
 *
 * Pops the matching IF (and optional ELSE) off p->if_stack and patches
 * their branch offsets via patch_IF_ELSE().  On Gen4/Gen5 in single
 * program flow mode, the whole IF/ELSE construct is instead rewritten
 * into conditional ADDs on IP and no ENDIF is emitted at all.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation ENDIF operand encoding. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6 keeps the jump count in an immediate destination. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      /* Gen8+: only src0 is set; the destination keeps its default. */
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires an explicit thread switch. */
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   /* Back-patch the IF/ELSE branch targets now that ENDIF's location is
    * known.
    */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1681
1682brw_inst *
1683brw_BREAK(struct brw_codegen *p)
1684{
1685   const struct brw_device_info *devinfo = p->devinfo;
1686   brw_inst *insn;
1687
1688   insn = next_insn(p, BRW_OPCODE_BREAK);
1689   if (devinfo->gen >= 8) {
1690      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1691      brw_set_src0(p, insn, brw_imm_d(0x0));
1692   } else if (devinfo->gen >= 6) {
1693      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1694      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1695      brw_set_src1(p, insn, brw_imm_d(0x0));
1696   } else {
1697      brw_set_dest(p, insn, brw_ip_reg());
1698      brw_set_src0(p, insn, brw_ip_reg());
1699      brw_set_src1(p, insn, brw_imm_d(0x0));
1700      brw_inst_set_gen4_pop_count(devinfo, insn,
1701                                  p->if_depth_in_loop[p->loop_stack_depth]);
1702   }
1703   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1704   brw_inst_set_exec_size(devinfo, insn,
1705                          brw_inst_exec_size(devinfo, p->current));
1706
1707   return insn;
1708}
1709
1710brw_inst *
1711brw_CONT(struct brw_codegen *p)
1712{
1713   const struct brw_device_info *devinfo = p->devinfo;
1714   brw_inst *insn;
1715
1716   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1717   brw_set_dest(p, insn, brw_ip_reg());
1718   if (devinfo->gen >= 8) {
1719      brw_set_src0(p, insn, brw_imm_d(0x0));
1720   } else {
1721      brw_set_src0(p, insn, brw_ip_reg());
1722      brw_set_src1(p, insn, brw_imm_d(0x0));
1723   }
1724
1725   if (devinfo->gen < 6) {
1726      brw_inst_set_gen4_pop_count(devinfo, insn,
1727                                  p->if_depth_in_loop[p->loop_stack_depth]);
1728   }
1729   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1730   brw_inst_set_exec_size(devinfo, insn,
1731                          brw_inst_exec_size(devinfo, p->current));
1732   return insn;
1733}
1734
1735brw_inst *
1736gen6_HALT(struct brw_codegen *p)
1737{
1738   const struct brw_device_info *devinfo = p->devinfo;
1739   brw_inst *insn;
1740
1741   insn = next_insn(p, BRW_OPCODE_HALT);
1742   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1743   if (devinfo->gen >= 8) {
1744      brw_set_src0(p, insn, brw_imm_d(0x0));
1745   } else {
1746      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1747      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1748   }
1749
1750   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1751   brw_inst_set_exec_size(devinfo, insn,
1752                          brw_inst_exec_size(devinfo, p->current));
1753   return insn;
1754}
1755
1756/* DO/WHILE loop:
1757 *
1758 * The DO/WHILE is just an unterminated loop -- break or continue are
1759 * used for control within the loop.  We have a few ways they can be
1760 * done.
1761 *
1762 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1763 * jip and no DO instruction.
1764 *
1765 * For non-uniform control flow pre-gen6, there's a DO instruction to
1766 * push the mask, and a WHILE to jump back, and BREAK to get out and
1767 * pop the mask.
1768 *
1769 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1770 * just points back to the first instruction of the loop.
1771 */
1772brw_inst *
1773brw_DO(struct brw_codegen *p, unsigned execute_size)
1774{
1775   const struct brw_device_info *devinfo = p->devinfo;
1776
1777   if (devinfo->gen >= 6 || p->single_program_flow) {
1778      push_loop_stack(p, &p->store[p->nr_insn]);
1779      return &p->store[p->nr_insn];
1780   } else {
1781      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1782
1783      push_loop_stack(p, insn);
1784
1785      /* Override the defaults for this instruction:
1786       */
1787      brw_set_dest(p, insn, brw_null_reg());
1788      brw_set_src0(p, insn, brw_null_reg());
1789      brw_set_src1(p, insn, brw_null_reg());
1790
1791      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1792      brw_inst_set_exec_size(devinfo, insn, execute_size);
1793      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1794
1795      return insn;
1796   }
1797}
1798
1799/**
1800 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1801 * instruction here.
1802 *
1803 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1804 * nesting, since it can always just point to the end of the block/current loop.
1805 */
1806static void
1807brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1808{
1809   const struct brw_device_info *devinfo = p->devinfo;
1810   brw_inst *do_inst = get_inner_do_insn(p);
1811   brw_inst *inst;
1812   unsigned br = brw_jump_scale(devinfo);
1813
1814   assert(devinfo->gen < 6);
1815
1816   for (inst = while_inst - 1; inst != do_inst; inst--) {
1817      /* If the jump count is != 0, that means that this instruction has already
1818       * been patched because it's part of a loop inside of the one we're
1819       * patching.
1820       */
1821      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1822          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1823         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1824      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1825                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1826         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1827      }
1828   }
1829}
1830
/**
 * Close a loop opened by brw_DO().
 *
 * Emits the generation-appropriate WHILE (or, in pre-gen6 single program
 * flow mode, a plain ADD on IP) with a backwards branch to the loop top,
 * then patches any pending BREAK/CONTINUE on Gen4/5 and pops the loop
 * stack.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      /* next_insn() may reallocate p->store, so it must run before
       * get_inner_do_insn() derives a pointer into the store.
       */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         /* Negative JIP: branch back to the loop top. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         /* Gen6 stores the jump count in the destination immediate. */
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn,
                             brw_inst_exec_size(devinfo, p->current));

   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: the loop branch is just an ADD on IP
          * (16 bytes per instruction).
          */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

         /* WHILE inherits the DO's execution size; +1 lands the branch on
          * the instruction after the DO.
          */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

         /* Fix up BREAK/CONTINUE targets now that the loop end is known. */
	 brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1893
1894/* FORWARD JUMPS:
1895 */
1896void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1897{
1898   const struct brw_device_info *devinfo = p->devinfo;
1899   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1900   unsigned jmpi = 1;
1901
1902   if (devinfo->gen >= 5)
1903      jmpi = 2;
1904
1905   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1906   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1907
1908   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1909                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
1910}
1911
1912/* To integrate with the above, it makes sense that the comparison
1913 * instruction should populate the flag register.  It might be simpler
1914 * just to use the flag reg for most WM tasks?
1915 */
1916void brw_CMP(struct brw_codegen *p,
1917	     struct brw_reg dest,
1918	     unsigned conditional,
1919	     struct brw_reg src0,
1920	     struct brw_reg src1)
1921{
1922   const struct brw_device_info *devinfo = p->devinfo;
1923   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1924
1925   brw_inst_set_cond_modifier(devinfo, insn, conditional);
1926   brw_set_dest(p, insn, dest);
1927   brw_set_src0(p, insn, src0);
1928   brw_set_src1(p, insn, src1);
1929
1930   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1931    * page says:
1932    *    "Any CMP instruction with a null destination must use a {switch}."
1933    *
1934    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1935    * mentioned on their work-arounds pages.
1936    */
1937   if (devinfo->gen == 7) {
1938      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1939          dest.nr == BRW_ARF_NULL) {
1940         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1941      }
1942   }
1943}
1944
1945/***********************************************************************
1946 * Helpers for the various SEND message types:
1947 */
1948
1949/** Extended math function, float[8].
1950 */
1951void gen4_math(struct brw_codegen *p,
1952	       struct brw_reg dest,
1953	       unsigned function,
1954	       unsigned msg_reg_nr,
1955	       struct brw_reg src,
1956	       unsigned precision )
1957{
1958   const struct brw_device_info *devinfo = p->devinfo;
1959   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1960   unsigned data_type;
1961   if (has_scalar_region(src)) {
1962      data_type = BRW_MATH_DATA_SCALAR;
1963   } else {
1964      data_type = BRW_MATH_DATA_VECTOR;
1965   }
1966
1967   assert(devinfo->gen < 6);
1968
1969   /* Example code doesn't set predicate_control for send
1970    * instructions.
1971    */
1972   brw_inst_set_pred_control(devinfo, insn, 0);
1973   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1974
1975   brw_set_dest(p, insn, dest);
1976   brw_set_src0(p, insn, src);
1977   brw_set_math_message(p,
1978                        insn,
1979                        function,
1980                        src.type == BRW_REGISTER_TYPE_D,
1981                        precision,
1982                        data_type);
1983}
1984
/**
 * Gen6+ extended math: MATH is a native two-source instruction rather
 * than a SEND to the shared math unit.
 *
 * The assertions encode the per-generation operand restrictions visible
 * here: GRF (or Gen7+ MRF) destination, packed Gen6 operands without
 * source modifiers, integer sources for the INT_DIV variants, a
 * GRF/immediate src1 for POW, and a null src1 for everything else.
 */
void gen6_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   /* Destination must be a GRF; Gen7+ also allows an MRF. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   /* src0 must be a GRF; Gen8+ also allows an immediate. */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   /* Gen6 additionally requires packed sources. */
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division takes integer sources. */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      /* All other functions operate on floats. */
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
      if (function == BRW_MATH_FUNCTION_POW) {
         assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
                (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
      } else {
         /* Single-source functions must pass a null src1. */
         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
                src1.nr == BRW_ARF_NULL);
      }
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2040
2041/**
2042 * Return the right surface index to access the thread scratch space using
2043 * stateless dataport messages.
2044 */
2045unsigned
2046brw_scratch_surface_idx(const struct brw_codegen *p)
2047{
2048   /* The scratch space is thread-local so IA coherency is unnecessary. */
2049   if (p->devinfo->gen >= 8)
2050      return GEN8_BTI_STATELESS_NON_COHERENT;
2051   else
2052      return BRW_BTI_STATELESS;
2053}
2054
2055/**
2056 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2057 * using a constant offset per channel.
2058 *
2059 * The offset must be aligned to oword size (16 bytes).  Used for
2060 * register spilling.
2061 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   uint32_t msg_type;

   /* Gen6+ takes the offset in owords (16 bytes) rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* One header register plus the data payload. */
   const unsigned mlen = 1 + num_regs;
   /* Only 1, 2 or 4 registers (2/4/8 owords) may be written at once. */
   const unsigned msg_control =
      (num_regs == 1 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :
       num_regs == 2 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :
       num_regs == 4 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : 0);
   assert(msg_control);

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   /* Emit the SEND that performs the actual scratch write. */
   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
	 src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
                               brw_scratch_surface_idx(p),
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
2164
2165
2166/**
2167 * Read a block of owords (half a GRF each) from the scratch buffer
2168 * using a constant index per channel.
2169 *
2170 * Offset must be aligned to oword size (16 bytes).  Used for register
2171 * spilling.
2172 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Gen6+ takes the offset in owords (16 bytes) rather than bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   /* Only 1, 2 or 4 registers (2/4/8 owords) may be read at once. */
   const unsigned msg_control =
      (num_regs == 1 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :
       num_regs == 2 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :
       num_regs == 4 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS : 0);
   assert(msg_control);

   /* Build the message header: a copy of g0 with the oword offset placed
    * in element 2, leaving g0 itself untouched.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   /* Emit the SEND that performs the actual scratch read. */
   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
			      insn,
                              brw_scratch_surface_idx(p),
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
                              true, /* header_present */
			      rlen);
   }
}
2245
2246void
2247gen7_block_read_scratch(struct brw_codegen *p,
2248                        struct brw_reg dest,
2249                        int num_regs,
2250                        unsigned offset)
2251{
2252   const struct brw_device_info *devinfo = p->devinfo;
2253   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2254   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2255
2256   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2257
2258   /* The HW requires that the header is present; this is to get the g0.5
2259    * scratch offset.
2260    */
2261   brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2262
2263   /* According to the docs, offset is "A 12-bit HWord offset into the memory
2264    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2265    * is 32 bytes, which happens to be the size of a register.
2266    */
2267   offset /= REG_SIZE;
2268   assert(offset < (1 << 12));
2269
2270   gen7_set_dp_scratch_message(p, insn,
2271                               false, /* scratch read */
2272                               false, /* OWords */
2273                               false, /* invalidate after read */
2274                               num_regs,
2275                               offset,
2276                               1,        /* mlen: just g0 */
2277                               num_regs, /* rlen */
2278                               true);    /* header present */
2279}
2280
2281/**
2282 * Read a float[4] vector from the data port Data Cache (const buffer).
2283 * Location (in buffer) should be a multiple of 16.
2284 * Used for fetching shader constants.
2285 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* The header setup and the SEND must share identical execution state,
    * so the whole emission is bracketed by a push/pop pair.
    */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Message header: g0 with the oword offset written into element 2. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
                           true, /* header_present */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2340
2341
2342void brw_fb_WRITE(struct brw_codegen *p,
2343                  struct brw_reg payload,
2344                  struct brw_reg implied_header,
2345                  unsigned msg_control,
2346                  unsigned binding_table_index,
2347                  unsigned msg_length,
2348                  unsigned response_length,
2349                  bool eot,
2350                  bool last_render_target,
2351                  bool header_present)
2352{
2353   const struct brw_device_info *devinfo = p->devinfo;
2354   brw_inst *insn;
2355   unsigned msg_type;
2356   struct brw_reg dest, src0;
2357
2358   if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
2359      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2360   else
2361      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2362
2363   if (devinfo->gen >= 6) {
2364      insn = next_insn(p, BRW_OPCODE_SENDC);
2365   } else {
2366      insn = next_insn(p, BRW_OPCODE_SEND);
2367   }
2368   brw_inst_set_compression(devinfo, insn, false);
2369
2370   if (devinfo->gen >= 6) {
2371      /* headerless version, just submit color payload */
2372      src0 = payload;
2373
2374      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2375   } else {
2376      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2377      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2378      src0 = implied_header;
2379
2380      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2381   }
2382
2383   brw_set_dest(p, insn, dest);
2384   brw_set_src0(p, insn, src0);
2385   brw_set_dp_write_message(p,
2386			    insn,
2387			    binding_table_index,
2388			    msg_control,
2389			    msg_type,
2390			    msg_length,
2391			    header_present,
2392			    last_render_target,
2393			    response_length,
2394			    eot,
2395			    0 /* send_commit_msg */);
2396}
2397
2398
2399/**
2400 * Texture sample instruction.
2401 * Note: the msg_type plus msg_length values determine exactly what kind
2402 * of sampling operation is performed.  See volume 4, page 161 of docs.
2403 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Callers pass -1 (all ones, as msg_reg_nr is unsigned) to skip the
    * implied move of src0 into the message registers.
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2455
2456/* Adjust the message header's sampler state pointer to
2457 * select the correct group of 16 samplers.
2458 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct brw_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         /* Offsetting the sampler state pointer is only done on
          * Haswell and later.
          */
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* header dword 3 = r0.3 + byte offset of the group of 16
          * samplers containing `sampler`:
          * 16 * (sampler / 16) * 16 bytes.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* Compute the same byte offset as the immediate case at runtime:
       * (sampler & 0xf0) << 4 == 16 * (sampler / 16) * 16.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2501
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
/**
 * Emit a URB write SEND.
 *
 * \param dest             destination register for any writeback
 * \param msg_reg_nr       first MRF of the payload (set as the base MRF
 *                         pre-gen6)
 * \param src0             payload source; gen6_resolve_implied_move()
 *                         performs the explicit MRF move on gen6+
 * \param flags            enum brw_urb_write_flags for the descriptor
 * \param msg_length       payload length in registers
 * \param response_length  writeback length in registers
 * \param offset           URB global offset
 * \param swizzle          URB swizzle control
 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, SEND takes an explicit base MRF for the payload. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2552
/**
 * Emit a SEND to shared function \p sfid with message descriptor \p desc.
 *
 * If \p desc is an immediate it becomes src1 of the SEND directly.
 * Otherwise it is first OR'ed into address register a0 (so the caller can
 * still set additional descriptor bits through the usual
 * brw_set_*_message() helpers), and the SEND reads its descriptor from a0.
 *
 * Returns a pointer to the setup instruction: the SEND itself in the
 * immediate case, the OR in the indirect case.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *send;
   int setup;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* We hold on to the setup instruction (the SEND in the direct case, the OR
    * in the indirect case) by its index in the instruction store.  The
    * pointer returned by next_insn() may become invalid if emitting the SEND
    * in the indirect case reallocs the store.
    */

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = p->nr_insn;
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = p->nr_insn;
      brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   /* Narrow the SEND's execution size to match a narrow destination. */
   if (dst.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, send, dst.width);

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return &p->store[setup];
}
2609
/**
 * Emit a surface access SEND via brw_send_indirect_message(), accepting
 * either an immediate or a register surface index.
 *
 * A register surface index is first AND'ed with 0xff into address
 * register a0 (see comment below) and the message descriptor is then
 * taken from a0.  Returns the setup instruction so the caller can fill
 * in the remaining descriptor bits.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   /* `insn` from the AND above is intentionally discarded here; the caller
    * only needs the setup instruction of the SEND.
    */
   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2651
2652static bool
2653while_jumps_before_offset(const struct brw_device_info *devinfo,
2654                          brw_inst *insn, int while_offset, int start_offset)
2655{
2656   int scale = 16 / brw_jump_scale(devinfo);
2657   int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2658                               : brw_inst_jip(devinfo, insn);
2659   return while_offset + jip * scale <= start_offset;
2660}
2661
2662
2663static int
2664brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2665{
2666   int offset;
2667   void *store = p->store;
2668   const struct brw_device_info *devinfo = p->devinfo;
2669
2670   int depth = 0;
2671
2672   for (offset = next_offset(devinfo, store, start_offset);
2673        offset < p->next_insn_offset;
2674        offset = next_offset(devinfo, store, offset)) {
2675      brw_inst *insn = store + offset;
2676
2677      switch (brw_inst_opcode(devinfo, insn)) {
2678      case BRW_OPCODE_IF:
2679         depth++;
2680         break;
2681      case BRW_OPCODE_ENDIF:
2682         if (depth == 0)
2683            return offset;
2684         depth--;
2685         break;
2686      case BRW_OPCODE_WHILE:
2687         /* If the while doesn't jump before our instruction, it's the end
2688          * of a sibling do...while loop.  Ignore it.
2689          */
2690         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2691            continue;
2692         /* fallthrough */
2693      case BRW_OPCODE_ELSE:
2694      case BRW_OPCODE_HALT:
2695         if (depth == 0)
2696            return offset;
2697      }
2698   }
2699
2700   return 0;
2701}
2702
2703/* There is no DO instruction on gen6, so to find the end of the loop
2704 * we have to see if the loop is jumping back before our start
2705 * instruction.
2706 */
2707static int
2708brw_find_loop_end(struct brw_codegen *p, int start_offset)
2709{
2710   const struct brw_device_info *devinfo = p->devinfo;
2711   int offset;
2712   void *store = p->store;
2713
2714   assert(devinfo->gen >= 6);
2715
2716   /* Always start after the instruction (such as a WHILE) we're trying to fix
2717    * up.
2718    */
2719   for (offset = next_offset(devinfo, store, start_offset);
2720        offset < p->next_insn_offset;
2721        offset = next_offset(devinfo, store, offset)) {
2722      brw_inst *insn = store + offset;
2723
2724      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2725	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2726	    return offset;
2727      }
2728   }
2729   assert(!"not reached");
2730   return start_offset;
2731}
2732
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   int offset;
   /* Jump fields are encoded in units of `br` per 16 bytes of instruction
    * store, hence the (byte offset) / scale conversions below.
    */
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-gen6 has no JIP/UIP fields to fix up. */
   if (devinfo->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_cmpt_control(devinfo, insn)) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(brw_inst_opcode(devinfo, insn) != BRW_OPCODE_BREAK &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_CONTINUE &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         /* JIP: distance to the end of the enclosing control flow block. */
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
	    (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* UIP: distance to the loop-closing WHILE. */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      case BRW_OPCODE_ENDIF: {
         /* With no following block end, jump to the next instruction
          * (one 16-byte slot, i.e. 1 * br in jump units).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
	 break;
      }

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
	 } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;
      }
   }
}
2813
/**
 * Emit a SEND carrying an FF_SYNC URB message.
 *
 * \param dest             destination register for the writeback
 * \param msg_reg_nr       MRF holding the payload (base MRF pre-gen6)
 * \param src0             payload source; gen6_resolve_implied_move()
 *                         performs the explicit MRF move on gen6+
 * \param allocate         forwarded to brw_set_ff_sync_message()
 * \param response_length  writeback length in registers
 * \param eot              end-of-thread flag for the message
 */
void brw_ff_sync(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   unsigned response_length,
		   bool eot)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, SEND takes an explicit base MRF for the payload. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2841
2842/**
2843 * Emit the SEND instruction necessary to generate stream output data on Gen6
2844 * (for transform feedback).
2845 *
2846 * If send_commit_msg is true, this is the last piece of stream output data
2847 * from this thread, so send the data as a committed write.  According to the
2848 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2849 *
2850 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2851 *   writes are complete by sending the final write as a committed write."
2852 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool   send_commit_msg)
{
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length: a committed
                                              * write sends back one register
                                              * of data, so 0 or 1 */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}
2880
2881static unsigned
2882brw_surface_payload_size(struct brw_codegen *p,
2883                         unsigned num_channels,
2884                         bool has_simd4x2,
2885                         bool has_simd16)
2886{
2887   if (has_simd4x2 &&
2888       brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2889      return 1;
2890   else if (has_simd16 &&
2891            brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2892      return 2 * num_channels;
2893   else
2894      return num_channels;
2895}
2896
/**
 * Fill the data port descriptor fields (message type and control) of
 * \p insn for an untyped atomic operation.
 *
 * msg_control layout as assembled here: BRW_AOP_* in the low bits,
 * bit 4 = SIMD8 mode (Align1 only), bit 5 = return data expected.
 */
static void
brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
                                  brw_inst *insn,
                                  unsigned atomic_op,
                                  bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned msg_control =
      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
      (response_expected ? 1 << 5 : 0); /* Return data expected */

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      /* HSW+ uses data cache port 1, with a separate SIMD4x2 message type
       * for Align16 mode.
       */
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_8)
            msg_control |= 1 << 4; /* SIMD8 mode */

         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
      } else {
         brw_inst_set_dp_msg_type(devinfo, insn,
            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
      }
   } else {
      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);

      if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_8)
         msg_control |= 1 << 4; /* SIMD8 mode */
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
2929
/**
 * Emit an untyped atomic data port message performing \p atomic_op on
 * \p surface.
 *
 * \param response_expected  request return data into \p dst
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* Note that response_expected (0 or 1) doubles as the channel count for
    * the payload-size computation below -- an atomic returns at most one
    * channel of data.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      align1);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
2960
2961static void
2962brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2963                                        struct brw_inst *insn,
2964                                        unsigned num_channels)
2965{
2966   const struct brw_device_info *devinfo = p->devinfo;
2967   /* Set mask of 32-bit channels to drop. */
2968   unsigned msg_control = 0xf & (0xf << num_channels);
2969
2970   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2971      if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2972         msg_control |= 1 << 4; /* SIMD16 mode */
2973      else
2974         msg_control |= 2 << 4; /* SIMD8 mode */
2975   }
2976
2977   brw_inst_set_dp_msg_type(devinfo, insn,
2978                            (devinfo->gen >= 8 || devinfo->is_haswell ?
2979                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2980                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2981   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2982}
2983
2984void
2985brw_untyped_surface_read(struct brw_codegen *p,
2986                         struct brw_reg dst,
2987                         struct brw_reg payload,
2988                         struct brw_reg surface,
2989                         unsigned msg_length,
2990                         unsigned num_channels)
2991{
2992   const struct brw_device_info *devinfo = p->devinfo;
2993   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2994                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
2995                          GEN7_SFID_DATAPORT_DATA_CACHE);
2996   struct brw_inst *insn = brw_send_indirect_surface_message(
2997      p, sfid, dst, payload, surface, msg_length,
2998      brw_surface_payload_size(p, num_channels, true, true),
2999      false);
3000
3001   brw_set_dp_untyped_surface_read_message(
3002      p, insn, num_channels);
3003}
3004
/**
 * Fill the data port descriptor (message type and control) of \p insn for
 * an untyped surface write of \p num_channels channels.
 */
static void
brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
                                         struct brw_inst *insn,
                                         unsigned num_channels)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* Set mask of 32-bit channels to drop. */
   unsigned msg_control = 0xf & (0xf << num_channels);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
         msg_control |= 1 << 4; /* SIMD16 mode */
      else
         msg_control |= 2 << 4; /* SIMD8 mode */
   } else {
      if (devinfo->gen >= 8 || devinfo->is_haswell)
         msg_control |= 0 << 4; /* SIMD4x2 mode -- deliberate no-op that
                                 * documents the encoding */
      else
         msg_control |= 2 << 4; /* SIMD8 mode */
   }

   brw_inst_set_dp_msg_type(devinfo, insn,
                            devinfo->gen >= 8 || devinfo->is_haswell ?
                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3032
/**
 * Emit an untyped surface write of \p num_channels channels from
 * \p payload to \p surface.  The message has no writeback (response
 * length 0), so the destination is the null register.
 */
void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                          WRITEMASK_X : WRITEMASK_XYZW;
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(brw_null_reg(), mask),
      payload, surface, msg_length, 0, align1);

   brw_set_dp_untyped_surface_write_message(
      p, insn, num_channels);
}
3055
/**
 * Fill the data port descriptor (message type and control) of \p insn for
 * a typed atomic operation.
 *
 * msg_control layout as assembled here: BRW_AOP_* in the low bits, a
 * slot-group bit selecting the high half of the sample mask for odd
 * quarters (bit 4 on HSW+, bit 4 pre-HSW as well), and bit 5 = return
 * data expected.
 */
static void
brw_set_dp_typed_atomic_message(struct brw_codegen *p,
                                struct brw_inst *insn,
                                unsigned atomic_op,
                                bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   unsigned msg_control =
      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
      (response_expected ? 1 << 5 : 0); /* Return data expected */

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */

         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
      } else {
         brw_inst_set_dp_msg_type(devinfo, insn,
                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
      }

   } else {
      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);

      if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
         msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3089
/**
 * Emit a typed atomic operation \p atomic_op on \p surface.  The message
 * goes through the render cache data port before Haswell and data cache 1
 * on Haswell and later.
 */
void
brw_typed_atomic(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg payload,
                 struct brw_reg surface,
                 unsigned atomic_op,
                 unsigned msg_length,
                 bool response_expected) {
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* response_expected (0 or 1) doubles as the channel count below -- an
    * atomic returns at most one channel of data.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, false),
      true);

   brw_set_dp_typed_atomic_message(
      p, insn, atomic_op, response_expected);
}
3114
/**
 * Fill the data port descriptor (message type and control) of \p insn for
 * a typed surface read of \p num_channels channels.  In Align1 mode the
 * slot-group bits select which half of the sample mask applies, based on
 * the instruction's quarter control.
 */
static void
brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
                                      struct brw_inst *insn,
                                      unsigned num_channels)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* Set mask of unused channels. */
   unsigned msg_control = 0xf & (0xf << num_channels);

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
         else
            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
   } else {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3146
3147void
3148brw_typed_surface_read(struct brw_codegen *p,
3149                       struct brw_reg dst,
3150                       struct brw_reg payload,
3151                       struct brw_reg surface,
3152                       unsigned msg_length,
3153                       unsigned num_channels)
3154{
3155   const struct brw_device_info *devinfo = p->devinfo;
3156   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3157                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3158                          GEN6_SFID_DATAPORT_RENDER_CACHE);
3159   struct brw_inst *insn = brw_send_indirect_surface_message(
3160      p, sfid, dst, payload, surface, msg_length,
3161      brw_surface_payload_size(p, num_channels,
3162                               devinfo->gen >= 8 || devinfo->is_haswell, false),
3163      true);
3164
3165   brw_set_dp_typed_surface_read_message(
3166      p, insn, num_channels);
3167}
3168
/**
 * Fill the data port descriptor (message type and control) of \p insn for
 * a typed surface write of \p num_channels channels.  In Align1 mode the
 * slot-group bits select which half of the sample mask applies, based on
 * the instruction's quarter control.
 */
static void
brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
                                       struct brw_inst *insn,
                                       unsigned num_channels)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* Set mask of unused channels. */
   unsigned msg_control = 0xf & (0xf << num_channels);

   if (devinfo->gen >= 8 || devinfo->is_haswell) {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
         else
            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);

   } else {
      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
      }

      brw_inst_set_dp_msg_type(devinfo, insn,
                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
   }

   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
}
3201
/**
 * Emit a typed surface write of \p num_channels channels from \p payload
 * to \p surface.  The message has no writeback (response length 0), so
 * the destination is the null register.
 */
void
brw_typed_surface_write(struct brw_codegen *p,
                        struct brw_reg payload,
                        struct brw_reg surface,
                        unsigned msg_length,
                        unsigned num_channels)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                          WRITEMASK_X : WRITEMASK_XYZW);
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(brw_null_reg(), mask),
      payload, surface, msg_length, 0, true);

   brw_set_dp_typed_surface_write_message(
      p, insn, num_channels);
}
3224
/**
 * Build the descriptor for a memory fence message on \p insn: one
 * register of payload, an optional one-register commit writeback, and
 * the fence message type for the given data port \p sfid.
 *
 * \param commit_enable  request a commit writeback (also sets bit 5 of
 *                       the message control field)
 */
static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, sfid,
                              1 /* message length */,
                              (commit_enable ? 1 : 0) /* response length */,
                              true /* header present */,
                              false);

   /* Only the render cache and data cache data ports have a fence
    * message type.
    */
   switch (sfid) {
   case GEN6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GEN7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
}
3253
/**
 * Emit a memory fence through the data cache data port, plus -- on IVB
 * (gen7 non-Haswell) -- a second fence through the render cache and a
 * MOV that stalls until both writebacks have landed (see inline
 * comments).
 *
 * \param dst  scratch register used for dependency tracking and the
 *             commit writeback(s); only vec1 of it is used
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* Only IVB needs the commit writeback; later gens fence without it. */
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = vec1(dst);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   dst = retype(dst, BRW_REGISTER_TYPE_UW);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   brw_pop_insn_state(p);
}
3298
3299void
3300brw_pixel_interpolator_query(struct brw_codegen *p,
3301                             struct brw_reg dest,
3302                             struct brw_reg mrf,
3303                             bool noperspective,
3304                             unsigned mode,
3305                             struct brw_reg data,
3306                             unsigned msg_length,
3307                             unsigned response_length)
3308{
3309   const struct brw_device_info *devinfo = p->devinfo;
3310   struct brw_inst *insn;
3311   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3312
3313   /* brw_send_indirect_message will automatically use a direct send message
3314    * if data is actually immediate.
3315    */
3316   insn = brw_send_indirect_message(p,
3317                                    GEN7_SFID_PIXEL_INTERPOLATOR,
3318                                    dest,
3319                                    mrf,
3320                                    vec1(data));
3321   brw_inst_set_mlen(devinfo, insn, msg_length);
3322   brw_inst_set_rlen(devinfo, insn, response_length);
3323
3324   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3325   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3326   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3327   brw_inst_set_pi_message_type(devinfo, insn, mode);
3328}
3329
/**
 * Emit code that writes the index of the lowest-numbered enabled execution
 * channel into the first component of \p dst.
 *
 * The strategy depends on both the access mode of the default instruction
 * state (Align1 vs. Align16/SIMD4x2) and the hardware generation, since the
 * mask register is only directly usable on Gen8+.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the mask register.  The same register exists
          * on HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         inst = brw_FBL(p, vec1(dst),
                        retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * this register.  Make sure it's set to zero.
          */
         brw_inst_set_qtr_control(devinfo, inst, GEN6_COMPRESSION_1Q);
      } else {
         /* Gen7: the mask register isn't usable (see above), so reconstruct
          * the execution mask in flag register f1.0 instead.
          */
         const struct brw_reg flag = retype(brw_flag_reg(1, 0),
                                            BRW_REGISTER_TYPE_UD);

         /* Clear the flag first so channels beyond the executed width read
          * as zero.
          */
         brw_MOV(p, flag, brw_imm_ud(0));

         /* Run a 16-wide instruction returning zero with execution masking
          * and a conditional modifier enabled in order to get the current
          * execution mask in f1.0.
          */
         inst = brw_MOV(p, brw_null_reg(), brw_imm_ud(0));
         brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_16);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* First set bit of the reconstructed mask is the live channel. */
         brw_FBL(p, vec1(dst), flag);
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         /* Unmasked write: dst.x = 1 unconditionally. */
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         /* Masked write: dst.x = 0 only if channel 0 is live, so dst.x ends
          * up holding the index (0 or 1) of the first live channel.
          */
         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3404
/**
 * Copy the value of channel \p idx of \p src into \p dst.
 *
 * \p idx may be an immediate or a dynamically computed register value.
 * \p src must be a directly-addressed GRF; \p dst receives the selected
 * component replicated via a scalar region.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   /* Indirect addressing below only works on directly-addressed GRFs. */
   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         /* Align1: compute the byte address of the component in the address
          * register, then read it back through an indirect scalar region.
          */
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         /* NOTE(review): vstride/width/hstride appear to be in their log2
          * hardware encoding here (hence the "- 1" below) -- confirm against
          * brw_reg.h.
          */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register.
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 0, 4, 1),
                        stride(src, 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }
}
3479
3480/**
3481 * This instruction is generated as a single-channel align1 instruction by
3482 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3483 *
3484 * We can't use the typed atomic op in the FS because that has the execution
3485 * mask ANDed with the pixel mask, but we just want to write the one dword for
3486 * all the pixels.
3487 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
3489 * one u32.  So we use the same untyped atomic write message as the pixel
3490 * shader.
3491 *
3492 * The untyped atomic operation requires a BUFFER surface type with RAW
3493 * format, and is only accessible through the legacy DATA_CACHE dataport
3494 * messages.
3495 */
3496void brw_shader_time_add(struct brw_codegen *p,
3497                         struct brw_reg payload,
3498                         uint32_t surf_index)
3499{
3500   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
3501                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3502                          GEN7_SFID_DATAPORT_DATA_CACHE);
3503   assert(p->devinfo->gen >= 7);
3504
3505   brw_push_insn_state(p);
3506   brw_set_default_access_mode(p, BRW_ALIGN_1);
3507   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3508   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3509   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3510
3511   /* We use brw_vec1_reg and unmasked because we want to increment the given
3512    * offset only once.
3513    */
3514   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3515                                      BRW_ARF_NULL, 0));
3516   brw_set_src0(p, send, brw_vec1_reg(payload.file,
3517                                      payload.nr, 0));
3518   brw_set_src1(p, send, brw_imm_ud(0));
3519   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
3520   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
3521   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
3522
3523   brw_pop_insn_state(p);
3524}
3525
3526
3527/**
3528 * Emit the SEND message for a barrier
3529 */
3530void
3531brw_barrier(struct brw_codegen *p, struct brw_reg src)
3532{
3533   const struct brw_device_info *devinfo = p->devinfo;
3534   struct brw_inst *inst;
3535
3536   assert(devinfo->gen >= 7);
3537
3538   inst = next_insn(p, BRW_OPCODE_SEND);
3539   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3540   brw_set_src0(p, inst, src);
3541   brw_set_src1(p, inst, brw_null_reg());
3542
3543   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3544                              1 /* msg_length */,
3545                              0 /* response_length */,
3546                              false /* header_present */,
3547                              false /* end_of_thread */);
3548
3549   brw_inst_set_gateway_notify(devinfo, inst, 1);
3550   brw_inst_set_gateway_subfuncid(devinfo, inst,
3551                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3552
3553   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3554}
3555
3556
3557/**
3558 * Emit the wait instruction for a barrier
3559 */
3560void
3561brw_WAIT(struct brw_codegen *p)
3562{
3563   const struct brw_device_info *devinfo = p->devinfo;
3564   struct brw_inst *insn;
3565
3566   struct brw_reg src = brw_notification_reg();
3567
3568   insn = next_insn(p, BRW_OPCODE_WAIT);
3569   brw_set_dest(p, insn, src);
3570   brw_set_src0(p, insn, src);
3571   brw_set_src1(p, insn, brw_null_reg());
3572
3573   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3574   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3575}
3576