1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "util/ralloc.h"
38
39/**
40 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
41 * registers, implicitly moving the operand to a message register.
42 *
43 * On Sandybridge, this is no longer the case.  This function performs the
44 * explicit move; it should be called before emitting a SEND instruction.
45 */
46void
47gen6_resolve_implied_move(struct brw_codegen *p,
48			  struct brw_reg *src,
49			  unsigned msg_reg_nr)
50{
51   const struct gen_device_info *devinfo = p->devinfo;
52   if (devinfo->gen < 6)
53      return;
54
55   if (src->file == BRW_MESSAGE_REGISTER_FILE)
56      return;
57
58   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
59      brw_push_insn_state(p);
60      brw_set_default_exec_size(p, BRW_EXECUTE_8);
61      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64	      retype(*src, BRW_REGISTER_TYPE_UD));
65      brw_pop_insn_state(p);
66   }
67   *src = brw_message_reg(msg_reg_nr);
68}
69
70static void
71gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72{
73   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74    * "The send with EOT should use register space R112-R127 for <src>. This is
75    *  to enable loading of a new thread into the same slot while the message
76    *  with EOT for current thread is pending dispatch."
77    *
78    * Since we're pretending to have 16 MRFs anyway, we may as well use the
79    * registers required for messages with EOT.
80    */
81   const struct gen_device_info *devinfo = p->devinfo;
82   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83      reg->file = BRW_GENERAL_REGISTER_FILE;
84      reg->nr += GEN7_MRF_HACK_START;
85   }
86}
87
88/**
89 * Convert a brw_reg_type enumeration value into the hardware representation.
90 *
91 * The hardware encoding may depend on whether the value is an immediate.
92 */
93unsigned
94brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
95                        enum brw_reg_type type, enum brw_reg_file file)
96{
97   if (file == BRW_IMMEDIATE_VALUE) {
98      static const int imm_hw_types[] = {
99         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
100         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
101         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
102         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
103         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
104         [BRW_REGISTER_TYPE_UB] = -1,
105         [BRW_REGISTER_TYPE_B]  = -1,
106         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
107         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
108         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
109         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
110         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
111         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
112         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
113      };
114      assert(type < ARRAY_SIZE(imm_hw_types));
115      assert(imm_hw_types[type] != -1);
116      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
117      return imm_hw_types[type];
118   } else {
119      /* Non-immediate registers */
120      static const int hw_types[] = {
121         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
122         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
123         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
124         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
125         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
126         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
127         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
128         [BRW_REGISTER_TYPE_UV] = -1,
129         [BRW_REGISTER_TYPE_VF] = -1,
130         [BRW_REGISTER_TYPE_V]  = -1,
131         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
132         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
133         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
134         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
135      };
136      assert(type < ARRAY_SIZE(hw_types));
137      assert(hw_types[type] != -1);
138      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
139      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
140      return hw_types[type];
141   }
142}
143
144void
145brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
146{
147   const struct gen_device_info *devinfo = p->devinfo;
148
149   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
150      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
151   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
152      assert(dest.nr < 128);
153
154   gen7_convert_mrf_to_grf(p, &dest);
155
156   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
157   brw_inst_set_dst_reg_type(devinfo, inst,
158                             brw_reg_type_to_hw_type(devinfo, dest.type,
159                                                     dest.file));
160   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
161
162   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
163      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
164
165      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
166         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
167	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
168	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
169         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
170      } else {
171         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
172         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
173         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
174             dest.file == BRW_MESSAGE_REGISTER_FILE) {
175            assert(dest.writemask != 0);
176         }
177	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
178	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
179	  *    this to be programmed as "01".
180	  */
181         brw_inst_set_dst_hstride(devinfo, inst, 1);
182      }
183   } else {
184      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
185
186      /* These are different sizes in align1 vs align16:
187       */
188      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
189         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
190                                       dest.indirect_offset);
191	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
192	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
193         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
194      } else {
195         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
196                                        dest.indirect_offset);
197	 /* even ignored in da16, still need to set as '01' */
198         brw_inst_set_dst_hstride(devinfo, inst, 1);
199      }
200   }
201
202   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
203    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
204    * small registers, we automatically reduce it to match the register size.
205    *
206    * In platforms that support fp64 we can emit instructions with a width of
207    * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
208    * cases we need to make sure that these instructions have their exec sizes
209    * set properly when they are emitted and we can't rely on this code to fix
210    * it.
211    */
212   bool fix_exec_size;
213   if (devinfo->gen >= 6)
214      fix_exec_size = dest.width < BRW_EXECUTE_4;
215   else
216      fix_exec_size = dest.width < BRW_EXECUTE_8;
217
218   if (fix_exec_size)
219      brw_inst_set_exec_size(devinfo, inst, dest.width);
220}
221
222extern int reg_type_size[];
223
224static void
225validate_reg(const struct gen_device_info *devinfo,
226             brw_inst *inst, struct brw_reg reg)
227{
228   const int hstride_for_reg[] = {0, 1, 2, 4};
229   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
230   const int width_for_reg[] = {1, 2, 4, 8, 16};
231   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
232   int width, hstride, vstride, execsize;
233
234   if (reg.file == BRW_IMMEDIATE_VALUE) {
235      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
236       * mean the destination has to be 128-bit aligned and the
237       * destination horiz stride has to be a word.
238       */
239      if (reg.type == BRW_REGISTER_TYPE_V) {
240         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
241                reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
242      }
243
244      return;
245   }
246
247   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
248       reg.file == BRW_ARF_NULL)
249      return;
250
251   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
252    *
253    *    "Swizzling is not allowed when an accumulator is used as an implicit
254    *    source or an explicit source in an instruction."
255    */
256   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
257       reg.nr == BRW_ARF_ACCUMULATOR)
258      assert(reg.swizzle == BRW_SWIZZLE_XYZW);
259
260   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
261   hstride = hstride_for_reg[reg.hstride];
262
263   if (reg.vstride == 0xf) {
264      vstride = -1;
265   } else {
266      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
267      vstride = vstride_for_reg[reg.vstride];
268   }
269
270   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
271   width = width_for_reg[reg.width];
272
273   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
274          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
275   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
276
277   /* Restrictions from 3.3.10: Register Region Restrictions. */
278   /* 3. */
279   assert(execsize >= width);
280
281   /* 4. */
282   if (execsize == width && hstride != 0) {
283      assert(vstride == -1 || vstride == width * hstride);
284   }
285
286   /* 5. */
287   if (execsize == width && hstride == 0) {
288      /* no restriction on vstride. */
289   }
290
291   /* 6. */
292   if (width == 1) {
293      assert(hstride == 0);
294   }
295
296   /* 7. */
297   if (execsize == 1 && width == 1) {
298      assert(hstride == 0);
299      assert(vstride == 0);
300   }
301
302   /* 8. */
303   if (vstride == 0 && hstride == 0) {
304      assert(width == 1);
305   }
306
307   /* 10. Check destination issues. */
308}
309
/**
 * Report whether a 32-bit immediate fits the compacted encoding: the low
 * 12 bits are stored as-is and a single bit is replicated through the top
 * 20, so those top 20 bits must be either all zero or all one.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned top_bits = imm & ~0xfffu;

   return top_bits == 0 || top_bits == 0xfffff000;
}
319
320void
321brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
322{
323   const struct gen_device_info *devinfo = p->devinfo;
324
325   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
326      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
327   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
328      assert(reg.nr < 128);
329
330   gen7_convert_mrf_to_grf(p, &reg);
331
332   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
333                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
334      /* Any source modifiers or regions will be ignored, since this just
335       * identifies the MRF/GRF to start reading the message contents from.
336       * Check for some likely failures.
337       */
338      assert(!reg.negate);
339      assert(!reg.abs);
340      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
341   }
342
343   validate_reg(devinfo, inst, reg);
344
345   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
346   brw_inst_set_src0_reg_type(devinfo, inst,
347                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
348   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
349   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
350   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
351
352   if (reg.file == BRW_IMMEDIATE_VALUE) {
353      if (reg.type == BRW_REGISTER_TYPE_DF ||
354          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
355         brw_inst_set_imm_df(devinfo, inst, reg.df);
356      else
357         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
358
359      /* The Bspec's section titled "Non-present Operands" claims that if src0
360       * is an immediate that src1's type must be the same as that of src0.
361       *
362       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
363       * that do not follow this rule. E.g., from the IVB/HSW table:
364       *
365       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
366       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
367       *
368       * And from the SNB table:
369       *
370       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
371       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
372       *
373       * Neither of these cause warnings from the simulator when used,
374       * compacted or otherwise. In fact, all compaction mappings that have an
375       * immediate in src0 use a:ud for src1.
376       *
377       * The GM45 instruction compaction tables do not contain mapped meanings
378       * so it's not clear whether it has the restriction. We'll assume it was
379       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
380       *
381       * Don't do any of this for 64-bit immediates, since the src1 fields
382       * overlap with the immediate and setting them would overwrite the
383       * immediate we set.
384       */
385      if (type_sz(reg.type) < 8) {
386         brw_inst_set_src1_reg_file(devinfo, inst,
387                                    BRW_ARCHITECTURE_REGISTER_FILE);
388         if (devinfo->gen < 6) {
389            brw_inst_set_src1_reg_type(devinfo, inst,
390                                       brw_inst_src0_reg_type(devinfo, inst));
391         } else {
392            brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
393         }
394      }
395
396      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
397       * for immediate values. Presumably the hardware engineers realized
398       * that the only useful floating-point value that could be represented
399       * in this format is 0.0, which can also be represented as a VF-typed
400       * immediate, so they gave us the previously mentioned mapping on IVB+.
401       *
402       * Strangely, we do have a mapping for imm:f in src1, so we don't need
403       * to do this there.
404       *
405       * If we see a 0.0:F, change the type to VF so that it can be compacted.
406       */
407      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
408          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
409         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
410      }
411
412      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
413       * set the types to :UD so the instruction can be compacted.
414       */
415      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
416          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
417          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
418          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
419         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
420         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
421      }
422   } else {
423      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
424         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
425         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
426             brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
427	 } else {
428            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
429	 }
430      } else {
431         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
432
433         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
434            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
435	 } else {
436            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
437	 }
438      }
439
440      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
441	 if (reg.width == BRW_WIDTH_1 &&
442             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
443            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
444            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
445            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
446	 } else {
447            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
448            brw_inst_set_src0_width(devinfo, inst, reg.width);
449            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
450	 }
451      } else {
452         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
453            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
454         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
455            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
456         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
457            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
458         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
459            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
460
461	 /* This is an oddity of the fact we're using the same
462	  * descriptions for registers in align_16 as align_1:
463	  */
464	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
465            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
466	 else
467            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
468      }
469   }
470}
471
472
473void
474brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
475{
476   const struct gen_device_info *devinfo = p->devinfo;
477
478   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
479      assert(reg.nr < 128);
480
481   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
482    *
483    *    "Accumulator registers may be accessed explicitly as src0
484    *    operands only."
485    */
486   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
487          reg.nr != BRW_ARF_ACCUMULATOR);
488
489   gen7_convert_mrf_to_grf(p, &reg);
490   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
491
492   validate_reg(devinfo, inst, reg);
493
494   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
495   brw_inst_set_src1_reg_type(devinfo, inst,
496                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
497   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
498   brw_inst_set_src1_negate(devinfo, inst, reg.negate);
499
500   /* Only src1 can be immediate in two-argument instructions.
501    */
502   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
503
504   if (reg.file == BRW_IMMEDIATE_VALUE) {
505      /* two-argument instructions can only use 32-bit immediates */
506      assert(type_sz(reg.type) < 8);
507      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
508   } else {
509      /* This is a hardware restriction, which may or may not be lifted
510       * in the future:
511       */
512      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
513      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
514
515      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
516      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
517         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
518      } else {
519         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
520      }
521
522      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
523	 if (reg.width == BRW_WIDTH_1 &&
524             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
525            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
526            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
527            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
528	 } else {
529            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
530            brw_inst_set_src1_width(devinfo, inst, reg.width);
531            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
532	 }
533      } else {
534         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
535            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
536         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
537            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
538         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
539            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
540         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
541            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
542
543	 /* This is an oddity of the fact we're using the same
544	  * descriptions for registers in align_16 as align_1:
545	  */
546	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
547            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
548	 else
549            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
550      }
551   }
552}
553
554/**
555 * Set the Message Descriptor and Extended Message Descriptor fields
556 * for SEND messages.
557 *
558 * \note This zeroes out the Function Control bits, so it must be called
559 *       \b before filling out any message-specific data.  Callers can
560 *       choose not to fill in irrelevant bits; they will be zero.
561 */
562void
563brw_set_message_descriptor(struct brw_codegen *p,
564			   brw_inst *inst,
565			   enum brw_message_target sfid,
566			   unsigned msg_length,
567			   unsigned response_length,
568			   bool header_present,
569			   bool end_of_thread)
570{
571   const struct gen_device_info *devinfo = p->devinfo;
572
573   brw_set_src1(p, inst, brw_imm_d(0));
574
575   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
576    * itself; instead, it will be a MOV/OR into the address register.
577    *
578    * In this case, we avoid setting the extended message descriptor bits,
579    * since they go on the later SEND/SENDC instead and if set here would
580    * instead clobber the conditionalmod bits.
581    */
582   unsigned opcode = brw_inst_opcode(devinfo, inst);
583   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
584      brw_inst_set_sfid(devinfo, inst, sfid);
585   }
586
587   brw_inst_set_mlen(devinfo, inst, msg_length);
588   brw_inst_set_rlen(devinfo, inst, response_length);
589   brw_inst_set_eot(devinfo, inst, end_of_thread);
590
591   if (devinfo->gen >= 5) {
592      brw_inst_set_header_present(devinfo, inst, header_present);
593   }
594}
595
596static void brw_set_math_message( struct brw_codegen *p,
597				  brw_inst *inst,
598				  unsigned function,
599				  unsigned integer_type,
600				  bool low_precision,
601				  unsigned dataType )
602{
603   const struct gen_device_info *devinfo = p->devinfo;
604   unsigned msg_length;
605   unsigned response_length;
606
607   /* Infer message length from the function */
608   switch (function) {
609   case BRW_MATH_FUNCTION_POW:
610   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
611   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
612   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
613      msg_length = 2;
614      break;
615   default:
616      msg_length = 1;
617      break;
618   }
619
620   /* Infer response length from the function */
621   switch (function) {
622   case BRW_MATH_FUNCTION_SINCOS:
623   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
624      response_length = 2;
625      break;
626   default:
627      response_length = 1;
628      break;
629   }
630
631
632   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
633			      msg_length, response_length, false, false);
634   brw_inst_set_math_msg_function(devinfo, inst, function);
635   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
636   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
637   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
638   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
639   brw_inst_set_saturate(devinfo, inst, 0);
640}
641
642
643static void brw_set_ff_sync_message(struct brw_codegen *p,
644				    brw_inst *insn,
645				    bool allocate,
646				    unsigned response_length,
647				    bool end_of_thread)
648{
649   const struct gen_device_info *devinfo = p->devinfo;
650
651   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
652			      1, response_length, true, end_of_thread);
653   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
654   brw_inst_set_urb_allocate(devinfo, insn, allocate);
655   /* The following fields are not used by FF_SYNC: */
656   brw_inst_set_urb_global_offset(devinfo, insn, 0);
657   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
658   brw_inst_set_urb_used(devinfo, insn, 0);
659   brw_inst_set_urb_complete(devinfo, insn, 0);
660}
661
662static void brw_set_urb_message( struct brw_codegen *p,
663				 brw_inst *insn,
664                                 enum brw_urb_write_flags flags,
665				 unsigned msg_length,
666				 unsigned response_length,
667				 unsigned offset,
668				 unsigned swizzle_control )
669{
670   const struct gen_device_info *devinfo = p->devinfo;
671
672   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
673   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
674   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
675
676   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
677			      msg_length, response_length, true,
678                              flags & BRW_URB_WRITE_EOT);
679
680   if (flags & BRW_URB_WRITE_OWORD) {
681      assert(msg_length == 2); /* header + one OWORD of data */
682      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
683   } else {
684      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
685   }
686
687   brw_inst_set_urb_global_offset(devinfo, insn, offset);
688   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
689
690   if (devinfo->gen < 8) {
691      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
692   }
693
694   if (devinfo->gen < 7) {
695      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
696      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
697   } else {
698      brw_inst_set_urb_per_slot_offset(devinfo, insn,
699         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
700   }
701}
702
703void
704brw_set_dp_write_message(struct brw_codegen *p,
705			 brw_inst *insn,
706			 unsigned binding_table_index,
707			 unsigned msg_control,
708			 unsigned msg_type,
709                         unsigned target_cache,
710			 unsigned msg_length,
711			 bool header_present,
712			 unsigned last_render_target,
713			 unsigned response_length,
714			 unsigned end_of_thread,
715			 unsigned send_commit_msg)
716{
717   const struct gen_device_info *devinfo = p->devinfo;
718   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
719                          BRW_SFID_DATAPORT_WRITE);
720
721   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
722			      header_present, end_of_thread);
723
724   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
725   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
726   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
727   brw_inst_set_rt_last(devinfo, insn, last_render_target);
728   if (devinfo->gen < 7) {
729      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
730   }
731}
732
733void
734brw_set_dp_read_message(struct brw_codegen *p,
735			brw_inst *insn,
736			unsigned binding_table_index,
737			unsigned msg_control,
738			unsigned msg_type,
739			unsigned target_cache,
740			unsigned msg_length,
741                        bool header_present,
742			unsigned response_length)
743{
744   const struct gen_device_info *devinfo = p->devinfo;
745   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
746                          BRW_SFID_DATAPORT_READ);
747
748   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
749			      header_present, false);
750
751   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
752   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
753   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
754   if (devinfo->gen < 6)
755      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
756}
757
758void
759brw_set_sampler_message(struct brw_codegen *p,
760                        brw_inst *inst,
761                        unsigned binding_table_index,
762                        unsigned sampler,
763                        unsigned msg_type,
764                        unsigned response_length,
765                        unsigned msg_length,
766                        unsigned header_present,
767                        unsigned simd_mode,
768                        unsigned return_format)
769{
770   const struct gen_device_info *devinfo = p->devinfo;
771
772   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
773			      response_length, header_present, false);
774
775   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
776   brw_inst_set_sampler(devinfo, inst, sampler);
777   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
778   if (devinfo->gen >= 5) {
779      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
780   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
781      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
782   }
783}
784
785static void
786gen7_set_dp_scratch_message(struct brw_codegen *p,
787                            brw_inst *inst,
788                            bool write,
789                            bool dword,
790                            bool invalidate_after_read,
791                            unsigned num_regs,
792                            unsigned addr_offset,
793                            unsigned mlen,
794                            unsigned rlen,
795                            bool header_present)
796{
797   const struct gen_device_info *devinfo = p->devinfo;
798   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
799          (devinfo->gen >= 8 && num_regs == 8));
800   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
801                                num_regs - 1);
802
803   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
804                              mlen, rlen, header_present, false);
805   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
806   brw_inst_set_scratch_read_write(devinfo, inst, write);
807   brw_inst_set_scratch_type(devinfo, inst, dword);
808   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
809   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
810   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
811}
812
813#define next_insn brw_next_insn
814brw_inst *
815brw_next_insn(struct brw_codegen *p, unsigned opcode)
816{
817   const struct gen_device_info *devinfo = p->devinfo;
818   brw_inst *insn;
819
820   if (p->nr_insn + 1 > p->store_size) {
821      p->store_size <<= 1;
822      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
823   }
824
825   p->next_insn_offset += 16;
826   insn = &p->store[p->nr_insn++];
827   memcpy(insn, p->current, sizeof(*insn));
828
829   brw_inst_set_opcode(devinfo, insn, opcode);
830   return insn;
831}
832
833static brw_inst *
834brw_alu1(struct brw_codegen *p, unsigned opcode,
835         struct brw_reg dest, struct brw_reg src)
836{
837   brw_inst *insn = next_insn(p, opcode);
838   brw_set_dest(p, insn, dest);
839   brw_set_src0(p, insn, src);
840   return insn;
841}
842
843static brw_inst *
844brw_alu2(struct brw_codegen *p, unsigned opcode,
845         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
846{
847   /* 64-bit immediates are only supported on 1-src instructions */
848   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
849   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
850
851   brw_inst *insn = next_insn(p, opcode);
852   brw_set_dest(p, insn, dest);
853   brw_set_src0(p, insn, src0);
854   brw_set_src1(p, insn, src1);
855   return insn;
856}
857
858static int
859get_3src_subreg_nr(struct brw_reg reg)
860{
861   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
862    * use 32-bit units (components 0..7).  Since they only support F/D/UD
863    * types, this doesn't lose any flexibility, but uses fewer bits.
864    */
865   return reg.subnr / 4;
866}
867
868static brw_inst *
869brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
870         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
871{
872   const struct gen_device_info *devinfo = p->devinfo;
873   brw_inst *inst = next_insn(p, opcode);
874
875   gen7_convert_mrf_to_grf(p, &dest);
876
877   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);
878
879   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
880	  dest.file == BRW_MESSAGE_REGISTER_FILE);
881   assert(dest.nr < 128);
882   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
883   assert(dest.type == BRW_REGISTER_TYPE_F  ||
884          dest.type == BRW_REGISTER_TYPE_DF ||
885          dest.type == BRW_REGISTER_TYPE_D  ||
886          dest.type == BRW_REGISTER_TYPE_UD);
887   if (devinfo->gen == 6) {
888      brw_inst_set_3src_dst_reg_file(devinfo, inst,
889                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
890   }
891   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
892   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
893   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);
894
895   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
896   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
897   assert(src0.nr < 128);
898   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
899   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
900   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
901   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
902   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
903   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
904                                   src0.vstride == BRW_VERTICAL_STRIDE_0);
905
906   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
907   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
908   assert(src1.nr < 128);
909   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
910   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
911   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
912   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
913   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
914   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
915                                   src1.vstride == BRW_VERTICAL_STRIDE_0);
916
917   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
918   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
919   assert(src2.nr < 128);
920   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
921   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
922   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
923   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
924   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
925   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
926                                   src2.vstride == BRW_VERTICAL_STRIDE_0);
927
928   if (devinfo->gen >= 7) {
929      /* Set both the source and destination types based on dest.type,
930       * ignoring the source register types.  The MAD and LRP emitters ensure
931       * that all four types are float.  The BFE and BFI2 emitters, however,
932       * may send us mixed D and UD types and want us to ignore that and use
933       * the destination type.
934       */
935      switch (dest.type) {
936      case BRW_REGISTER_TYPE_F:
937         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
938         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
939         break;
940      case BRW_REGISTER_TYPE_DF:
941         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
942         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
943         break;
944      case BRW_REGISTER_TYPE_D:
945         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
946         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
947         break;
948      case BRW_REGISTER_TYPE_UD:
949         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
950         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
951         break;
952      default:
953         unreachable("not reached");
954      }
955   }
956
957   return inst;
958}
959
960
961/***********************************************************************
962 * Convenience routines.
963 */
/* ALU1(OP) defines the public emitter brw_<OP>(p, dest, src0), which emits
 * the one-source instruction BRW_OPCODE_<OP> through brw_alu1() and returns
 * the resulting instruction.
 */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}
971
/* ALU2(OP) defines the public emitter brw_<OP>(p, dest, src0, src1), which
 * emits the two-source instruction BRW_OPCODE_<OP> through brw_alu2() and
 * returns the resulting instruction.
 */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
980
/* ALU3(OP) defines the public emitter brw_<OP>(p, dest, src0, src1, src2),
 * which emits the three-source instruction BRW_OPCODE_<OP> through
 * brw_alu3() and returns the resulting instruction.  No type checks are
 * added here beyond those performed by brw_alu3() itself.
 */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
990
/* ALU3F(OP) is the floating-point variant of ALU3(OP): the generated
 * brw_<OP>() asserts that the destination is F or DF and that all three
 * sources have the same type as the destination before handing off to
 * brw_alu3().
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1011
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 *
 * ROUND(OP) defines void brw_<OP>(p, dest, src).  The generated function
 * always emits BRW_OPCODE_<OP>; only when gen < 6 does it additionally set
 * the BRW_CONDITIONAL_R conditional modifier on that instruction and follow
 * it with an ADD of 1.0f to dest predicated with BRW_PREDICATE_NORMAL.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_codegen *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   const struct gen_device_info *devinfo = p->devinfo;					      \
   brw_inst *rnd, *add;							      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (devinfo->gen < 6) {							      \
      /* turn on round-increments */					      \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
   }									      \
}
1037
1038
/* Instantiate the public brw_<OP>() emitters.  Each line expands to one
 * complete function definition: ALU1/ALU2/ALU3 produce plain wrappers around
 * brw_alu1()/brw_alu2()/brw_alu3(), and ALU3F adds float type assertions.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

/* Rounding emitters that need the ROUND() fix-up sequence; RNDD is a plain
 * ALU1 above.
 */
ROUND(RNDZ)
ROUND(RNDE)
1072
1073
1074brw_inst *
1075brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1076        struct brw_reg src0, struct brw_reg src1)
1077{
1078   /* 6.2.2: add */
1079   if (src0.type == BRW_REGISTER_TYPE_F ||
1080       (src0.file == BRW_IMMEDIATE_VALUE &&
1081	src0.type == BRW_REGISTER_TYPE_VF)) {
1082      assert(src1.type != BRW_REGISTER_TYPE_UD);
1083      assert(src1.type != BRW_REGISTER_TYPE_D);
1084   }
1085
1086   if (src1.type == BRW_REGISTER_TYPE_F ||
1087       (src1.file == BRW_IMMEDIATE_VALUE &&
1088	src1.type == BRW_REGISTER_TYPE_VF)) {
1089      assert(src0.type != BRW_REGISTER_TYPE_UD);
1090      assert(src0.type != BRW_REGISTER_TYPE_D);
1091   }
1092
1093   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1094}
1095
1096brw_inst *
1097brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1098        struct brw_reg src0, struct brw_reg src1)
1099{
1100   assert(dest.type == src0.type);
1101   assert(src0.type == src1.type);
1102   switch (src0.type) {
1103   case BRW_REGISTER_TYPE_B:
1104   case BRW_REGISTER_TYPE_UB:
1105   case BRW_REGISTER_TYPE_W:
1106   case BRW_REGISTER_TYPE_UW:
1107   case BRW_REGISTER_TYPE_D:
1108   case BRW_REGISTER_TYPE_UD:
1109      break;
1110   default:
1111      unreachable("Bad type for brw_AVG");
1112   }
1113
1114   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1115}
1116
1117brw_inst *
1118brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1119        struct brw_reg src0, struct brw_reg src1)
1120{
1121   /* 6.32.38: mul */
1122   if (src0.type == BRW_REGISTER_TYPE_D ||
1123       src0.type == BRW_REGISTER_TYPE_UD ||
1124       src1.type == BRW_REGISTER_TYPE_D ||
1125       src1.type == BRW_REGISTER_TYPE_UD) {
1126      assert(dest.type != BRW_REGISTER_TYPE_F);
1127   }
1128
1129   if (src0.type == BRW_REGISTER_TYPE_F ||
1130       (src0.file == BRW_IMMEDIATE_VALUE &&
1131	src0.type == BRW_REGISTER_TYPE_VF)) {
1132      assert(src1.type != BRW_REGISTER_TYPE_UD);
1133      assert(src1.type != BRW_REGISTER_TYPE_D);
1134   }
1135
1136   if (src1.type == BRW_REGISTER_TYPE_F ||
1137       (src1.file == BRW_IMMEDIATE_VALUE &&
1138	src1.type == BRW_REGISTER_TYPE_VF)) {
1139      assert(src0.type != BRW_REGISTER_TYPE_UD);
1140      assert(src0.type != BRW_REGISTER_TYPE_D);
1141   }
1142
1143   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1144	  src0.nr != BRW_ARF_ACCUMULATOR);
1145   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1146	  src1.nr != BRW_ARF_ACCUMULATOR);
1147
1148   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1149}
1150
1151brw_inst *
1152brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1153         struct brw_reg src0, struct brw_reg src1)
1154{
1155   src0.vstride = BRW_VERTICAL_STRIDE_0;
1156   src0.width = BRW_WIDTH_1;
1157   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1158   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1159}
1160
1161brw_inst *
1162brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1163        struct brw_reg src0, struct brw_reg src1)
1164{
1165   src0.vstride = BRW_VERTICAL_STRIDE_0;
1166   src0.width = BRW_WIDTH_1;
1167   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1168   src1.vstride = BRW_VERTICAL_STRIDE_8;
1169   src1.width = BRW_WIDTH_8;
1170   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1171   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1172}
1173
/**
 * Emit a float-to-half-float conversion of \p src into \p dst.
 *
 * When gen >= 8 the conversion is a MOV to an HF-typed destination;
 * otherwise (gen 7 only, asserted) the F32TO16 opcode is used.  If the
 * destination is UD and zero-filling is required (see below), a second MOV
 * of an immediate zero is emitted after the conversion.
 *
 * In Align16 mode \p dst must be UD; in Align1 mode it may be UD, W, UW or
 * HF.  The default instruction state is saved and restored around the
 * emitted instructions.
 *
 * \return the last instruction emitted, i.e. the zero-fill MOV when one is
 *         needed and the conversion instruction otherwise.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   /* Any change to the default access mode below is undone by the matching
    * brw_pop_insn_state() at the end.
    */
   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Switch to Align1 and view the UD destination as W elements via
       * spread(..., 2).
       * NOTE(review): presumably this makes the conversion write the low
       * word of each dword - confirm against spread().
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Pair the conversion with a MOV of zero to the element at
       * suboffset(dst, 1).  The first instruction gets no_dd_clear and the
       * second no_dd_check.
       * NOTE(review): presumably these let the two writes to the same
       * register proceed without a dependency stall - confirm.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1220
1221brw_inst *
1222brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1223{
1224   const struct gen_device_info *devinfo = p->devinfo;
1225   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
1226
1227   if (align16) {
1228      assert(src.type == BRW_REGISTER_TYPE_UD);
1229   } else {
1230      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1231       *
1232       *   Because this instruction does not have a 16-bit floating-point
1233       *   type, the source data type must be Word (W). The destination type
1234       *   must be F (Float).
1235       */
1236      if (src.type == BRW_REGISTER_TYPE_UD)
1237         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1238
1239      assert(src.type == BRW_REGISTER_TYPE_W ||
1240             src.type == BRW_REGISTER_TYPE_UW ||
1241             src.type == BRW_REGISTER_TYPE_HF);
1242   }
1243
1244   if (devinfo->gen >= 8) {
1245      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1246   } else {
1247      assert(devinfo->gen == 7);
1248      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1249   }
1250}
1251
1252
1253void brw_NOP(struct brw_codegen *p)
1254{
1255   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1256   memset(insn, 0, sizeof(*insn));
1257   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1258}
1259
1260
1261
1262
1263
1264/***********************************************************************
1265 * Comparisons, if/else/endif
1266 */
1267
1268brw_inst *
1269brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1270         unsigned predicate_control)
1271{
1272   const struct gen_device_info *devinfo = p->devinfo;
1273   struct brw_reg ip = brw_ip_reg();
1274   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1275
1276   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
1277   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1278   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1279   brw_inst_set_pred_control(devinfo, inst, predicate_control);
1280
1281   return inst;
1282}
1283
1284static void
1285push_if_stack(struct brw_codegen *p, brw_inst *inst)
1286{
1287   p->if_stack[p->if_stack_depth] = inst - p->store;
1288
1289   p->if_stack_depth++;
1290   if (p->if_stack_array_size <= p->if_stack_depth) {
1291      p->if_stack_array_size *= 2;
1292      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1293			     p->if_stack_array_size);
1294   }
1295}
1296
1297static brw_inst *
1298pop_if_stack(struct brw_codegen *p)
1299{
1300   p->if_stack_depth--;
1301   return &p->store[p->if_stack[p->if_stack_depth]];
1302}
1303
1304static void
1305push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1306{
1307   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1308      p->loop_stack_array_size *= 2;
1309      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1310			       p->loop_stack_array_size);
1311      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1312				     p->loop_stack_array_size);
1313   }
1314
1315   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1316   p->loop_stack_depth++;
1317   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1318}
1319
1320static brw_inst *
1321get_inner_do_insn(struct brw_codegen *p)
1322{
1323   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1324}
1325
1326/* EU takes the value from the flag register and pushes it onto some
1327 * sort of a stack (presumably merging with any flag value already on
1328 * the stack).  Within an if block, the flags at the top of the stack
1329 * control execution on each channel of the unit, eg. on each of the
1330 * 16 pixel values in our wm programs.
1331 *
1332 * When the matching 'else' instruction is reached (presumably by
1333 * countdown of the instruction count patched in by our ELSE/ENDIF
1334 * functions), the relevant flags are inverted.
1335 *
1336 * When the matching 'endif' instruction is reached, the flags are
1337 * popped off.  If the stack is now empty, normal execution resumes.
1338 */
1339brw_inst *
1340brw_IF(struct brw_codegen *p, unsigned execute_size)
1341{
1342   const struct gen_device_info *devinfo = p->devinfo;
1343   brw_inst *insn;
1344
1345   insn = next_insn(p, BRW_OPCODE_IF);
1346
1347   /* Override the defaults for this instruction:
1348    */
1349   if (devinfo->gen < 6) {
1350      brw_set_dest(p, insn, brw_ip_reg());
1351      brw_set_src0(p, insn, brw_ip_reg());
1352      brw_set_src1(p, insn, brw_imm_d(0x0));
1353   } else if (devinfo->gen == 6) {
1354      brw_set_dest(p, insn, brw_imm_w(0));
1355      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1356      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1357      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1358   } else if (devinfo->gen == 7) {
1359      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1360      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1361      brw_set_src1(p, insn, brw_imm_w(0));
1362      brw_inst_set_jip(devinfo, insn, 0);
1363      brw_inst_set_uip(devinfo, insn, 0);
1364   } else {
1365      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1366      brw_set_src0(p, insn, brw_imm_d(0));
1367      brw_inst_set_jip(devinfo, insn, 0);
1368      brw_inst_set_uip(devinfo, insn, 0);
1369   }
1370
1371   brw_inst_set_exec_size(devinfo, insn, execute_size);
1372   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1373   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
1374   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1375   if (!p->single_program_flow && devinfo->gen < 6)
1376      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1377
1378   push_if_stack(p, insn);
1379   p->if_depth_in_loop[p->loop_stack_depth]++;
1380   return insn;
1381}
1382
1383/* This function is only used for gen6-style IF instructions with an
1384 * embedded comparison (conditional modifier).  It is not used on gen7.
1385 */
1386brw_inst *
1387gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1388	struct brw_reg src0, struct brw_reg src1)
1389{
1390   const struct gen_device_info *devinfo = p->devinfo;
1391   brw_inst *insn;
1392
1393   insn = next_insn(p, BRW_OPCODE_IF);
1394
1395   brw_set_dest(p, insn, brw_imm_w(0));
1396   brw_inst_set_exec_size(devinfo, insn,
1397                          brw_inst_exec_size(devinfo, p->current));
1398   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1399   brw_set_src0(p, insn, src0);
1400   brw_set_src1(p, insn, src1);
1401
1402   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1403   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1404   brw_inst_set_cond_modifier(devinfo, insn, conditional);
1405
1406   push_if_stack(p, insn);
1407   return insn;
1408}
1409
1410/**
1411 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1412 */
1413static void
1414convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1415                       brw_inst *if_inst, brw_inst *else_inst)
1416{
1417   const struct gen_device_info *devinfo = p->devinfo;
1418
1419   /* The next instruction (where the ENDIF would be, if it existed) */
1420   brw_inst *next_inst = &p->store[p->nr_insn];
1421
1422   assert(p->single_program_flow);
1423   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1424   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1425   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1426
1427   /* Convert IF to an ADD instruction that moves the instruction pointer
1428    * to the first instruction of the ELSE block.  If there is no ELSE
1429    * block, point to where ENDIF would be.  Reverse the predicate.
1430    *
1431    * There's no need to execute an ENDIF since we don't need to do any
1432    * stack operations, and if we're currently executing, we just want to
1433    * continue normally.
1434    */
1435   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1436   brw_inst_set_pred_inv(devinfo, if_inst, true);
1437
1438   if (else_inst != NULL) {
1439      /* Convert ELSE to an ADD instruction that points where the ENDIF
1440       * would be.
1441       */
1442      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1443
1444      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1445      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1446   } else {
1447      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1448   }
1449}
1450
/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 *
 * \param p           code generator state; only p->devinfo and
 *                    p->single_program_flow are read here.
 * \param if_inst     the IF instruction to patch (must not be NULL).
 * \param else_inst   the matching ELSE, or NULL when the IF has no ELSE
 *                    block.
 * \param endif_inst  the already-emitted ENDIF (must not be NULL).
 *
 * Every jump distance is the difference between two instruction pointers
 * multiplied by brw_jump_scale().  The ENDIF, and the ELSE if present, also
 * inherit the execution size of the IF.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Scale factor applied to every instruction-count distance below. */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* gen7+: both UIP and JIP of the IF point at the ENDIF. */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }
      /* For gen7+ the IF is patched together with the ELSE below. */

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1541
1542void
1543brw_ELSE(struct brw_codegen *p)
1544{
1545   const struct gen_device_info *devinfo = p->devinfo;
1546   brw_inst *insn;
1547
1548   insn = next_insn(p, BRW_OPCODE_ELSE);
1549
1550   if (devinfo->gen < 6) {
1551      brw_set_dest(p, insn, brw_ip_reg());
1552      brw_set_src0(p, insn, brw_ip_reg());
1553      brw_set_src1(p, insn, brw_imm_d(0x0));
1554   } else if (devinfo->gen == 6) {
1555      brw_set_dest(p, insn, brw_imm_w(0));
1556      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1557      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1558      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1559   } else if (devinfo->gen == 7) {
1560      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1561      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1562      brw_set_src1(p, insn, brw_imm_w(0));
1563      brw_inst_set_jip(devinfo, insn, 0);
1564      brw_inst_set_uip(devinfo, insn, 0);
1565   } else {
1566      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1567      brw_set_src0(p, insn, brw_imm_d(0));
1568      brw_inst_set_jip(devinfo, insn, 0);
1569      brw_inst_set_uip(devinfo, insn, 0);
1570   }
1571
1572   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1573   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1574   if (!p->single_program_flow && devinfo->gen < 6)
1575      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1576
1577   push_if_stack(p, insn);
1578}
1579
/**
 * Close the innermost open IF (and optional ELSE) block.
 *
 * Pops the ELSE (if any) and the IF off the if-stack and decrements the
 * if-depth counter of the current loop level.  Then either:
 *  - when gen < 6 and single program flow is enabled, emits no ENDIF and
 *    rewrites the IF/ELSE into ADDs via convert_IF_ELSE_to_ADD(); or
 *  - otherwise emits an ENDIF instruction and patches the jump targets of
 *    the IF/ELSE via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      /* The top entry was an ELSE; the IF it belongs to sits below it. */
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Operand layout of the ENDIF depends on the hardware generation. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }

   /* With the ENDIF in place, fill in the jump targets of the IF/ELSE. */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1660
1661brw_inst *
1662brw_BREAK(struct brw_codegen *p)
1663{
1664   const struct gen_device_info *devinfo = p->devinfo;
1665   brw_inst *insn;
1666
1667   insn = next_insn(p, BRW_OPCODE_BREAK);
1668   if (devinfo->gen >= 8) {
1669      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1670      brw_set_src0(p, insn, brw_imm_d(0x0));
1671   } else if (devinfo->gen >= 6) {
1672      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1673      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1674      brw_set_src1(p, insn, brw_imm_d(0x0));
1675   } else {
1676      brw_set_dest(p, insn, brw_ip_reg());
1677      brw_set_src0(p, insn, brw_ip_reg());
1678      brw_set_src1(p, insn, brw_imm_d(0x0));
1679      brw_inst_set_gen4_pop_count(devinfo, insn,
1680                                  p->if_depth_in_loop[p->loop_stack_depth]);
1681   }
1682   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1683   brw_inst_set_exec_size(devinfo, insn,
1684                          brw_inst_exec_size(devinfo, p->current));
1685
1686   return insn;
1687}
1688
1689brw_inst *
1690brw_CONT(struct brw_codegen *p)
1691{
1692   const struct gen_device_info *devinfo = p->devinfo;
1693   brw_inst *insn;
1694
1695   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1696   brw_set_dest(p, insn, brw_ip_reg());
1697   if (devinfo->gen >= 8) {
1698      brw_set_src0(p, insn, brw_imm_d(0x0));
1699   } else {
1700      brw_set_src0(p, insn, brw_ip_reg());
1701      brw_set_src1(p, insn, brw_imm_d(0x0));
1702   }
1703
1704   if (devinfo->gen < 6) {
1705      brw_inst_set_gen4_pop_count(devinfo, insn,
1706                                  p->if_depth_in_loop[p->loop_stack_depth]);
1707   }
1708   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1709   brw_inst_set_exec_size(devinfo, insn,
1710                          brw_inst_exec_size(devinfo, p->current));
1711   return insn;
1712}
1713
1714brw_inst *
1715gen6_HALT(struct brw_codegen *p)
1716{
1717   const struct gen_device_info *devinfo = p->devinfo;
1718   brw_inst *insn;
1719
1720   insn = next_insn(p, BRW_OPCODE_HALT);
1721   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1722   if (devinfo->gen >= 8) {
1723      brw_set_src0(p, insn, brw_imm_d(0x0));
1724   } else {
1725      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1726      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1727   }
1728
1729   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1730   brw_inst_set_exec_size(devinfo, insn,
1731                          brw_inst_exec_size(devinfo, p->current));
1732   return insn;
1733}
1734
1735/* DO/WHILE loop:
1736 *
1737 * The DO/WHILE is just an unterminated loop -- break or continue are
1738 * used for control within the loop.  We have a few ways they can be
1739 * done.
1740 *
1741 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1742 * jip and no DO instruction.
1743 *
1744 * For non-uniform control flow pre-gen6, there's a DO instruction to
1745 * push the mask, and a WHILE to jump back, and BREAK to get out and
1746 * pop the mask.
1747 *
1748 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1749 * just points back to the first instruction of the loop.
1750 */
1751brw_inst *
1752brw_DO(struct brw_codegen *p, unsigned execute_size)
1753{
1754   const struct gen_device_info *devinfo = p->devinfo;
1755
1756   if (devinfo->gen >= 6 || p->single_program_flow) {
1757      push_loop_stack(p, &p->store[p->nr_insn]);
1758      return &p->store[p->nr_insn];
1759   } else {
1760      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1761
1762      push_loop_stack(p, insn);
1763
1764      /* Override the defaults for this instruction:
1765       */
1766      brw_set_dest(p, insn, brw_null_reg());
1767      brw_set_src0(p, insn, brw_null_reg());
1768      brw_set_src1(p, insn, brw_null_reg());
1769
1770      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1771      brw_inst_set_exec_size(devinfo, insn, execute_size);
1772      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1773
1774      return insn;
1775   }
1776}
1777
1778/**
1779 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1780 * instruction here.
1781 *
1782 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1783 * nesting, since it can always just point to the end of the block/current loop.
1784 */
1785static void
1786brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1787{
1788   const struct gen_device_info *devinfo = p->devinfo;
1789   brw_inst *do_inst = get_inner_do_insn(p);
1790   brw_inst *inst;
1791   unsigned br = brw_jump_scale(devinfo);
1792
1793   assert(devinfo->gen < 6);
1794
1795   for (inst = while_inst - 1; inst != do_inst; inst--) {
1796      /* If the jump count is != 0, that means that this instruction has already
1797       * been patched because it's part of a loop inside of the one we're
1798       * patching.
1799       */
1800      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1801          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1802         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1803      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1804                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1805         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1806      }
1807   }
1808}
1809
1810brw_inst *
1811brw_WHILE(struct brw_codegen *p)
1812{
1813   const struct gen_device_info *devinfo = p->devinfo;
1814   brw_inst *insn, *do_insn;
1815   unsigned br = brw_jump_scale(devinfo);
1816
1817   if (devinfo->gen >= 6) {
1818      insn = next_insn(p, BRW_OPCODE_WHILE);
1819      do_insn = get_inner_do_insn(p);
1820
1821      if (devinfo->gen >= 8) {
1822         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1823         brw_set_src0(p, insn, brw_imm_d(0));
1824         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1825      } else if (devinfo->gen == 7) {
1826         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1827         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1828         brw_set_src1(p, insn, brw_imm_w(0));
1829         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1830      } else {
1831         brw_set_dest(p, insn, brw_imm_w(0));
1832         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1833         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1834         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1835      }
1836
1837      brw_inst_set_exec_size(devinfo, insn,
1838                             brw_inst_exec_size(devinfo, p->current));
1839
1840   } else {
1841      if (p->single_program_flow) {
1842	 insn = next_insn(p, BRW_OPCODE_ADD);
1843         do_insn = get_inner_do_insn(p);
1844
1845	 brw_set_dest(p, insn, brw_ip_reg());
1846	 brw_set_src0(p, insn, brw_ip_reg());
1847	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1848         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1849      } else {
1850	 insn = next_insn(p, BRW_OPCODE_WHILE);
1851         do_insn = get_inner_do_insn(p);
1852
1853         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1854
1855	 brw_set_dest(p, insn, brw_ip_reg());
1856	 brw_set_src0(p, insn, brw_ip_reg());
1857	 brw_set_src1(p, insn, brw_imm_d(0));
1858
1859         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1860         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1861         brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1862
1863	 brw_patch_break_cont(p, insn);
1864      }
1865   }
1866   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1867
1868   p->loop_stack_depth--;
1869
1870   return insn;
1871}
1872
1873/* FORWARD JUMPS:
1874 */
1875void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1876{
1877   const struct gen_device_info *devinfo = p->devinfo;
1878   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1879   unsigned jmpi = 1;
1880
1881   if (devinfo->gen >= 5)
1882      jmpi = 2;
1883
1884   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1885   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1886
1887   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1888                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
1889}
1890
1891/* To integrate with the above, it makes sense that the comparison
1892 * instruction should populate the flag register.  It might be simpler
1893 * just to use the flag reg for most WM tasks?
1894 */
1895void brw_CMP(struct brw_codegen *p,
1896	     struct brw_reg dest,
1897	     unsigned conditional,
1898	     struct brw_reg src0,
1899	     struct brw_reg src1)
1900{
1901   const struct gen_device_info *devinfo = p->devinfo;
1902   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1903
1904   brw_inst_set_cond_modifier(devinfo, insn, conditional);
1905   brw_set_dest(p, insn, dest);
1906   brw_set_src0(p, insn, src0);
1907   brw_set_src1(p, insn, src1);
1908
1909   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1910    * page says:
1911    *    "Any CMP instruction with a null destination must use a {switch}."
1912    *
1913    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1914    * mentioned on their work-arounds pages.
1915    */
1916   if (devinfo->gen == 7) {
1917      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1918          dest.nr == BRW_ARF_NULL) {
1919         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1920      }
1921   }
1922}
1923
1924/***********************************************************************
1925 * Helpers for the various SEND message types:
1926 */
1927
1928/** Extended math function, float[8].
1929 */
1930void gen4_math(struct brw_codegen *p,
1931	       struct brw_reg dest,
1932	       unsigned function,
1933	       unsigned msg_reg_nr,
1934	       struct brw_reg src,
1935	       unsigned precision )
1936{
1937   const struct gen_device_info *devinfo = p->devinfo;
1938   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1939   unsigned data_type;
1940   if (has_scalar_region(src)) {
1941      data_type = BRW_MATH_DATA_SCALAR;
1942   } else {
1943      data_type = BRW_MATH_DATA_VECTOR;
1944   }
1945
1946   assert(devinfo->gen < 6);
1947
1948   /* Example code doesn't set predicate_control for send
1949    * instructions.
1950    */
1951   brw_inst_set_pred_control(devinfo, insn, 0);
1952   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1953
1954   brw_set_dest(p, insn, dest);
1955   brw_set_src0(p, insn, src);
1956   brw_set_math_message(p,
1957                        insn,
1958                        function,
1959                        src.type == BRW_REGISTER_TYPE_D,
1960                        precision,
1961                        data_type);
1962}
1963
1964void gen6_math(struct brw_codegen *p,
1965	       struct brw_reg dest,
1966	       unsigned function,
1967	       struct brw_reg src0,
1968	       struct brw_reg src1)
1969{
1970   const struct gen_device_info *devinfo = p->devinfo;
1971   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1972
1973   assert(devinfo->gen >= 6);
1974
1975   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1976          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1977   assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
1978          (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));
1979
1980   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1981   if (devinfo->gen == 6) {
1982      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1983      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1984   }
1985
1986   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1987       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1988       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1989      assert(src0.type != BRW_REGISTER_TYPE_F);
1990      assert(src1.type != BRW_REGISTER_TYPE_F);
1991      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1992             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1993   } else {
1994      assert(src0.type == BRW_REGISTER_TYPE_F);
1995      assert(src1.type == BRW_REGISTER_TYPE_F);
1996      if (function == BRW_MATH_FUNCTION_POW) {
1997         assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1998                (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1999      } else {
2000         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2001                src1.nr == BRW_ARF_NULL);
2002      }
2003   }
2004
2005   /* Source modifiers are ignored for extended math instructions on Gen6. */
2006   if (devinfo->gen == 6) {
2007      assert(!src0.negate);
2008      assert(!src0.abs);
2009      assert(!src1.negate);
2010      assert(!src1.abs);
2011   }
2012
2013   brw_inst_set_math_function(devinfo, insn, function);
2014
2015   brw_set_dest(p, insn, dest);
2016   brw_set_src0(p, insn, src0);
2017   brw_set_src1(p, insn, src1);
2018}
2019
2020/**
2021 * Return the right surface index to access the thread scratch space using
2022 * stateless dataport messages.
2023 */
2024unsigned
2025brw_scratch_surface_idx(const struct brw_codegen *p)
2026{
2027   /* The scratch space is thread-local so IA coherency is unnecessary. */
2028   if (p->devinfo->gen >= 8)
2029      return GEN8_BTI_STATELESS_NON_COHERENT;
2030   else
2031      return BRW_BTI_STATELESS;
2032}
2033
2034/**
2035 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2036 * using a constant offset per channel.
2037 *
2038 * The offset must be aligned to oword size (16 bytes).  Used for
2039 * register spilling.
2040 */
2041void brw_oword_block_write_scratch(struct brw_codegen *p,
2042				   struct brw_reg mrf,
2043				   int num_regs,
2044				   unsigned offset)
2045{
2046   const struct gen_device_info *devinfo = p->devinfo;
2047   const unsigned target_cache =
2048      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2049       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2050       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2051   uint32_t msg_type;
2052
2053   if (devinfo->gen >= 6)
2054      offset /= 16;
2055
2056   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2057
2058   const unsigned mlen = 1 + num_regs;
2059
2060   /* Set up the message header.  This is g0, with g0.2 filled with
2061    * the offset.  We don't want to leave our offset around in g0 or
2062    * it'll screw up texture samples, so set it up inside the message
2063    * reg.
2064    */
2065   {
2066      brw_push_insn_state(p);
2067      brw_set_default_exec_size(p, BRW_EXECUTE_8);
2068      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2069      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2070
2071      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2072
2073      /* set message header global offset field (reg 0, element 2) */
2074      brw_MOV(p,
2075	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2076				  mrf.nr,
2077				  2), BRW_REGISTER_TYPE_UD),
2078	      brw_imm_ud(offset));
2079
2080      brw_pop_insn_state(p);
2081   }
2082
2083   {
2084      struct brw_reg dest;
2085      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2086      int send_commit_msg;
2087      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2088					 BRW_REGISTER_TYPE_UW);
2089
2090      brw_inst_set_compression(devinfo, insn, false);
2091
2092      if (brw_inst_exec_size(devinfo, insn) >= 16)
2093	 src_header = vec16(src_header);
2094
2095      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2096      if (devinfo->gen < 6)
2097         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2098
2099      /* Until gen6, writes followed by reads from the same location
2100       * are not guaranteed to be ordered unless write_commit is set.
2101       * If set, then a no-op write is issued to the destination
2102       * register to set a dependency, and a read from the destination
2103       * can be used to ensure the ordering.
2104       *
2105       * For gen6, only writes between different threads need ordering
2106       * protection.  Our use of DP writes is all about register
2107       * spilling within a thread.
2108       */
2109      if (devinfo->gen >= 6) {
2110	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2111	 send_commit_msg = 0;
2112      } else {
2113	 dest = src_header;
2114	 send_commit_msg = 1;
2115      }
2116
2117      brw_set_dest(p, insn, dest);
2118      if (devinfo->gen >= 6) {
2119	 brw_set_src0(p, insn, mrf);
2120      } else {
2121	 brw_set_src0(p, insn, brw_null_reg());
2122      }
2123
2124      if (devinfo->gen >= 6)
2125	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2126      else
2127	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2128
2129      brw_set_dp_write_message(p,
2130			       insn,
2131                               brw_scratch_surface_idx(p),
2132			       BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2133			       msg_type,
2134                               target_cache,
2135			       mlen,
2136			       true, /* header_present */
2137			       0, /* not a render target */
2138			       send_commit_msg, /* response_length */
2139			       0, /* eot */
2140			       send_commit_msg);
2141   }
2142}
2143
2144
2145/**
2146 * Read a block of owords (half a GRF each) from the scratch buffer
2147 * using a constant index per channel.
2148 *
2149 * Offset must be aligned to oword size (16 bytes).  Used for register
2150 * spilling.
2151 */
2152void
2153brw_oword_block_read_scratch(struct brw_codegen *p,
2154			     struct brw_reg dest,
2155			     struct brw_reg mrf,
2156			     int num_regs,
2157			     unsigned offset)
2158{
2159   const struct gen_device_info *devinfo = p->devinfo;
2160
2161   if (devinfo->gen >= 6)
2162      offset /= 16;
2163
2164   if (p->devinfo->gen >= 7) {
2165      /* On gen 7 and above, we no longer have message registers and we can
2166       * send from any register we want.  By using the destination register
2167       * for the message, we guarantee that the implied message write won't
2168       * accidentally overwrite anything.  This has been a problem because
2169       * the MRF registers and source for the final FB write are both fixed
2170       * and may overlap.
2171       */
2172      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2173   } else {
2174      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2175   }
2176   dest = retype(dest, BRW_REGISTER_TYPE_UW);
2177
2178   const unsigned rlen = num_regs;
2179   const unsigned target_cache =
2180      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2181       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2182       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2183
2184   {
2185      brw_push_insn_state(p);
2186      brw_set_default_exec_size(p, BRW_EXECUTE_8);
2187      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2188      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2189
2190      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2191
2192      /* set message header global offset field (reg 0, element 2) */
2193      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2194
2195      brw_pop_insn_state(p);
2196   }
2197
2198   {
2199      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2200
2201      assert(brw_inst_pred_control(devinfo, insn) == 0);
2202      brw_inst_set_compression(devinfo, insn, false);
2203
2204      brw_set_dest(p, insn, dest);	/* UW? */
2205      if (devinfo->gen >= 6) {
2206	 brw_set_src0(p, insn, mrf);
2207      } else {
2208	 brw_set_src0(p, insn, brw_null_reg());
2209         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2210      }
2211
2212      brw_set_dp_read_message(p,
2213			      insn,
2214                              brw_scratch_surface_idx(p),
2215			      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2216			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2217			      target_cache,
2218			      1, /* msg_length */
2219                              true, /* header_present */
2220			      rlen);
2221   }
2222}
2223
2224void
2225gen7_block_read_scratch(struct brw_codegen *p,
2226                        struct brw_reg dest,
2227                        int num_regs,
2228                        unsigned offset)
2229{
2230   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2231   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2232
2233   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2234
2235   /* The HW requires that the header is present; this is to get the g0.5
2236    * scratch offset.
2237    */
2238   brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2239
2240   /* According to the docs, offset is "A 12-bit HWord offset into the memory
2241    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2242    * is 32 bytes, which happens to be the size of a register.
2243    */
2244   offset /= REG_SIZE;
2245   assert(offset < (1 << 12));
2246
2247   gen7_set_dp_scratch_message(p, insn,
2248                               false, /* scratch read */
2249                               false, /* OWords */
2250                               false, /* invalidate after read */
2251                               num_regs,
2252                               offset,
2253                               1,        /* mlen: just g0 */
2254                               num_regs, /* rlen */
2255                               true);    /* header present */
2256}
2257
2258/**
2259 * Read float[4] vectors from the data port constant cache.
2260 * Location (in buffer) should be a multiple of 16.
2261 * Used for fetching shader constants.
2262 */
2263void brw_oword_block_read(struct brw_codegen *p,
2264			  struct brw_reg dest,
2265			  struct brw_reg mrf,
2266			  uint32_t offset,
2267			  uint32_t bind_table_index)
2268{
2269   const struct gen_device_info *devinfo = p->devinfo;
2270   const unsigned target_cache =
2271      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2272       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
2273   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
2274
2275   /* On newer hardware, offset is in units of owords. */
2276   if (devinfo->gen >= 6)
2277      offset /= 16;
2278
2279   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2280
2281   brw_push_insn_state(p);
2282   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2283   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2284   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2285
2286   brw_push_insn_state(p);
2287   brw_set_default_exec_size(p, BRW_EXECUTE_8);
2288   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2289
2290   /* set message header global offset field (reg 0, element 2) */
2291   brw_MOV(p,
2292	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2293			       mrf.nr,
2294			       2), BRW_REGISTER_TYPE_UD),
2295	   brw_imm_ud(offset));
2296   brw_pop_insn_state(p);
2297
2298   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2299
2300   /* cast dest to a uword[8] vector */
2301   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2302
2303   brw_set_dest(p, insn, dest);
2304   if (devinfo->gen >= 6) {
2305      brw_set_src0(p, insn, mrf);
2306   } else {
2307      brw_set_src0(p, insn, brw_null_reg());
2308      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2309   }
2310
2311   brw_set_dp_read_message(p, insn, bind_table_index,
2312                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2313			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2314			   target_cache,
2315			   1, /* msg_length */
2316                           true, /* header_present */
2317			   DIV_ROUND_UP(exec_size, 8)); /* response_length */
2318
2319   brw_pop_insn_state(p);
2320}
2321
2322
2323void brw_fb_WRITE(struct brw_codegen *p,
2324                  struct brw_reg payload,
2325                  struct brw_reg implied_header,
2326                  unsigned msg_control,
2327                  unsigned binding_table_index,
2328                  unsigned msg_length,
2329                  unsigned response_length,
2330                  bool eot,
2331                  bool last_render_target,
2332                  bool header_present)
2333{
2334   const struct gen_device_info *devinfo = p->devinfo;
2335   const unsigned target_cache =
2336      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2337       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2338   brw_inst *insn;
2339   unsigned msg_type;
2340   struct brw_reg dest, src0;
2341
2342   if (brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16)
2343      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2344   else
2345      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2346
2347   if (devinfo->gen >= 6) {
2348      insn = next_insn(p, BRW_OPCODE_SENDC);
2349   } else {
2350      insn = next_insn(p, BRW_OPCODE_SEND);
2351   }
2352   brw_inst_set_compression(devinfo, insn, false);
2353
2354   if (devinfo->gen >= 6) {
2355      /* headerless version, just submit color payload */
2356      src0 = payload;
2357
2358      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2359   } else {
2360      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2361      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2362      src0 = implied_header;
2363
2364      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2365   }
2366
2367   brw_set_dest(p, insn, dest);
2368   brw_set_src0(p, insn, src0);
2369   brw_set_dp_write_message(p,
2370			    insn,
2371			    binding_table_index,
2372			    msg_control,
2373			    msg_type,
2374                            target_cache,
2375			    msg_length,
2376			    header_present,
2377			    last_render_target,
2378			    response_length,
2379			    eot,
2380			    0 /* send_commit_msg */);
2381}
2382
2383brw_inst *
2384gen9_fb_READ(struct brw_codegen *p,
2385             struct brw_reg dst,
2386             struct brw_reg payload,
2387             unsigned binding_table_index,
2388             unsigned msg_length,
2389             unsigned response_length,
2390             bool per_sample)
2391{
2392   const struct gen_device_info *devinfo = p->devinfo;
2393   assert(devinfo->gen >= 9);
2394   const unsigned msg_subtype =
2395      brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16 ? 0 : 1;
2396   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2397
2398   brw_set_dest(p, insn, dst);
2399   brw_set_src0(p, insn, payload);
2400   brw_set_dp_read_message(p, insn, binding_table_index,
2401                           per_sample << 5 | msg_subtype,
2402                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2403                           GEN6_SFID_DATAPORT_RENDER_CACHE,
2404                           msg_length, true /* header_present */,
2405                           response_length);
2406   brw_inst_set_rt_slot_group(devinfo, insn,
2407                              brw_inst_qtr_control(devinfo, p->current) / 2);
2408
2409   return insn;
2410}
2411
2412/**
2413 * Texture sample instruction.
2414 * Note: the msg_type plus msg_length values determine exactly what kind
2415 * of sampling operation is performed.  See volume 4, page 161 of docs.
2416 */
2417void brw_SAMPLE(struct brw_codegen *p,
2418		struct brw_reg dest,
2419		unsigned msg_reg_nr,
2420		struct brw_reg src0,
2421		unsigned binding_table_index,
2422		unsigned sampler,
2423		unsigned msg_type,
2424		unsigned response_length,
2425		unsigned msg_length,
2426		unsigned header_present,
2427		unsigned simd_mode,
2428		unsigned return_format)
2429{
2430   const struct gen_device_info *devinfo = p->devinfo;
2431   brw_inst *insn;
2432
2433   if (msg_reg_nr != -1)
2434      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2435
2436   insn = next_insn(p, BRW_OPCODE_SEND);
2437   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2438
2439   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2440    *
2441    *    "Instruction compression is not allowed for this instruction (that
2442    *     is, send). The hardware behavior is undefined if this instruction is
2443    *     set as compressed. However, compress control can be set to "SecHalf"
2444    *     to affect the EMask generation."
2445    *
2446    * No similar wording is found in later PRMs, but there are examples
2447    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2448    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2449    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2450    */
2451   brw_inst_set_compression(devinfo, insn, false);
2452
2453   if (devinfo->gen < 6)
2454      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2455
2456   brw_set_dest(p, insn, dest);
2457   brw_set_src0(p, insn, src0);
2458   brw_set_sampler_message(p, insn,
2459                           binding_table_index,
2460                           sampler,
2461                           msg_type,
2462                           response_length,
2463                           msg_length,
2464                           header_present,
2465                           simd_mode,
2466                           return_format);
2467}
2468
2469/* Adjust the message header's sampler state pointer to
2470 * select the correct group of 16 samplers.
2471 */
2472void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2473                                      struct brw_reg header,
2474                                      struct brw_reg sampler_index)
2475{
2476   /* The "Sampler Index" field can only store values between 0 and 15.
2477    * However, we can add an offset to the "Sampler State Pointer"
2478    * field, effectively selecting a different set of 16 samplers.
2479    *
2480    * The "Sampler State Pointer" needs to be aligned to a 32-byte
2481    * offset, and each sampler state is only 16-bytes, so we can't
2482    * exclusively use the offset - we have to use both.
2483    */
2484
2485   const struct gen_device_info *devinfo = p->devinfo;
2486
2487   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2488      const int sampler_state_size = 16; /* 16 bytes */
2489      uint32_t sampler = sampler_index.ud;
2490
2491      if (sampler >= 16) {
2492         assert(devinfo->is_haswell || devinfo->gen >= 8);
2493         brw_ADD(p,
2494                 get_element_ud(header, 3),
2495                 get_element_ud(brw_vec8_grf(0, 0), 3),
2496                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2497      }
2498   } else {
2499      /* Non-const sampler array indexing case */
2500      if (devinfo->gen < 8 && !devinfo->is_haswell) {
2501         return;
2502      }
2503
2504      struct brw_reg temp = get_element_ud(header, 3);
2505
2506      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2507      brw_SHL(p, temp, temp, brw_imm_ud(4));
2508      brw_ADD(p,
2509              get_element_ud(header, 3),
2510              get_element_ud(brw_vec8_grf(0, 0), 3),
2511              temp);
2512   }
2513}
2514
2515/* All these variables are pretty confusing - we might be better off
2516 * using bitmasks and macros for this, in the old style.  Or perhaps
2517 * just having the caller instantiate the fields in dword3 itself.
2518 */
2519void brw_urb_WRITE(struct brw_codegen *p,
2520		   struct brw_reg dest,
2521		   unsigned msg_reg_nr,
2522		   struct brw_reg src0,
2523                   enum brw_urb_write_flags flags,
2524		   unsigned msg_length,
2525		   unsigned response_length,
2526		   unsigned offset,
2527		   unsigned swizzle)
2528{
2529   const struct gen_device_info *devinfo = p->devinfo;
2530   brw_inst *insn;
2531
2532   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2533
2534   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2535      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2536      brw_push_insn_state(p);
2537      brw_set_default_access_mode(p, BRW_ALIGN_1);
2538      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2539      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2540		       BRW_REGISTER_TYPE_UD),
2541	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2542		brw_imm_ud(0xff00));
2543      brw_pop_insn_state(p);
2544   }
2545
2546   insn = next_insn(p, BRW_OPCODE_SEND);
2547
2548   assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2549
2550   brw_set_dest(p, insn, dest);
2551   brw_set_src0(p, insn, src0);
2552   brw_set_src1(p, insn, brw_imm_d(0));
2553
2554   if (devinfo->gen < 6)
2555      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2556
2557   brw_set_urb_message(p,
2558		       insn,
2559		       flags,
2560		       msg_length,
2561		       response_length,
2562		       offset,
2563		       swizzle);
2564}
2565
2566struct brw_inst *
2567brw_send_indirect_message(struct brw_codegen *p,
2568                          unsigned sfid,
2569                          struct brw_reg dst,
2570                          struct brw_reg payload,
2571                          struct brw_reg desc)
2572{
2573   const struct gen_device_info *devinfo = p->devinfo;
2574   struct brw_inst *send;
2575   int setup;
2576
2577   dst = retype(dst, BRW_REGISTER_TYPE_UW);
2578
2579   assert(desc.type == BRW_REGISTER_TYPE_UD);
2580
2581   /* We hold on to the setup instruction (the SEND in the direct case, the OR
2582    * in the indirect case) by its index in the instruction store.  The
2583    * pointer returned by next_insn() may become invalid if emitting the SEND
2584    * in the indirect case reallocs the store.
2585    */
2586
2587   if (desc.file == BRW_IMMEDIATE_VALUE) {
2588      setup = p->nr_insn;
2589      send = next_insn(p, BRW_OPCODE_SEND);
2590      brw_set_src1(p, send, desc);
2591
2592   } else {
2593      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2594
2595      brw_push_insn_state(p);
2596      brw_set_default_access_mode(p, BRW_ALIGN_1);
2597      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2598      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2599
2600      /* Load the indirect descriptor to an address register using OR so the
2601       * caller can specify additional descriptor bits with the usual
2602       * brw_set_*_message() helper functions.
2603       */
2604      setup = p->nr_insn;
2605      brw_OR(p, addr, desc, brw_imm_ud(0));
2606
2607      brw_pop_insn_state(p);
2608
2609      send = next_insn(p, BRW_OPCODE_SEND);
2610      brw_set_src1(p, send, addr);
2611   }
2612
2613   if (dst.width < BRW_EXECUTE_8)
2614      brw_inst_set_exec_size(devinfo, send, dst.width);
2615
2616   brw_set_dest(p, send, dst);
2617   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2618   brw_inst_set_sfid(devinfo, send, sfid);
2619
2620   return &p->store[setup];
2621}
2622
2623static struct brw_inst *
2624brw_send_indirect_surface_message(struct brw_codegen *p,
2625                                  unsigned sfid,
2626                                  struct brw_reg dst,
2627                                  struct brw_reg payload,
2628                                  struct brw_reg surface,
2629                                  unsigned message_len,
2630                                  unsigned response_len,
2631                                  bool header_present)
2632{
2633   const struct gen_device_info *devinfo = p->devinfo;
2634   struct brw_inst *insn;
2635
2636   if (surface.file != BRW_IMMEDIATE_VALUE) {
2637      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2638
2639      brw_push_insn_state(p);
2640      brw_set_default_access_mode(p, BRW_ALIGN_1);
2641      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2642      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2643
2644      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2645       * some surface array is accessed out of bounds.
2646       */
2647      insn = brw_AND(p, addr,
2648                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2649                               BRW_GET_SWZ(surface.swizzle, 0)),
2650                     brw_imm_ud(0xff));
2651
2652      brw_pop_insn_state(p);
2653
2654      surface = addr;
2655   }
2656
2657   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2658   brw_inst_set_mlen(devinfo, insn, message_len);
2659   brw_inst_set_rlen(devinfo, insn, response_len);
2660   brw_inst_set_header_present(devinfo, insn, header_present);
2661
2662   return insn;
2663}
2664
2665static bool
2666while_jumps_before_offset(const struct gen_device_info *devinfo,
2667                          brw_inst *insn, int while_offset, int start_offset)
2668{
2669   int scale = 16 / brw_jump_scale(devinfo);
2670   int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2671                               : brw_inst_jip(devinfo, insn);
2672   return while_offset + jip * scale <= start_offset;
2673}
2674
2675
2676static int
2677brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2678{
2679   int offset;
2680   void *store = p->store;
2681   const struct gen_device_info *devinfo = p->devinfo;
2682
2683   int depth = 0;
2684
2685   for (offset = next_offset(devinfo, store, start_offset);
2686        offset < p->next_insn_offset;
2687        offset = next_offset(devinfo, store, offset)) {
2688      brw_inst *insn = store + offset;
2689
2690      switch (brw_inst_opcode(devinfo, insn)) {
2691      case BRW_OPCODE_IF:
2692         depth++;
2693         break;
2694      case BRW_OPCODE_ENDIF:
2695         if (depth == 0)
2696            return offset;
2697         depth--;
2698         break;
2699      case BRW_OPCODE_WHILE:
2700         /* If the while doesn't jump before our instruction, it's the end
2701          * of a sibling do...while loop.  Ignore it.
2702          */
2703         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2704            continue;
2705         /* fallthrough */
2706      case BRW_OPCODE_ELSE:
2707      case BRW_OPCODE_HALT:
2708         if (depth == 0)
2709            return offset;
2710      }
2711   }
2712
2713   return 0;
2714}
2715
2716/* There is no DO instruction on gen6, so to find the end of the loop
2717 * we have to see if the loop is jumping back before our start
2718 * instruction.
2719 */
2720static int
2721brw_find_loop_end(struct brw_codegen *p, int start_offset)
2722{
2723   const struct gen_device_info *devinfo = p->devinfo;
2724   int offset;
2725   void *store = p->store;
2726
2727   assert(devinfo->gen >= 6);
2728
2729   /* Always start after the instruction (such as a WHILE) we're trying to fix
2730    * up.
2731    */
2732   for (offset = next_offset(devinfo, store, start_offset);
2733        offset < p->next_insn_offset;
2734        offset = next_offset(devinfo, store, offset)) {
2735      brw_inst *insn = store + offset;
2736
2737      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2738	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2739	    return offset;
2740      }
2741   }
2742   assert(!"not reached");
2743   return start_offset;
2744}
2745
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * ENDIF instructions are patched here as well: their JIP (gen7+) or gen6
 * jump count is pointed at the next block end.
 *
 * Walks every instruction from start_offset to p->next_insn_offset in
 * 16-byte steps and asserts that none of them is compacted.  Does nothing
 * on hardware older than gen6.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   /* Divisor that converts a difference of store offsets into the value
    * written to the JIP/UIP fields.
    */
   int scale = 16 / br;
   void *store = p->store;

   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* The fixed 16-byte stride is only valid for uncompacted code. */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      /* 0 means no block end was found before the end of the program. */
      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         /* JIP goes to the end of the enclosing block, UIP to the WHILE
          * that closes the loop.
          */
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;

      case BRW_OPCODE_ENDIF: {
         /* With no further block end, the jump value is 1 * br. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }
   }
}
2818
2819void brw_ff_sync(struct brw_codegen *p,
2820		   struct brw_reg dest,
2821		   unsigned msg_reg_nr,
2822		   struct brw_reg src0,
2823		   bool allocate,
2824		   unsigned response_length,
2825		   bool eot)
2826{
2827   const struct gen_device_info *devinfo = p->devinfo;
2828   brw_inst *insn;
2829
2830   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2831
2832   insn = next_insn(p, BRW_OPCODE_SEND);
2833   brw_set_dest(p, insn, dest);
2834   brw_set_src0(p, insn, src0);
2835   brw_set_src1(p, insn, brw_imm_d(0));
2836
2837   if (devinfo->gen < 6)
2838      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2839
2840   brw_set_ff_sync_message(p,
2841			   insn,
2842			   allocate,
2843			   response_length,
2844			   eot);
2845}
2846
2847/**
2848 * Emit the SEND instruction necessary to generate stream output data on Gen6
2849 * (for transform feedback).
2850 *
2851 * If send_commit_msg is true, this is the last piece of stream output data
2852 * from this thread, so send the data as a committed write.  According to the
2853 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2854 *
2855 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2856 *   writes are complete by sending the final write as a committed write."
2857 */
2858void
2859brw_svb_write(struct brw_codegen *p,
2860              struct brw_reg dest,
2861              unsigned msg_reg_nr,
2862              struct brw_reg src0,
2863              unsigned binding_table_index,
2864              bool   send_commit_msg)
2865{
2866   const struct gen_device_info *devinfo = p->devinfo;
2867   const unsigned target_cache =
2868      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2869       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2870       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
2871   brw_inst *insn;
2872
2873   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2874
2875   insn = next_insn(p, BRW_OPCODE_SEND);
2876   brw_set_dest(p, insn, dest);
2877   brw_set_src0(p, insn, src0);
2878   brw_set_src1(p, insn, brw_imm_d(0));
2879   brw_set_dp_write_message(p, insn,
2880                            binding_table_index,
2881                            0, /* msg_control: ignored */
2882                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2883                            target_cache,
2884                            1, /* msg_length */
2885                            true, /* header_present */
2886                            0, /* last_render_target: ignored */
2887                            send_commit_msg, /* response_length */
2888                            0, /* end_of_thread */
2889                            send_commit_msg); /* send_commit_msg */
2890}
2891
2892static unsigned
2893brw_surface_payload_size(struct brw_codegen *p,
2894                         unsigned num_channels,
2895                         bool has_simd4x2,
2896                         bool has_simd16)
2897{
2898   if (has_simd4x2 &&
2899       brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2900      return 1;
2901   else if (has_simd16 &&
2902            brw_inst_exec_size(p->devinfo, p->current) == BRW_EXECUTE_16)
2903      return 2 * num_channels;
2904   else
2905      return num_channels;
2906}
2907
2908static void
2909brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2910                                  brw_inst *insn,
2911                                  unsigned atomic_op,
2912                                  bool response_expected)
2913{
2914   const struct gen_device_info *devinfo = p->devinfo;
2915   unsigned msg_control =
2916      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2917      (response_expected ? 1 << 5 : 0); /* Return data expected */
2918
2919   if (devinfo->gen >= 8 || devinfo->is_haswell) {
2920      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2921         if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2922            msg_control |= 1 << 4; /* SIMD8 mode */
2923
2924         brw_inst_set_dp_msg_type(devinfo, insn,
2925                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2926      } else {
2927         brw_inst_set_dp_msg_type(devinfo, insn,
2928            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2929      }
2930   } else {
2931      brw_inst_set_dp_msg_type(devinfo, insn,
2932                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2933
2934      if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
2935         msg_control |= 1 << 4; /* SIMD8 mode */
2936   }
2937
2938   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2939}
2940
2941void
2942brw_untyped_atomic(struct brw_codegen *p,
2943                   struct brw_reg dst,
2944                   struct brw_reg payload,
2945                   struct brw_reg surface,
2946                   unsigned atomic_op,
2947                   unsigned msg_length,
2948                   bool response_expected)
2949{
2950   const struct gen_device_info *devinfo = p->devinfo;
2951   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2952                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
2953                          GEN7_SFID_DATAPORT_DATA_CACHE);
2954   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2955   /* Mask out unused components -- This is especially important in Align16
2956    * mode on generations that don't have native support for SIMD4x2 atomics,
2957    * because unused but enabled components will cause the dataport to perform
2958    * additional atomic operations on the addresses that happen to be in the
2959    * uninitialized Y, Z and W coordinates of the payload.
2960    */
2961   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
2962   struct brw_inst *insn = brw_send_indirect_surface_message(
2963      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
2964      brw_surface_payload_size(p, response_expected,
2965                               devinfo->gen >= 8 || devinfo->is_haswell, true),
2966      align1);
2967
2968   brw_set_dp_untyped_atomic_message(
2969      p, insn, atomic_op, response_expected);
2970}
2971
2972static void
2973brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2974                                        struct brw_inst *insn,
2975                                        unsigned num_channels)
2976{
2977   const struct gen_device_info *devinfo = p->devinfo;
2978   /* Set mask of 32-bit channels to drop. */
2979   unsigned msg_control = 0xf & (0xf << num_channels);
2980
2981   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2982      if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
2983         msg_control |= 1 << 4; /* SIMD16 mode */
2984      else
2985         msg_control |= 2 << 4; /* SIMD8 mode */
2986   }
2987
2988   brw_inst_set_dp_msg_type(devinfo, insn,
2989                            (devinfo->gen >= 8 || devinfo->is_haswell ?
2990                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2991                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2992   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2993}
2994
2995void
2996brw_untyped_surface_read(struct brw_codegen *p,
2997                         struct brw_reg dst,
2998                         struct brw_reg payload,
2999                         struct brw_reg surface,
3000                         unsigned msg_length,
3001                         unsigned num_channels)
3002{
3003   const struct gen_device_info *devinfo = p->devinfo;
3004   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3005                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3006                          GEN7_SFID_DATAPORT_DATA_CACHE);
3007   struct brw_inst *insn = brw_send_indirect_surface_message(
3008      p, sfid, dst, payload, surface, msg_length,
3009      brw_surface_payload_size(p, num_channels, true, true),
3010      false);
3011
3012   brw_set_dp_untyped_surface_read_message(
3013      p, insn, num_channels);
3014}
3015
3016static void
3017brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
3018                                         struct brw_inst *insn,
3019                                         unsigned num_channels)
3020{
3021   const struct gen_device_info *devinfo = p->devinfo;
3022   /* Set mask of 32-bit channels to drop. */
3023   unsigned msg_control = 0xf & (0xf << num_channels);
3024
3025   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3026      if (brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
3027         msg_control |= 1 << 4; /* SIMD16 mode */
3028      else
3029         msg_control |= 2 << 4; /* SIMD8 mode */
3030   } else {
3031      if (devinfo->gen >= 8 || devinfo->is_haswell)
3032         msg_control |= 0 << 4; /* SIMD4x2 mode */
3033      else
3034         msg_control |= 2 << 4; /* SIMD8 mode */
3035   }
3036
3037   brw_inst_set_dp_msg_type(devinfo, insn,
3038                            devinfo->gen >= 8 || devinfo->is_haswell ?
3039                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
3040                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
3041   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3042}
3043
3044void
3045brw_untyped_surface_write(struct brw_codegen *p,
3046                          struct brw_reg payload,
3047                          struct brw_reg surface,
3048                          unsigned msg_length,
3049                          unsigned num_channels)
3050{
3051   const struct gen_device_info *devinfo = p->devinfo;
3052   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3053                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3054                          GEN7_SFID_DATAPORT_DATA_CACHE);
3055   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3056   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3057   const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3058                          WRITEMASK_X : WRITEMASK_XYZW;
3059   struct brw_inst *insn = brw_send_indirect_surface_message(
3060      p, sfid, brw_writemask(brw_null_reg(), mask),
3061      payload, surface, msg_length, 0, align1);
3062
3063   brw_set_dp_untyped_surface_write_message(
3064      p, insn, num_channels);
3065}
3066
3067static void
3068brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3069                                struct brw_inst *insn,
3070                                unsigned atomic_op,
3071                                bool response_expected)
3072{
3073   const struct gen_device_info *devinfo = p->devinfo;
3074   unsigned msg_control =
3075      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3076      (response_expected ? 1 << 5 : 0); /* Return data expected */
3077
3078   if (devinfo->gen >= 8 || devinfo->is_haswell) {
3079      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3080         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3081            msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3082
3083         brw_inst_set_dp_msg_type(devinfo, insn,
3084                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3085      } else {
3086         brw_inst_set_dp_msg_type(devinfo, insn,
3087                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3088      }
3089
3090   } else {
3091      brw_inst_set_dp_msg_type(devinfo, insn,
3092                               GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3093
3094      if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3095         msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3096   }
3097
3098   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3099}
3100
3101void
3102brw_typed_atomic(struct brw_codegen *p,
3103                 struct brw_reg dst,
3104                 struct brw_reg payload,
3105                 struct brw_reg surface,
3106                 unsigned atomic_op,
3107                 unsigned msg_length,
3108                 bool response_expected) {
3109   const struct gen_device_info *devinfo = p->devinfo;
3110   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3111                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3112                          GEN6_SFID_DATAPORT_RENDER_CACHE);
3113   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3114   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3115   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3116   struct brw_inst *insn = brw_send_indirect_surface_message(
3117      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
3118      brw_surface_payload_size(p, response_expected,
3119                               devinfo->gen >= 8 || devinfo->is_haswell, false),
3120      true);
3121
3122   brw_set_dp_typed_atomic_message(
3123      p, insn, atomic_op, response_expected);
3124}
3125
3126static void
3127brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3128                                      struct brw_inst *insn,
3129                                      unsigned num_channels)
3130{
3131   const struct gen_device_info *devinfo = p->devinfo;
3132   /* Set mask of unused channels. */
3133   unsigned msg_control = 0xf & (0xf << num_channels);
3134
3135   if (devinfo->gen >= 8 || devinfo->is_haswell) {
3136      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3137         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3138            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3139         else
3140            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3141      }
3142
3143      brw_inst_set_dp_msg_type(devinfo, insn,
3144                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3145   } else {
3146      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3147         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3148            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3149      }
3150
3151      brw_inst_set_dp_msg_type(devinfo, insn,
3152                               GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3153   }
3154
3155   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3156}
3157
3158void
3159brw_typed_surface_read(struct brw_codegen *p,
3160                       struct brw_reg dst,
3161                       struct brw_reg payload,
3162                       struct brw_reg surface,
3163                       unsigned msg_length,
3164                       unsigned num_channels)
3165{
3166   const struct gen_device_info *devinfo = p->devinfo;
3167   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3168                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3169                          GEN6_SFID_DATAPORT_RENDER_CACHE);
3170   struct brw_inst *insn = brw_send_indirect_surface_message(
3171      p, sfid, dst, payload, surface, msg_length,
3172      brw_surface_payload_size(p, num_channels,
3173                               devinfo->gen >= 8 || devinfo->is_haswell, false),
3174      true);
3175
3176   brw_set_dp_typed_surface_read_message(
3177      p, insn, num_channels);
3178}
3179
3180static void
3181brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3182                                       struct brw_inst *insn,
3183                                       unsigned num_channels)
3184{
3185   const struct gen_device_info *devinfo = p->devinfo;
3186   /* Set mask of unused channels. */
3187   unsigned msg_control = 0xf & (0xf << num_channels);
3188
3189   if (devinfo->gen >= 8 || devinfo->is_haswell) {
3190      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3191         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3192            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3193         else
3194            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3195      }
3196
3197      brw_inst_set_dp_msg_type(devinfo, insn,
3198                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3199
3200   } else {
3201      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3202         if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
3203            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3204      }
3205
3206      brw_inst_set_dp_msg_type(devinfo, insn,
3207                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3208   }
3209
3210   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3211}
3212
3213void
3214brw_typed_surface_write(struct brw_codegen *p,
3215                        struct brw_reg payload,
3216                        struct brw_reg surface,
3217                        unsigned msg_length,
3218                        unsigned num_channels)
3219{
3220   const struct gen_device_info *devinfo = p->devinfo;
3221   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3222                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3223                          GEN6_SFID_DATAPORT_RENDER_CACHE);
3224   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3225   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3226   const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3227                          WRITEMASK_X : WRITEMASK_XYZW);
3228   struct brw_inst *insn = brw_send_indirect_surface_message(
3229      p, sfid, brw_writemask(brw_null_reg(), mask),
3230      payload, surface, msg_length, 0, true);
3231
3232   brw_set_dp_typed_surface_write_message(
3233      p, insn, num_channels);
3234}
3235
3236static void
3237brw_set_memory_fence_message(struct brw_codegen *p,
3238                             struct brw_inst *insn,
3239                             enum brw_message_target sfid,
3240                             bool commit_enable)
3241{
3242   const struct gen_device_info *devinfo = p->devinfo;
3243
3244   brw_set_message_descriptor(p, insn, sfid,
3245                              1 /* message length */,
3246                              (commit_enable ? 1 : 0) /* response length */,
3247                              true /* header present */,
3248                              false);
3249
3250   switch (sfid) {
3251   case GEN6_SFID_DATAPORT_RENDER_CACHE:
3252      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3253      break;
3254   case GEN7_SFID_DATAPORT_DATA_CACHE:
3255      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3256      break;
3257   default:
3258      unreachable("Not reached");
3259   }
3260
3261   if (commit_enable)
3262      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3263}
3264
3265void
3266brw_memory_fence(struct brw_codegen *p,
3267                 struct brw_reg dst)
3268{
3269   const struct gen_device_info *devinfo = p->devinfo;
3270   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
3271   struct brw_inst *insn;
3272
3273   brw_push_insn_state(p);
3274   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3275   brw_set_default_exec_size(p, BRW_EXECUTE_1);
3276   dst = vec1(dst);
3277
3278   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3279    * message doesn't write anything back.
3280    */
3281   insn = next_insn(p, BRW_OPCODE_SEND);
3282   dst = retype(dst, BRW_REGISTER_TYPE_UW);
3283   brw_set_dest(p, insn, dst);
3284   brw_set_src0(p, insn, dst);
3285   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
3286                                commit_enable);
3287
3288   if (devinfo->gen == 7 && !devinfo->is_haswell) {
3289      /* IVB does typed surface access through the render cache, so we need to
3290       * flush it too.  Use a different register so both flushes can be
3291       * pipelined by the hardware.
3292       */
3293      insn = next_insn(p, BRW_OPCODE_SEND);
3294      brw_set_dest(p, insn, offset(dst, 1));
3295      brw_set_src0(p, insn, offset(dst, 1));
3296      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
3297                                   commit_enable);
3298
3299      /* Now write the response of the second message into the response of the
3300       * first to trigger a pipeline stall -- This way future render and data
3301       * cache messages will be properly ordered with respect to past data and
3302       * render cache messages.
3303       */
3304      brw_MOV(p, dst, offset(dst, 1));
3305   }
3306
3307   brw_pop_insn_state(p);
3308}
3309
3310void
3311brw_pixel_interpolator_query(struct brw_codegen *p,
3312                             struct brw_reg dest,
3313                             struct brw_reg mrf,
3314                             bool noperspective,
3315                             unsigned mode,
3316                             struct brw_reg data,
3317                             unsigned msg_length,
3318                             unsigned response_length)
3319{
3320   const struct gen_device_info *devinfo = p->devinfo;
3321   struct brw_inst *insn;
3322   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3323
3324   /* brw_send_indirect_message will automatically use a direct send message
3325    * if data is actually immediate.
3326    */
3327   insn = brw_send_indirect_message(p,
3328                                    GEN7_SFID_PIXEL_INTERPOLATOR,
3329                                    dest,
3330                                    mrf,
3331                                    vec1(data));
3332   brw_inst_set_mlen(devinfo, insn, msg_length);
3333   brw_inst_set_rlen(devinfo, insn, response_length);
3334
3335   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3336   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3337   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3338   brw_inst_set_pi_message_type(devinfo, insn, mode);
3339}
3340
3341void
3342brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3343                      struct brw_reg mask)
3344{
3345   const struct gen_device_info *devinfo = p->devinfo;
3346   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
3347   const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
3348   brw_inst *inst;
3349
3350   assert(devinfo->gen >= 7);
3351   assert(mask.type == BRW_REGISTER_TYPE_UD);
3352
3353   brw_push_insn_state(p);
3354
3355   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3356      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3357
3358      if (devinfo->gen >= 8) {
3359         /* Getting the first active channel index is easy on Gen8: Just find
3360          * the first bit set in the execution mask.  The register exists on
3361          * HSW already but it reads back as all ones when the current
3362          * instruction has execution masking disabled, so it's kind of
3363          * useless.
3364          */
3365         struct brw_reg exec_mask =
3366            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3367
3368         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3369            /* Unfortunately, ce0 does not take into account the thread
3370             * dispatch mask, which may be a problem in cases where it's not
3371             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3372             * some n).  Combine ce0 with the given dispatch (or vector) mask
3373             * to mask off those channels which were never dispatched by the
3374             * hardware.
3375             */
3376            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
3377            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
3378            exec_mask = vec1(dst);
3379         }
3380
3381         /* Quarter control has the effect of magically shifting the value of
3382          * ce0 so you'll get the first active channel relative to the
3383          * specified quarter control as result.
3384          */
3385         inst = brw_FBL(p, vec1(dst), exec_mask);
3386      } else {
3387         const struct brw_reg flag = brw_flag_reg(1, 0);
3388
3389         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
3390
3391         /* Run enough instructions returning zero with execution masking and
3392          * a conditional modifier enabled in order to get the full execution
3393          * mask in f1.0.  We could use a single 32-wide move here if it
3394          * weren't because of the hardware bug that causes channel enables to
3395          * be applied incorrectly to the second half of 32-wide instructions
3396          * on Gen7.
3397          */
3398         const unsigned lower_size = MIN2(16, exec_size);
3399         for (unsigned i = 0; i < exec_size / lower_size; i++) {
3400            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
3401                           brw_imm_uw(0));
3402            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3403            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
3404            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
3405            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3406            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
3407         }
3408
3409         /* Find the first bit set in the exec_size-wide portion of the flag
3410          * register that was updated by the last sequence of MOV
3411          * instructions.
3412          */
3413         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
3414         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3415      }
3416   } else {
3417      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3418
3419      if (devinfo->gen >= 8 &&
3420          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
3421         /* In SIMD4x2 mode the first active channel index is just the
3422          * negation of the first bit of the mask register.  Note that ce0
3423          * doesn't take into account the dispatch mask, so the Gen7 path
3424          * should be used instead unless you have the guarantee that the
3425          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3426          * for some n).
3427          */
3428         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
3429                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
3430                        brw_imm_ud(1));
3431
3432      } else {
3433         /* Overwrite the destination without and with execution masking to
3434          * find out which of the channels is active.
3435          */
3436         brw_push_insn_state(p);
3437         brw_set_default_exec_size(p, BRW_EXECUTE_4);
3438         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3439                 brw_imm_ud(1));
3440
3441         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3442                        brw_imm_ud(0));
3443         brw_pop_insn_state(p);
3444         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3445      }
3446   }
3447
3448   brw_pop_insn_state(p);
3449}
3450
3451void
3452brw_broadcast(struct brw_codegen *p,
3453              struct brw_reg dst,
3454              struct brw_reg src,
3455              struct brw_reg idx)
3456{
3457   const struct gen_device_info *devinfo = p->devinfo;
3458   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
3459   brw_inst *inst;
3460
3461   brw_push_insn_state(p);
3462   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3463   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
3464
3465   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
3466          src.address_mode == BRW_ADDRESS_DIRECT);
3467
3468   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3469       idx.file == BRW_IMMEDIATE_VALUE) {
3470      /* Trivial, the source is already uniform or the index is a constant.
3471       * We will typically not get here if the optimizer is doing its job, but
3472       * asserting would be mean.
3473       */
3474      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
3475      brw_MOV(p, dst,
3476              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
3477               stride(suboffset(src, 4 * i), 0, 4, 1)));
3478   } else {
3479      if (align1) {
3480         const struct brw_reg addr =
3481            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
3482         const unsigned offset = src.nr * REG_SIZE + src.subnr;
3483         /* Limit in bytes of the signed indirect addressing immediate. */
3484         const unsigned limit = 512;
3485
3486         brw_push_insn_state(p);
3487         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3488         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
3489
3490         /* Take into account the component size and horizontal stride. */
3491         assert(src.vstride == src.hstride + src.width);
3492         brw_SHL(p, addr, vec1(idx),
3493                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
3494                            src.hstride - 1));
3495
3496         /* We can only address up to limit bytes using the indirect
3497          * addressing immediate, account for the difference if the source
3498          * register is above this limit.
3499          */
3500         if (offset >= limit)
3501            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
3502
3503         brw_pop_insn_state(p);
3504
3505         /* Use indirect addressing to fetch the specified component. */
3506         brw_MOV(p, dst,
3507                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
3508                        src.type));
3509      } else {
3510         /* In SIMD4x2 mode the index can be either zero or one, replicate it
3511          * to all bits of a flag register,
3512          */
3513         inst = brw_MOV(p,
3514                        brw_null_reg(),
3515                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
3516         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
3517         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
3518         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3519
3520         /* and use predicated SEL to pick the right channel. */
3521         inst = brw_SEL(p, dst,
3522                        stride(suboffset(src, 4), 4, 4, 1),
3523                        stride(src, 4, 4, 1));
3524         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
3525         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3526      }
3527   }
3528
3529   brw_pop_insn_state(p);
3530}
3531
3532/**
3533 * This instruction is generated as a single-channel align1 instruction by
3534 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3535 *
3536 * We can't use the typed atomic op in the FS because that has the execution
3537 * mask ANDed with the pixel mask, but we just want to write the one dword for
3538 * all the pixels.
3539 *
3540 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3541 * one u32.  So we use the same untyped atomic write message as the pixel
3542 * shader.
3543 *
3544 * The untyped atomic operation requires a BUFFER surface type with RAW
3545 * format, and is only accessible through the legacy DATA_CACHE dataport
3546 * messages.
3547 */
3548void brw_shader_time_add(struct brw_codegen *p,
3549                         struct brw_reg payload,
3550                         uint32_t surf_index)
3551{
3552   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
3553                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3554                          GEN7_SFID_DATAPORT_DATA_CACHE);
3555   assert(p->devinfo->gen >= 7);
3556
3557   brw_push_insn_state(p);
3558   brw_set_default_access_mode(p, BRW_ALIGN_1);
3559   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3560   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3561   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3562
3563   /* We use brw_vec1_reg and unmasked because we want to increment the given
3564    * offset only once.
3565    */
3566   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3567                                      BRW_ARF_NULL, 0));
3568   brw_set_src0(p, send, brw_vec1_reg(payload.file,
3569                                      payload.nr, 0));
3570   brw_set_src1(p, send, brw_imm_ud(0));
3571   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
3572   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
3573   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
3574
3575   brw_pop_insn_state(p);
3576}
3577
3578
3579/**
3580 * Emit the SEND message for a barrier
3581 */
3582void
3583brw_barrier(struct brw_codegen *p, struct brw_reg src)
3584{
3585   const struct gen_device_info *devinfo = p->devinfo;
3586   struct brw_inst *inst;
3587
3588   assert(devinfo->gen >= 7);
3589
3590   brw_push_insn_state(p);
3591   brw_set_default_access_mode(p, BRW_ALIGN_1);
3592   inst = next_insn(p, BRW_OPCODE_SEND);
3593   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3594   brw_set_src0(p, inst, src);
3595   brw_set_src1(p, inst, brw_null_reg());
3596
3597   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3598                              1 /* msg_length */,
3599                              0 /* response_length */,
3600                              false /* header_present */,
3601                              false /* end_of_thread */);
3602
3603   brw_inst_set_gateway_notify(devinfo, inst, 1);
3604   brw_inst_set_gateway_subfuncid(devinfo, inst,
3605                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3606
3607   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3608   brw_pop_insn_state(p);
3609}
3610
3611
3612/**
3613 * Emit the wait instruction for a barrier
3614 */
3615void
3616brw_WAIT(struct brw_codegen *p)
3617{
3618   const struct gen_device_info *devinfo = p->devinfo;
3619   struct brw_inst *insn;
3620
3621   struct brw_reg src = brw_notification_reg();
3622
3623   insn = next_insn(p, BRW_OPCODE_WAIT);
3624   brw_set_dest(p, insn, src);
3625   brw_set_src0(p, insn, src);
3626   brw_set_src1(p, insn, brw_null_reg());
3627
3628   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3629   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3630}
3631