brw_eu_emit.c revision 77b338d63b61d72dafa7ecd420e36ee2bb0436ab
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "util/ralloc.h"
38
39/**
40 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
41 * registers, implicitly moving the operand to a message register.
42 *
43 * On Sandybridge, this is no longer the case.  This function performs the
44 * explicit move; it should be called before emitting a SEND instruction.
45 */
46void
47gen6_resolve_implied_move(struct brw_codegen *p,
48			  struct brw_reg *src,
49			  unsigned msg_reg_nr)
50{
51   const struct brw_device_info *devinfo = p->devinfo;
52   if (devinfo->gen < 6)
53      return;
54
55   if (src->file == BRW_MESSAGE_REGISTER_FILE)
56      return;
57
58   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
59      brw_push_insn_state(p);
60      brw_set_default_exec_size(p, BRW_EXECUTE_8);
61      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64	      retype(*src, BRW_REGISTER_TYPE_UD));
65      brw_pop_insn_state(p);
66   }
67   *src = brw_message_reg(msg_reg_nr);
68}
69
70static void
71gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72{
73   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74    * "The send with EOT should use register space R112-R127 for <src>. This is
75    *  to enable loading of a new thread into the same slot while the message
76    *  with EOT for current thread is pending dispatch."
77    *
78    * Since we're pretending to have 16 MRFs anyway, we may as well use the
79    * registers required for messages with EOT.
80    */
81   const struct brw_device_info *devinfo = p->devinfo;
82   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83      reg->file = BRW_GENERAL_REGISTER_FILE;
84      reg->nr += GEN7_MRF_HACK_START;
85   }
86}
87
/**
 * Convert a brw_reg_type enumeration value into the hardware representation.
 *
 * The hardware encoding may depend on whether the value is an immediate.
 *
 * Table entries of -1 mark types that have no encoding for that operand
 * class; the asserts catch both those and types the running hardware
 * generation does not support.
 */
unsigned
brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
                        enum brw_reg_type type, enum brw_reg_file file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      /* Immediates: byte types (UB/B) are not representable; the packed
       * vector types (UV/VF/V) and Gen8+ DF/HF use immediate-specific
       * encodings.
       */
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      /* DF/HF/Q/UQ immediates only have encodings on Gen8+. */
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      /* DF registers require Gen7+, HF registers require Gen8+. */
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}
143
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Handles both direct and register-indirect addressing, and both Align1
 * and Align16 access modes, writing the appropriate subregister, stride,
 * and writemask fields for each combination.  May also shrink the
 * instruction's execution size to match a small destination (see below).
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Range-check the register number for the files where it is bounded. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
	 /* A horizontal stride of 0 is invalid for a destination; use 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         /* Align16: the subregister field is in 16-byte units. */
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.writemask != 0);
         }
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.indirect_offset);
	 /* A horizontal stride of 0 is invalid for a destination; use 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.indirect_offset);
	 /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    */
   if (dest.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}
209
210extern int reg_type_size[];
211
212static void
213validate_reg(const struct brw_device_info *devinfo,
214             brw_inst *inst, struct brw_reg reg)
215{
216   const int hstride_for_reg[] = {0, 1, 2, 4};
217   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
218   const int width_for_reg[] = {1, 2, 4, 8, 16};
219   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
220   int width, hstride, vstride, execsize;
221
222   if (reg.file == BRW_IMMEDIATE_VALUE) {
223      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
224       * mean the destination has to be 128-bit aligned and the
225       * destination horiz stride has to be a word.
226       */
227      if (reg.type == BRW_REGISTER_TYPE_V) {
228         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
229                reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
230      }
231
232      return;
233   }
234
235   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
236       reg.file == BRW_ARF_NULL)
237      return;
238
239   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
240    *
241    *    "Swizzling is not allowed when an accumulator is used as an implicit
242    *    source or an explicit source in an instruction."
243    */
244   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
245       reg.nr == BRW_ARF_ACCUMULATOR)
246      assert(reg.swizzle == BRW_SWIZZLE_XYZW);
247
248   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
249   hstride = hstride_for_reg[reg.hstride];
250
251   if (reg.vstride == 0xf) {
252      vstride = -1;
253   } else {
254      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
255      vstride = vstride_for_reg[reg.vstride];
256   }
257
258   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
259   width = width_for_reg[reg.width];
260
261   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
262          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
263   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
264
265   /* Restrictions from 3.3.10: Register Region Restrictions. */
266   /* 3. */
267   assert(execsize >= width);
268
269   /* 4. */
270   if (execsize == width && hstride != 0) {
271      assert(vstride == -1 || vstride == width * hstride);
272   }
273
274   /* 5. */
275   if (execsize == width && hstride == 0) {
276      /* no restriction on vstride. */
277   }
278
279   /* 6. */
280   if (width == 1) {
281      assert(hstride == 0);
282   }
283
284   /* 7. */
285   if (execsize == 1 && width == 1) {
286      assert(hstride == 0);
287      assert(vstride == 0);
288   }
289
290   /* 8. */
291   if (vstride == 0 && hstride == 0) {
292      assert(width == 1);
293   }
294
295   /* 10. Check destination issues. */
296}
297
/* Return true if \p imm fits in the 13 bits a compacted instruction has
 * for immediates: the low 12 bits are stored as-is, and one extra bit is
 * replicated through the top 20, so only all-zero or all-one high bits
 * are representable.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned high_bits = imm & ~0xfffu;
   return high_bits == 0 || high_bits == 0xfffff000;
}
307
/**
 * Encode \p reg as the first source operand of \p inst.
 *
 * For immediate sources this also stores the immediate bits (which occupy
 * the src1 field space) and may retype src0/dst (F -> VF for 0.0,
 * D -> UD for small values) so the instruction stays eligible for
 * compaction.  For register sources it encodes the addressing mode,
 * region, and (in Align16) the swizzle.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Range-check the register number for the files where it is bounded. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* The immediate occupies the src1 bits of the instruction word. */
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate that src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule. E.g., from the IVB/HSW table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these cause warnings from the simulator when used,
       * compacted or otherwise. In fact, all compaction mappings that have an
       * immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction. We'll assume it was
       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
       */
      brw_inst_set_src1_reg_file(devinfo, inst, BRW_ARCHITECTURE_REGISTER_FILE);
      if (devinfo->gen < 6) {
         brw_inst_set_src1_reg_type(devinfo, inst,
                                    brw_inst_src0_reg_type(devinfo, inst));
      } else {
         brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values. Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
             brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
	 } else {
            /* Align16: the subregister field is in 16-byte units. */
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
	 }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
	 } else {
            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
	 }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
	 /* A single element feeding a SIMD1 op gets the scalar <0;1,0>
	  * region regardless of the requested strides.
	  */
	 if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
	 } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
	 }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

	 /* This is an oddity of the fact we're using the same
	  * descriptions for registers in align_16 as align_1:
	  */
	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
	 else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}
448
449
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * Like brw_set_src0() but with the extra hardware restrictions visible
 * below: src1 may not be an accumulator or an MRF, must use direct
 * addressing when it is a register, and may not be an immediate if src0
 * already is (the two share encoding space).
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         /* Align16: the subregister field is in 16-byte units. */
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
	 /* A single element feeding a SIMD1 op gets the scalar <0;1,0>
	  * region regardless of the requested strides.
	  */
	 if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
	 } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
	 }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

	 /* This is an oddity of the fact we're using the same
	  * descriptions for registers in align_16 as align_1:
	  */
	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
	 else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}
528
529/**
530 * Set the Message Descriptor and Extended Message Descriptor fields
531 * for SEND messages.
532 *
533 * \note This zeroes out the Function Control bits, so it must be called
534 *       \b before filling out any message-specific data.  Callers can
535 *       choose not to fill in irrelevant bits; they will be zero.
536 */
537void
538brw_set_message_descriptor(struct brw_codegen *p,
539			   brw_inst *inst,
540			   enum brw_message_target sfid,
541			   unsigned msg_length,
542			   unsigned response_length,
543			   bool header_present,
544			   bool end_of_thread)
545{
546   const struct brw_device_info *devinfo = p->devinfo;
547
548   brw_set_src1(p, inst, brw_imm_d(0));
549
550   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
551    * itself; instead, it will be a MOV/OR into the address register.
552    *
553    * In this case, we avoid setting the extended message descriptor bits,
554    * since they go on the later SEND/SENDC instead and if set here would
555    * instead clobber the conditionalmod bits.
556    */
557   unsigned opcode = brw_inst_opcode(devinfo, inst);
558   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
559      brw_inst_set_sfid(devinfo, inst, sfid);
560   }
561
562   brw_inst_set_mlen(devinfo, inst, msg_length);
563   brw_inst_set_rlen(devinfo, inst, response_length);
564   brw_inst_set_eot(devinfo, inst, end_of_thread);
565
566   if (devinfo->gen >= 5) {
567      brw_inst_set_header_present(devinfo, inst, header_present);
568   }
569}
570
571static void brw_set_math_message( struct brw_codegen *p,
572				  brw_inst *inst,
573				  unsigned function,
574				  unsigned integer_type,
575				  bool low_precision,
576				  unsigned dataType )
577{
578   const struct brw_device_info *devinfo = p->devinfo;
579   unsigned msg_length;
580   unsigned response_length;
581
582   /* Infer message length from the function */
583   switch (function) {
584   case BRW_MATH_FUNCTION_POW:
585   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
586   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
587   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
588      msg_length = 2;
589      break;
590   default:
591      msg_length = 1;
592      break;
593   }
594
595   /* Infer response length from the function */
596   switch (function) {
597   case BRW_MATH_FUNCTION_SINCOS:
598   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
599      response_length = 2;
600      break;
601   default:
602      response_length = 1;
603      break;
604   }
605
606
607   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
608			      msg_length, response_length, false, false);
609   brw_inst_set_math_msg_function(devinfo, inst, function);
610   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
611   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
612   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
613   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
614   brw_inst_set_saturate(devinfo, inst, 0);
615}
616
617
618static void brw_set_ff_sync_message(struct brw_codegen *p,
619				    brw_inst *insn,
620				    bool allocate,
621				    unsigned response_length,
622				    bool end_of_thread)
623{
624   const struct brw_device_info *devinfo = p->devinfo;
625
626   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
627			      1, response_length, true, end_of_thread);
628   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
629   brw_inst_set_urb_allocate(devinfo, insn, allocate);
630   /* The following fields are not used by FF_SYNC: */
631   brw_inst_set_urb_global_offset(devinfo, insn, 0);
632   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
633   brw_inst_set_urb_used(devinfo, insn, 0);
634   brw_inst_set_urb_complete(devinfo, insn, 0);
635}
636
/**
 * Fill in the descriptor for a URB write message.
 *
 * \param flags            bitmask of BRW_URB_WRITE_* (EOT, complete,
 *                         allocate, per-slot offset, OWORD vs HWORD, ...)
 * \param msg_length       payload registers sent, including the header
 * \param response_length  registers written back
 * \param offset           global offset into the URB entry
 * \param swizzle_control  BRW_URB_SWIZZLE_* interleave mode
 */
static void brw_set_urb_message( struct brw_codegen *p,
				 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Reject flag/field combinations not encodable on this generation. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   /* The complete bit is only written pre-Gen8. */
   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   /* Pre-Gen7 has allocate/used bits; Gen7+ has per-slot offset instead. */
   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
677
678void
679brw_set_dp_write_message(struct brw_codegen *p,
680			 brw_inst *insn,
681			 unsigned binding_table_index,
682			 unsigned msg_control,
683			 unsigned msg_type,
684			 unsigned msg_length,
685			 bool header_present,
686			 unsigned last_render_target,
687			 unsigned response_length,
688			 unsigned end_of_thread,
689			 unsigned send_commit_msg)
690{
691   const struct brw_device_info *devinfo = p->devinfo;
692   unsigned sfid;
693
694   if (devinfo->gen >= 7) {
695      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
696      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
697	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
698      else
699	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
700   } else if (devinfo->gen == 6) {
701      /* Use the render cache for all write messages. */
702      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
703   } else {
704      sfid = BRW_SFID_DATAPORT_WRITE;
705   }
706
707   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
708			      header_present, end_of_thread);
709
710   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
711   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
712   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
713   brw_inst_set_rt_last(devinfo, insn, last_render_target);
714   if (devinfo->gen < 7) {
715      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
716   }
717}
718
719void
720brw_set_dp_read_message(struct brw_codegen *p,
721			brw_inst *insn,
722			unsigned binding_table_index,
723			unsigned msg_control,
724			unsigned msg_type,
725			unsigned target_cache,
726			unsigned msg_length,
727                        bool header_present,
728			unsigned response_length)
729{
730   const struct brw_device_info *devinfo = p->devinfo;
731   unsigned sfid;
732
733   if (devinfo->gen >= 7) {
734      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
735   } else if (devinfo->gen == 6) {
736      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
737	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
738      else
739	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
740   } else {
741      sfid = BRW_SFID_DATAPORT_READ;
742   }
743
744   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
745			      header_present, false);
746
747   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
748   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
749   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
750   if (devinfo->gen < 6)
751      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
752}
753
754void
755brw_set_sampler_message(struct brw_codegen *p,
756                        brw_inst *inst,
757                        unsigned binding_table_index,
758                        unsigned sampler,
759                        unsigned msg_type,
760                        unsigned response_length,
761                        unsigned msg_length,
762                        unsigned header_present,
763                        unsigned simd_mode,
764                        unsigned return_format)
765{
766   const struct brw_device_info *devinfo = p->devinfo;
767
768   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
769			      response_length, header_present, false);
770
771   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
772   brw_inst_set_sampler(devinfo, inst, sampler);
773   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
774   if (devinfo->gen >= 5) {
775      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
776   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
777      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
778   }
779}
780
/**
 * Fill in the descriptor for a Gen7+ data-cache scratch block message.
 *
 * \param write        true for a scratch write, false for a read
 * \param dword        true for DWord-scattered access, false for OWord
 *                     block access (per the scratch-type descriptor bit)
 * \param invalidate_after_read  allow the cache line to be invalidated
 *                     once read
 * \param num_regs     registers of data: 1, 2, or 4 (8 allowed on Gen8+)
 * \param addr_offset  offset into the scratch space (units per the HW
 *                     spec — presumably HWORDs; confirm against the PRM)
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   /* Block size is encoded as log2(num_regs): 1/2/4/8 -> 0/1/2/3. */
   brw_inst_set_scratch_block_size(devinfo, inst, ffs(num_regs) - 1);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
805
806#define next_insn brw_next_insn
807brw_inst *
808brw_next_insn(struct brw_codegen *p, unsigned opcode)
809{
810   const struct brw_device_info *devinfo = p->devinfo;
811   brw_inst *insn;
812
813   if (p->nr_insn + 1 > p->store_size) {
814      p->store_size <<= 1;
815      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
816   }
817
818   p->next_insn_offset += 16;
819   insn = &p->store[p->nr_insn++];
820   memcpy(insn, p->current, sizeof(*insn));
821
822   brw_inst_set_opcode(devinfo, insn, opcode);
823   return insn;
824}
825
826static brw_inst *
827brw_alu1(struct brw_codegen *p, unsigned opcode,
828         struct brw_reg dest, struct brw_reg src)
829{
830   brw_inst *insn = next_insn(p, opcode);
831   brw_set_dest(p, insn, dest);
832   brw_set_src0(p, insn, src);
833   return insn;
834}
835
836static brw_inst *
837brw_alu2(struct brw_codegen *p, unsigned opcode,
838         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
839{
840   brw_inst *insn = next_insn(p, opcode);
841   brw_set_dest(p, insn, dest);
842   brw_set_src0(p, insn, src0);
843   brw_set_src1(p, insn, src1);
844   return insn;
845}
846
847static int
848get_3src_subreg_nr(struct brw_reg reg)
849{
850   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
851      assert(brw_is_single_value_swizzle(reg.swizzle));
852      return reg.subnr / 4 + BRW_GET_SWZ(reg.swizzle, 0);
853   } else {
854      return reg.subnr / 4;
855   }
856}
857
/**
 * Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...).
 *
 * 3-src instructions use a restricted encoding with dedicated
 * brw_inst_set_3src_* fields: Align16 only, direct addressing only,
 * GRF sources (and GRF or, on Gen6, MRF destination), and only
 * F/D/UD types on Gen7+.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      /* Gen6 has a dedicated bit saying whether the destination is an MRF. */
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   /* The 3-src destination subregister is encoded in 16-byte units. */
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);

   /* rep_ctrl is set for scalar (vstride 0) sources on each of the three
    * sources below.
    */
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}
944
945
946/***********************************************************************
947 * Convenience routines.
948 */
/* Define brw_<OP>() as a thin wrapper around brw_alu1(). */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Define brw_<OP>() as a thin wrapper around brw_alu2(). */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Define brw_<OP>() as a thin wrapper around brw_alu3(). */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Like ALU3, but additionally asserts that all four operands are floats
 * (used for MAD and LRP, which brw_alu3 expects to be all-float).
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
989
990/* Rounding operations (other than RNDD) require two instructions - the first
991 * stores a rounded value (possibly the wrong way) in the dest register, but
992 * also sets a per-channel "increment bit" in the flag register.  A predicated
993 * add of 1.0 fixes dest to contain the desired result.
994 *
995 * Sandybridge and later appear to round correctly without an ADD.
996 */
/* Note: the pre-Gen6 path also writes the flag register (via the .R
 * conditional modifier) and emits a second, predicated ADD instruction.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_codegen *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   const struct brw_device_info *devinfo = p->devinfo;					      \
   brw_inst *rnd, *add;							      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (devinfo->gen < 6) {							      \
      /* turn on round-increments */					      \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
   }									      \
}
1015
1016
/* Instantiate the convenience emitters.  Each line expands to a
 * brw_<OP>() wrapper around brw_alu1/2/3 (or the two-instruction ROUND
 * sequence) using opcode BRW_OPCODE_<OP>.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

ROUND(RNDZ)
ROUND(RNDE)
1049
1050
1051brw_inst *
1052brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1053        struct brw_reg src0, struct brw_reg src1)
1054{
1055   /* 6.2.2: add */
1056   if (src0.type == BRW_REGISTER_TYPE_F ||
1057       (src0.file == BRW_IMMEDIATE_VALUE &&
1058	src0.type == BRW_REGISTER_TYPE_VF)) {
1059      assert(src1.type != BRW_REGISTER_TYPE_UD);
1060      assert(src1.type != BRW_REGISTER_TYPE_D);
1061   }
1062
1063   if (src1.type == BRW_REGISTER_TYPE_F ||
1064       (src1.file == BRW_IMMEDIATE_VALUE &&
1065	src1.type == BRW_REGISTER_TYPE_VF)) {
1066      assert(src0.type != BRW_REGISTER_TYPE_UD);
1067      assert(src0.type != BRW_REGISTER_TYPE_D);
1068   }
1069
1070   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1071}
1072
/**
 * Emit an AVG (integer average) instruction.
 *
 * AVG only accepts integer operands; the destination and both sources
 * must share the same type.
 */
brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}
1093
/**
 * Emit a MUL instruction, asserting the operand restrictions from the
 * PRM: no float destination with dword sources, no float/dword source
 * mixing, and neither source may be the accumulator.
 */
brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
	src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
	src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
	  src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}
1127
1128brw_inst *
1129brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1130         struct brw_reg src0, struct brw_reg src1)
1131{
1132   src0.vstride = BRW_VERTICAL_STRIDE_0;
1133   src0.width = BRW_WIDTH_1;
1134   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1135   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1136}
1137
1138brw_inst *
1139brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1140        struct brw_reg src0, struct brw_reg src1)
1141{
1142   src0.vstride = BRW_VERTICAL_STRIDE_0;
1143   src0.width = BRW_WIDTH_1;
1144   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1145   src1.vstride = BRW_VERTICAL_STRIDE_8;
1146   src1.width = BRW_WIDTH_8;
1147   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1148   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1149}
1150
/**
 * Emit a float-to-half conversion (F32TO16 on Gen7, a converting MOV to
 * an HF destination on Gen8+).  Returns the last instruction emitted.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Rewrite the UD destination as a strided W destination so the
       * following MOV can zero the upper half of each dword separately.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Pair the conversion with a zeroing MOV of the high halves; the
       * no_dd_clear/no_dd_check bits tie the two instructions together.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1197
/**
 * Emit a half-to-float conversion (F16TO32 on Gen7, a converting MOV
 * from an HF source on Gen8+).
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1228
1229
1230void brw_NOP(struct brw_codegen *p)
1231{
1232   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1233   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1234   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1235   brw_set_src1(p, insn, brw_imm_ud(0x0));
1236}
1237
1238
1239
1240
1241
1242/***********************************************************************
1243 * Comparisons, if/else/endif
1244 */
1245
1246brw_inst *
1247brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1248         unsigned predicate_control)
1249{
1250   const struct brw_device_info *devinfo = p->devinfo;
1251   struct brw_reg ip = brw_ip_reg();
1252   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1253
1254   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
1255   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1256   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1257   brw_inst_set_pred_control(devinfo, inst, predicate_control);
1258
1259   return inst;
1260}
1261
1262static void
1263push_if_stack(struct brw_codegen *p, brw_inst *inst)
1264{
1265   p->if_stack[p->if_stack_depth] = inst - p->store;
1266
1267   p->if_stack_depth++;
1268   if (p->if_stack_array_size <= p->if_stack_depth) {
1269      p->if_stack_array_size *= 2;
1270      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1271			     p->if_stack_array_size);
1272   }
1273}
1274
1275static brw_inst *
1276pop_if_stack(struct brw_codegen *p)
1277{
1278   p->if_stack_depth--;
1279   return &p->store[p->if_stack[p->if_stack_depth]];
1280}
1281
1282static void
1283push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1284{
1285   if (p->loop_stack_array_size < p->loop_stack_depth) {
1286      p->loop_stack_array_size *= 2;
1287      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1288			       p->loop_stack_array_size);
1289      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1290				     p->loop_stack_array_size);
1291   }
1292
1293   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1294   p->loop_stack_depth++;
1295   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1296}
1297
1298static brw_inst *
1299get_inner_do_insn(struct brw_codegen *p)
1300{
1301   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1302}
1303
1304/* EU takes the value from the flag register and pushes it onto some
1305 * sort of a stack (presumably merging with any flag value already on
1306 * the stack).  Within an if block, the flags at the top of the stack
1307 * control execution on each channel of the unit, eg. on each of the
1308 * 16 pixel values in our wm programs.
1309 *
1310 * When the matching 'else' instruction is reached (presumably by
1311 * countdown of the instruction count patched in by our ELSE/ENDIF
1312 * functions), the relevant flags are inverted.
1313 *
1314 * When the matching 'endif' instruction is reached, the flags are
1315 * popped off.  If the stack is now empty, normal execution resumes.
1316 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    *
    * The operand encoding of IF varies per generation; the jump targets
    * (gen6 jump count, or JIP/UIP on gen7+) are left zero here and filled
    * in later by patch_IF_ELSE() when the matching ENDIF is emitted.
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires a thread switch (unless we're in SPF
    * mode, where the IF will later be converted to an ADD).
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Remember this IF for the matching ELSE/ENDIF. */
   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1360
1361/* This function is only used for gen6-style IF instructions with an
1362 * embedded comparison (conditional modifier).  It is not used on gen7.
1363 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                   : BRW_EXECUTE_8);
   /* Jump count is patched later by patch_IF_ELSE() at the ENDIF. */
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   /* The comparison is embedded in the IF itself as a conditional mod. */
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1387
1388/**
1389 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1390 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ELSE == 0 ? 0 : BRW_OPCODE_ADD);

      /* Jump distances are in bytes: 16 per instruction (see the
       * next_insn_offset += 16 in brw_next_insn).
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1428
1429/**
1430 * Patch IF and ELSE instructions with appropriate jump targets.
1431 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   /* Per-gen scale factor converting instruction counts into jump-field
    * units (see brw_jump_scale).
    */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gen7+: both the unconditional (UIP) and conditional (JIP)
          * targets are the ENDIF.
          */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1519
void
brw_ELSE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* The operand encoding of ELSE varies per generation (mirroring
    * brw_IF); the jump targets are left zero and patched later by
    * patch_IF_ELSE() when the matching ENDIF is emitted.
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires a thread switch (unless in SPF mode). */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* The ELSE goes on the if stack on top of its IF; brw_ENDIF pops both. */
   push_if_stack(p, insn);
}
1557
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encoding for ENDIF (mirrors brw_IF/brw_ELSE). */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   /* Now that the ENDIF's position is known, fill in the IF/ELSE jumps. */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1638
/**
 * Emit a BREAK instruction to exit the innermost loop.
 *
 * The jump target is left as zero here and filled in later: on Gen4/5 by
 * brw_patch_break_cont() when the matching WHILE is emitted, and on Gen6+
 * by the caller (via the JIP/UIP fields).
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      /* Pop the IF-stack entries pushed since loop entry so the mask
       * stack stays balanced when we jump out of the loop.
       */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                   : BRW_EXECUTE_8);

   return insn;
}
1666
/**
 * Emit a CONTINUE instruction to jump to the end of the innermost loop.
 *
 * Like BREAK, the jump target is emitted as zero and patched later
 * (brw_patch_break_cont() on Gen4/5, JIP/UIP fields on Gen6+).
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Pop the IF-stack entries pushed inside the loop so the mask
       * stack stays balanced when jumping to the WHILE.
       */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                   : BRW_EXECUTE_8);
   return insn;
}
1691
/**
 * Emit a HALT instruction (Gen6+).
 *
 * The UIP and JIP jump offsets are emitted as zero here and updated
 * later by the caller (see the inline comment on src1).
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   if (p->compressed) {
      brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_16);
   } else {
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_8);
   }
   return insn;
}
1715
1716/* DO/WHILE loop:
1717 *
1718 * The DO/WHILE is just an unterminated loop -- break or continue are
1719 * used for control within the loop.  We have a few ways they can be
1720 * done.
1721 *
1722 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1723 * jip and no DO instruction.
1724 *
1725 * For non-uniform control flow pre-gen6, there's a DO instruction to
1726 * push the mask, and a WHILE to jump back, and BREAK to get out and
1727 * pop the mask.
1728 *
1729 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1730 * just points back to the first instruction of the loop.
1731 */
/**
 * Open a DO/WHILE loop (see the block comment above for the
 * per-generation strategies).
 *
 * On Gen6+ (no mask stack) and in single-program-flow mode, no actual DO
 * instruction is emitted; we just record where the loop body starts so
 * the matching WHILE can jump back to it.  Pre-Gen6 non-uniform flow
 * emits a real DO instruction to push the execution mask.
 *
 * Returns a pointer to the loop-start instruction slot.  NOTE: this
 * points into p->store and may be invalidated if the store is
 * reallocated by later instruction emission.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1758
1759/**
1760 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1761 * instruction here.
1762 *
1763 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1764 * nesting, since it can always just point to the end of the block/current loop.
1765 */
1766static void
1767brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1768{
1769   const struct brw_device_info *devinfo = p->devinfo;
1770   brw_inst *do_inst = get_inner_do_insn(p);
1771   brw_inst *inst;
1772   unsigned br = brw_jump_scale(devinfo);
1773
1774   assert(devinfo->gen < 6);
1775
1776   for (inst = while_inst - 1; inst != do_inst; inst--) {
1777      /* If the jump count is != 0, that means that this instruction has already
1778       * been patched because it's part of a loop inside of the one we're
1779       * patching.
1780       */
1781      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1782          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1783         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1784      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1785                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1786         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1787      }
1788   }
1789}
1790
/**
 * Close the innermost DO/WHILE loop (see the DO/WHILE block comment
 * above for the per-generation strategies).
 *
 * On Gen4/5 this also patches any BREAK/CONTINUE instructions inside the
 * loop body via brw_patch_break_cont().  Pops one level off the loop
 * stack before returning the emitted instruction.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* The backward jump distance (do_insn - insn) is negative; it is
       * scaled by the per-generation jump unit `br`.
       */
      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                      : BRW_EXECUTE_8);
   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: the WHILE is just an IP-relative ADD
          * jumping back to the loop start; 16 bytes per instruction.
          */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

         /* Execution size must match the DO that opened the loop. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

	 brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1852
1853/* FORWARD JUMPS:
1854 */
1855void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1856{
1857   const struct brw_device_info *devinfo = p->devinfo;
1858   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1859   unsigned jmpi = 1;
1860
1861   if (devinfo->gen >= 5)
1862      jmpi = 2;
1863
1864   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1865   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1866
1867   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1868                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
1869}
1870
1871/* To integrate with the above, it makes sense that the comparison
1872 * instruction should populate the flag register.  It might be simpler
1873 * just to use the flag reg for most WM tasks?
1874 */
1875void brw_CMP(struct brw_codegen *p,
1876	     struct brw_reg dest,
1877	     unsigned conditional,
1878	     struct brw_reg src0,
1879	     struct brw_reg src1)
1880{
1881   const struct brw_device_info *devinfo = p->devinfo;
1882   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1883
1884   brw_inst_set_cond_modifier(devinfo, insn, conditional);
1885   brw_set_dest(p, insn, dest);
1886   brw_set_src0(p, insn, src0);
1887   brw_set_src1(p, insn, src1);
1888
1889   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1890    * page says:
1891    *    "Any CMP instruction with a null destination must use a {switch}."
1892    *
1893    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1894    * mentioned on their work-arounds pages.
1895    */
1896   if (devinfo->gen == 7) {
1897      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1898          dest.nr == BRW_ARF_NULL) {
1899         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1900      }
1901   }
1902}
1903
1904/***********************************************************************
1905 * Helpers for the various SEND message types:
1906 */
1907
1908/** Extended math function, float[8].
1909 */
1910void gen4_math(struct brw_codegen *p,
1911	       struct brw_reg dest,
1912	       unsigned function,
1913	       unsigned msg_reg_nr,
1914	       struct brw_reg src,
1915	       unsigned precision )
1916{
1917   const struct brw_device_info *devinfo = p->devinfo;
1918   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1919   unsigned data_type;
1920   if (has_scalar_region(src)) {
1921      data_type = BRW_MATH_DATA_SCALAR;
1922   } else {
1923      data_type = BRW_MATH_DATA_VECTOR;
1924   }
1925
1926   assert(devinfo->gen < 6);
1927
1928   /* Example code doesn't set predicate_control for send
1929    * instructions.
1930    */
1931   brw_inst_set_pred_control(devinfo, insn, 0);
1932   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1933
1934   brw_set_dest(p, insn, dest);
1935   brw_set_src0(p, insn, src);
1936   brw_set_math_message(p,
1937                        insn,
1938                        function,
1939                        src.type == BRW_REGISTER_TYPE_D,
1940                        precision,
1941                        data_type);
1942}
1943
/**
 * Extended math (Gen6+), using the dedicated MATH instruction instead of
 * a SEND to the shared math unit.
 *
 * The asserts encode per-generation ISA restrictions on the register
 * files, strides, types and source modifiers allowed for extended math.
 */
void gen6_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   /* Destination/source register-file restrictions loosen on later gens:
    * MRF dest from Gen7, immediate src0 from Gen8.
    */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division takes integer operands only. */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      /* All other extended math functions operate on floats; only POW
       * (and integer division above) actually uses src1 -- the rest must
       * pass null.
       */
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
      if (function == BRW_MATH_FUNCTION_POW) {
         assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
                (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
      } else {
         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
                src1.nr == BRW_ARF_NULL);
      }
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1999
2000/**
2001 * Return the right surface index to access the thread scratch space using
2002 * stateless dataport messages.
2003 */
2004unsigned
2005brw_scratch_surface_idx(const struct brw_codegen *p)
2006{
2007   /* The scratch space is thread-local so IA coherency is unnecessary. */
2008   if (p->devinfo->gen >= 8)
2009      return GEN8_BTI_STATELESS_NON_COHERENT;
2010   else
2011      return BRW_BTI_STATELESS;
2012}
2013
2014/**
2015 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2016 * using a constant offset per channel.
2017 *
2018 * The offset must be aligned to oword size (16 bytes).  Used for
2019 * register spilling.
2020 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   uint32_t msg_control, msg_type;
   int mlen;

   /* On Gen6+ the message-header offset field is in owords, not bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Each GRF is two owords; mlen = 1 header reg + num_regs data regs.
    * NOTE(review): only num_regs of 1 or 2 is representable here.
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* SEND must not be compressed; widen the header source instead. */
      if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_NONE) {
         brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
	 src_header = vec16(src_header);
      }
      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
                               brw_scratch_surface_idx(p),
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
2124
2125
2126/**
2127 * Read a block of owords (half a GRF each) from the scratch buffer
2128 * using a constant index per channel.
2129 *
2130 * Offset must be aligned to oword size (16 bytes).  Used for register
2131 * spilling.
2132 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   uint32_t msg_control;
   int rlen;

   /* On Gen6+ the message-header offset field is in owords, not bytes. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* Each GRF is two owords; rlen = number of GRFs read back. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in `mrf`: copy g0, then write the global
    * offset into element 2 (see the matching write path above).
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_dp_read_message(p,
			      insn,
                              brw_scratch_surface_idx(p),
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
                              true, /* header_present */
			      rlen);
   }
}
2208
/**
 * Read `num_regs` full registers from the scratch buffer at `offset`
 * (in bytes) into `dest`, using the Gen7+ HWord scratch-block message.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2244
2245/**
2246 * Read a float[4] vector from the data port Data Cache (const buffer).
2247 * Location (in buffer) should be a multiple of 16.
2248 * Used for fetching shader constants.
2249 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Build the message header in `mrf`: copy g0 and write the global
    * offset into element 2, without disturbing g0 itself.
    */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      /* Pre-Gen6, the message payload is addressed via base_mrf. */
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
                           true, /* header_present */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
2304
2305
/**
 * Emit a framebuffer (render target) write message.
 *
 * Gen6+ uses SENDC (waits on the previous color write) with a headerless
 * payload; earlier gens use SEND from a fixed MRF payload with an
 * implied header register.
 */
void brw_fb_WRITE(struct brw_codegen *p,
		  int dispatch_width,
                  struct brw_reg payload,
                  struct brw_reg implied_header,
                  unsigned msg_control,
                  unsigned binding_table_index,
                  unsigned msg_length,
                  unsigned response_length,
                  bool eot,
                  bool last_render_target,
                  bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    msg_length,
			    header_present,
			    last_render_target,
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2362
2363
2364/**
2365 * Texture sample instruction.
2366 * Note: the msg_type plus msg_length values determine exactly what kind
2367 * of sampling operation is performed.  See volume 4, page 161 of docs.
2368 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* NOTE: msg_reg_nr is unsigned, so -1 here is the (unsigned)-1
    * sentinel meaning "no implied move needed".
    */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_2NDHALF)
      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_sampler_message(p, insn,
                           binding_table_index,
                           sampler,
                           msg_type,
                           response_length,
                           msg_length,
                           header_present,
                           simd_mode,
                           return_format);
}
2421
2422/* Adjust the message header's sampler state pointer to
2423 * select the correct group of 16 samplers.
2424 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct brw_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         /* Only HSW and Gen8+ support more than 16 samplers. */
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* (index & 0xf0) << 4 == 16 * (index / 16) * sampler_state_size,
       * i.e. the same offset as the immediate case above.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2467
2468/* All these variables are pretty confusing - we might be better off
2469 * using bitmasks and macros for this, in the old style.  Or perhaps
2470 * just having the caller instantiate the fields in dword3 itself.
2471 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2518
/**
 * Emit a SEND instruction whose message descriptor \p desc may be either an
 * immediate or a register.  In the register case the descriptor is first
 * OR'd into address register a0 (OR with 0 rather than MOV, so the caller
 * can still OR additional descriptor bits into the setup instruction with
 * the usual brw_set_*_message() helpers) and the SEND uses the indirect
 * descriptor form.
 *
 * Returns a pointer to the setup instruction: the SEND itself in the
 * immediate case, the OR in the indirect case.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *send;
   int setup;

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   /* We hold on to the setup instruction (the SEND in the direct case, the OR
    * in the indirect case) by its index in the instruction store.  The
    * pointer returned by next_insn() may become invalid if emitting the SEND
    * in the indirect case reallocs the store.
    */

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      /* Record the index BEFORE emitting -- see the comment above. */
      setup = p->nr_insn;
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The descriptor load must be unpredicated scalar ALIGN_1 work. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = p->nr_insn;
      brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   /* Return via the recorded index, not the possibly-stale pointer. */
   return &p->store[setup];
}
2570
2571static struct brw_inst *
2572brw_send_indirect_surface_message(struct brw_codegen *p,
2573                                  unsigned sfid,
2574                                  struct brw_reg dst,
2575                                  struct brw_reg payload,
2576                                  struct brw_reg surface,
2577                                  unsigned message_len,
2578                                  unsigned response_len,
2579                                  bool header_present)
2580{
2581   const struct brw_device_info *devinfo = p->devinfo;
2582   struct brw_inst *insn;
2583
2584   if (surface.file != BRW_IMMEDIATE_VALUE) {
2585      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2586
2587      brw_push_insn_state(p);
2588      brw_set_default_access_mode(p, BRW_ALIGN_1);
2589      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2590      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2591
2592      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2593       * some surface array is accessed out of bounds.
2594       */
2595      insn = brw_AND(p, addr,
2596                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2597                               BRW_GET_SWZ(surface.swizzle, 0)),
2598                     brw_imm_ud(0xff));
2599
2600      brw_pop_insn_state(p);
2601
2602      surface = addr;
2603   }
2604
2605   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
2606   brw_inst_set_mlen(devinfo, insn, message_len);
2607   brw_inst_set_rlen(devinfo, insn, response_len);
2608   brw_inst_set_header_present(devinfo, insn, header_present);
2609
2610   return insn;
2611}
2612
2613static int
2614brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2615{
2616   int offset;
2617   void *store = p->store;
2618   const struct brw_device_info *devinfo = p->devinfo;
2619
2620   int depth = 0;
2621
2622   for (offset = next_offset(devinfo, store, start_offset);
2623        offset < p->next_insn_offset;
2624        offset = next_offset(devinfo, store, offset)) {
2625      brw_inst *insn = store + offset;
2626
2627      switch (brw_inst_opcode(devinfo, insn)) {
2628      case BRW_OPCODE_IF:
2629         depth++;
2630         break;
2631      case BRW_OPCODE_ENDIF:
2632         if (depth == 0)
2633            return offset;
2634         depth--;
2635         break;
2636      case BRW_OPCODE_ELSE:
2637      case BRW_OPCODE_WHILE:
2638      case BRW_OPCODE_HALT:
2639         if (depth == 0)
2640            return offset;
2641      }
2642   }
2643
2644   return 0;
2645}
2646
2647/* There is no DO instruction on gen6, so to find the end of the loop
2648 * we have to see if the loop is jumping back before our start
2649 * instruction.
2650 */
2651static int
2652brw_find_loop_end(struct brw_codegen *p, int start_offset)
2653{
2654   const struct brw_device_info *devinfo = p->devinfo;
2655   int offset;
2656   int scale = 16 / brw_jump_scale(devinfo);
2657   void *store = p->store;
2658
2659   assert(devinfo->gen >= 6);
2660
2661   /* Always start after the instruction (such as a WHILE) we're trying to fix
2662    * up.
2663    */
2664   for (offset = next_offset(devinfo, store, start_offset);
2665        offset < p->next_insn_offset;
2666        offset = next_offset(devinfo, store, offset)) {
2667      brw_inst *insn = store + offset;
2668
2669      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2670         int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2671                                     : brw_inst_jip(devinfo, insn);
2672	 if (offset + jip * scale <= start_offset)
2673	    return offset;
2674      }
2675   }
2676   assert(!"not reached");
2677   return start_offset;
2678}
2679
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   int offset;
   /* Jump targets are encoded in the units given by brw_jump_scale();
    * `scale` converts our byte offsets (16 bytes per uncompacted
    * instruction) into those units.
    */
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-gen6 flow control uses different mechanisms; nothing to fix here. */
   if (devinfo->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_cmpt_control(devinfo, insn)) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(brw_inst_opcode(devinfo, insn) != BRW_OPCODE_BREAK &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_CONTINUE &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
	    (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end jumps to the very next
          * instruction, i.e. a distance of one instruction in jump units.
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
	 break;
      }

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
	 } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;
      }
   }
}
2760
2761void brw_ff_sync(struct brw_codegen *p,
2762		   struct brw_reg dest,
2763		   unsigned msg_reg_nr,
2764		   struct brw_reg src0,
2765		   bool allocate,
2766		   unsigned response_length,
2767		   bool eot)
2768{
2769   const struct brw_device_info *devinfo = p->devinfo;
2770   brw_inst *insn;
2771
2772   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2773
2774   insn = next_insn(p, BRW_OPCODE_SEND);
2775   brw_set_dest(p, insn, dest);
2776   brw_set_src0(p, insn, src0);
2777   brw_set_src1(p, insn, brw_imm_d(0));
2778
2779   if (devinfo->gen < 6)
2780      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2781
2782   brw_set_ff_sync_message(p,
2783			   insn,
2784			   allocate,
2785			   response_length,
2786			   eot);
2787}
2788
2789/**
2790 * Emit the SEND instruction necessary to generate stream output data on Gen6
2791 * (for transform feedback).
2792 *
2793 * If send_commit_msg is true, this is the last piece of stream output data
2794 * from this thread, so send the data as a committed write.  According to the
2795 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2796 *
2797 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2798 *   writes are complete by sending the final write as a committed write."
2799 */
2800void
2801brw_svb_write(struct brw_codegen *p,
2802              struct brw_reg dest,
2803              unsigned msg_reg_nr,
2804              struct brw_reg src0,
2805              unsigned binding_table_index,
2806              bool   send_commit_msg)
2807{
2808   brw_inst *insn;
2809
2810   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2811
2812   insn = next_insn(p, BRW_OPCODE_SEND);
2813   brw_set_dest(p, insn, dest);
2814   brw_set_src0(p, insn, src0);
2815   brw_set_src1(p, insn, brw_imm_d(0));
2816   brw_set_dp_write_message(p, insn,
2817                            binding_table_index,
2818                            0, /* msg_control: ignored */
2819                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2820                            1, /* msg_length */
2821                            true, /* header_present */
2822                            0, /* last_render_target: ignored */
2823                            send_commit_msg, /* response_length */
2824                            0, /* end_of_thread */
2825                            send_commit_msg); /* send_commit_msg */
2826}
2827
2828static unsigned
2829brw_surface_payload_size(struct brw_codegen *p,
2830                         unsigned num_channels,
2831                         bool has_simd4x2,
2832                         bool has_simd16)
2833{
2834   if (has_simd4x2 && brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2835      return 1;
2836   else if (has_simd16 && p->compressed)
2837      return 2 * num_channels;
2838   else
2839      return num_channels;
2840}
2841
2842static void
2843brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2844                                  brw_inst *insn,
2845                                  unsigned atomic_op,
2846                                  bool response_expected)
2847{
2848   const struct brw_device_info *devinfo = p->devinfo;
2849   unsigned msg_control =
2850      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2851      (response_expected ? 1 << 5 : 0); /* Return data expected */
2852
2853   if (devinfo->gen >= 8 || devinfo->is_haswell) {
2854      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2855         if (!p->compressed)
2856            msg_control |= 1 << 4; /* SIMD8 mode */
2857
2858         brw_inst_set_dp_msg_type(devinfo, insn,
2859                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2860      } else {
2861         brw_inst_set_dp_msg_type(devinfo, insn,
2862            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2863      }
2864   } else {
2865      brw_inst_set_dp_msg_type(devinfo, insn,
2866                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2867
2868      if (!p->compressed)
2869         msg_control |= 1 << 4; /* SIMD8 mode */
2870   }
2871
2872   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2873}
2874
/**
 * Emit an untyped atomic dataport message operating on \p surface.
 * When \p response_expected is set the old value is returned in \p dst.
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* HSW+ moved untyped atomics to data cache data port 1. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* Note: response_expected doubles as the channel count below -- an atomic
    * returns at most one channel of data, so the response payload is either
    * zero or one channel wide.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      align1);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
2905
2906static void
2907brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2908                                        struct brw_inst *insn,
2909                                        unsigned num_channels)
2910{
2911   const struct brw_device_info *devinfo = p->devinfo;
2912   /* Set mask of 32-bit channels to drop. */
2913   unsigned msg_control = 0xf & (0xf << num_channels);
2914
2915   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2916      if (p->compressed)
2917         msg_control |= 1 << 4; /* SIMD16 mode */
2918      else
2919         msg_control |= 2 << 4; /* SIMD8 mode */
2920   }
2921
2922   brw_inst_set_dp_msg_type(devinfo, insn,
2923                            (devinfo->gen >= 8 || devinfo->is_haswell ?
2924                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2925                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2926   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2927}
2928
2929void
2930brw_untyped_surface_read(struct brw_codegen *p,
2931                         struct brw_reg dst,
2932                         struct brw_reg payload,
2933                         struct brw_reg surface,
2934                         unsigned msg_length,
2935                         unsigned num_channels)
2936{
2937   const struct brw_device_info *devinfo = p->devinfo;
2938   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2939                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
2940                          GEN7_SFID_DATAPORT_DATA_CACHE);
2941   struct brw_inst *insn = brw_send_indirect_surface_message(
2942      p, sfid, dst, payload, surface, msg_length,
2943      brw_surface_payload_size(p, num_channels, true, true),
2944      false);
2945
2946   brw_set_dp_untyped_surface_read_message(
2947      p, insn, num_channels);
2948}
2949
2950static void
2951brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2952                                         struct brw_inst *insn,
2953                                         unsigned num_channels)
2954{
2955   const struct brw_device_info *devinfo = p->devinfo;
2956   /* Set mask of 32-bit channels to drop. */
2957   unsigned msg_control = 0xf & (0xf << num_channels);
2958
2959   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2960      if (p->compressed)
2961         msg_control |= 1 << 4; /* SIMD16 mode */
2962      else
2963         msg_control |= 2 << 4; /* SIMD8 mode */
2964   } else {
2965      if (devinfo->gen >= 8 || devinfo->is_haswell)
2966         msg_control |= 0 << 4; /* SIMD4x2 mode */
2967      else
2968         msg_control |= 2 << 4; /* SIMD8 mode */
2969   }
2970
2971   brw_inst_set_dp_msg_type(devinfo, insn,
2972                            devinfo->gen >= 8 || devinfo->is_haswell ?
2973                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
2974                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
2975   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2976}
2977
2978void
2979brw_untyped_surface_write(struct brw_codegen *p,
2980                          struct brw_reg payload,
2981                          struct brw_reg surface,
2982                          unsigned msg_length,
2983                          unsigned num_channels)
2984{
2985   const struct brw_device_info *devinfo = p->devinfo;
2986   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2987                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
2988                          GEN7_SFID_DATAPORT_DATA_CACHE);
2989   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2990   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2991   const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
2992                          WRITEMASK_X : WRITEMASK_XYZW;
2993   struct brw_inst *insn = brw_send_indirect_surface_message(
2994      p, sfid, brw_writemask(brw_null_reg(), mask),
2995      payload, surface, msg_length, 0, align1);
2996
2997   brw_set_dp_untyped_surface_write_message(
2998      p, insn, num_channels);
2999}
3000
3001static void
3002brw_set_dp_typed_atomic_message(struct brw_codegen *p,
3003                                struct brw_inst *insn,
3004                                unsigned atomic_op,
3005                                bool response_expected)
3006{
3007   const struct brw_device_info *devinfo = p->devinfo;
3008   unsigned msg_control =
3009      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
3010      (response_expected ? 1 << 5 : 0); /* Return data expected */
3011
3012   if (devinfo->gen >= 8 || devinfo->is_haswell) {
3013      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3014         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3015            msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3016
3017         brw_inst_set_dp_msg_type(devinfo, insn,
3018                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
3019      } else {
3020         brw_inst_set_dp_msg_type(devinfo, insn,
3021                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
3022      }
3023
3024   } else {
3025      brw_inst_set_dp_msg_type(devinfo, insn,
3026                               GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
3027
3028      if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3029         msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
3030   }
3031
3032   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3033}
3034
/**
 * Emit a typed atomic dataport message operating on \p surface.  As in
 * brw_untyped_atomic(), \p response_expected doubles as the channel count
 * passed to brw_surface_payload_size(): an atomic returns at most one
 * channel of data.
 */
void
brw_typed_atomic(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg payload,
                 struct brw_reg surface,
                 unsigned atomic_op,
                 unsigned msg_length,
                 bool response_expected) {
   const struct brw_device_info *devinfo = p->devinfo;
   /* Typed messages use the render cache before HSW, data cache 1 after. */
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, false),
      true);

   brw_set_dp_typed_atomic_message(
      p, insn, atomic_op, response_expected);
}
3059
3060static void
3061brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3062                                      struct brw_inst *insn,
3063                                      unsigned num_channels)
3064{
3065   const struct brw_device_info *devinfo = p->devinfo;
3066   /* Set mask of unused channels. */
3067   unsigned msg_control = 0xf & (0xf << num_channels);
3068
3069   if (devinfo->gen >= 8 || devinfo->is_haswell) {
3070      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3071         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3072            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3073         else
3074            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3075      }
3076
3077      brw_inst_set_dp_msg_type(devinfo, insn,
3078                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3079   } else {
3080      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3081         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3082            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3083      }
3084
3085      brw_inst_set_dp_msg_type(devinfo, insn,
3086                               GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3087   }
3088
3089   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3090}
3091
3092void
3093brw_typed_surface_read(struct brw_codegen *p,
3094                       struct brw_reg dst,
3095                       struct brw_reg payload,
3096                       struct brw_reg surface,
3097                       unsigned msg_length,
3098                       unsigned num_channels)
3099{
3100   const struct brw_device_info *devinfo = p->devinfo;
3101   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3102                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3103                          GEN6_SFID_DATAPORT_RENDER_CACHE);
3104   struct brw_inst *insn = brw_send_indirect_surface_message(
3105      p, sfid, dst, payload, surface, msg_length,
3106      brw_surface_payload_size(p, num_channels,
3107                               devinfo->gen >= 8 || devinfo->is_haswell, false),
3108      true);
3109
3110   brw_set_dp_typed_surface_read_message(
3111      p, insn, num_channels);
3112}
3113
3114static void
3115brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3116                                       struct brw_inst *insn,
3117                                       unsigned num_channels)
3118{
3119   const struct brw_device_info *devinfo = p->devinfo;
3120   /* Set mask of unused channels. */
3121   unsigned msg_control = 0xf & (0xf << num_channels);
3122
3123   if (devinfo->gen >= 8 || devinfo->is_haswell) {
3124      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3125         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3126            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3127         else
3128            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3129      }
3130
3131      brw_inst_set_dp_msg_type(devinfo, insn,
3132                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3133
3134   } else {
3135      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3136         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3137            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3138      }
3139
3140      brw_inst_set_dp_msg_type(devinfo, insn,
3141                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3142   }
3143
3144   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3145}
3146
3147void
3148brw_typed_surface_write(struct brw_codegen *p,
3149                        struct brw_reg payload,
3150                        struct brw_reg surface,
3151                        unsigned msg_length,
3152                        unsigned num_channels)
3153{
3154   const struct brw_device_info *devinfo = p->devinfo;
3155   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3156                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3157                          GEN6_SFID_DATAPORT_RENDER_CACHE);
3158   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3159   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3160   const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3161                          WRITEMASK_X : WRITEMASK_XYZW);
3162   struct brw_inst *insn = brw_send_indirect_surface_message(
3163      p, sfid, brw_writemask(brw_null_reg(), mask),
3164      payload, surface, msg_length, 0, true);
3165
3166   brw_set_dp_typed_surface_write_message(
3167      p, insn, num_channels);
3168}
3169
3170static void
3171brw_set_memory_fence_message(struct brw_codegen *p,
3172                             struct brw_inst *insn,
3173                             enum brw_message_target sfid,
3174                             bool commit_enable)
3175{
3176   const struct brw_device_info *devinfo = p->devinfo;
3177
3178   brw_set_message_descriptor(p, insn, sfid,
3179                              1 /* message length */,
3180                              (commit_enable ? 1 : 0) /* response length */,
3181                              true /* header present */,
3182                              false);
3183
3184   switch (sfid) {
3185   case GEN6_SFID_DATAPORT_RENDER_CACHE:
3186      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3187      break;
3188   case GEN7_SFID_DATAPORT_DATA_CACHE:
3189      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3190      break;
3191   default:
3192      unreachable("Not reached");
3193   }
3194
3195   if (commit_enable)
3196      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3197}
3198
/**
 * Emit a memory fence on the data cache and, on IVB (gen7 non-Haswell),
 * also on the render cache, followed by a MOV that stalls until both
 * fences have completed.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* Only IVB requests a commit write-back; its response registers are what
    * the stalling MOV below depends on.
    */
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, dst, offset(dst, 1));
      brw_pop_insn_state(p);
   }
}
3239
3240void
3241brw_pixel_interpolator_query(struct brw_codegen *p,
3242                             struct brw_reg dest,
3243                             struct brw_reg mrf,
3244                             bool noperspective,
3245                             unsigned mode,
3246                             struct brw_reg data,
3247                             unsigned msg_length,
3248                             unsigned response_length)
3249{
3250   const struct brw_device_info *devinfo = p->devinfo;
3251   struct brw_inst *insn;
3252   const uint16_t exec_size = brw_inst_exec_size(devinfo, p->current);
3253
3254   /* brw_send_indirect_message will automatically use a direct send message
3255    * if data is actually immediate.
3256    */
3257   insn = brw_send_indirect_message(p,
3258                                    GEN7_SFID_PIXEL_INTERPOLATOR,
3259                                    dest,
3260                                    mrf,
3261                                    vec1(data));
3262   brw_inst_set_mlen(devinfo, insn, msg_length);
3263   brw_inst_set_rlen(devinfo, insn, response_length);
3264
3265   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
3266   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
3267   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
3268   brw_inst_set_pi_message_type(devinfo, insn, mode);
3269}
3270
/**
 * Emit code that writes the index of the first active execution channel
 * into the first component of \p dst, using whatever mechanism the current
 * generation and access mode make available.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the mask register.  The same register exists
          * on HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         inst = brw_FBL(p, vec1(dst),
                        retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * this register.  Make sure it's set to zero.
          */
         brw_inst_set_qtr_control(devinfo, inst, GEN6_COMPRESSION_1Q);
      } else {
         const struct brw_reg flag = retype(brw_flag_reg(1, 0),
                                            BRW_REGISTER_TYPE_UD);

         /* Clear f1.0 so disabled channels contribute zero bits. */
         brw_MOV(p, flag, brw_imm_ud(0));

         /* Run a 16-wide instruction returning zero with execution masking
          * and a conditional modifier enabled in order to get the current
          * execution mask in f1.0.
          */
         inst = brw_MOV(p, brw_null_reg(), brw_imm_ud(0));
         brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_16);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* The first set bit of the recovered mask is the live channel. */
         brw_FBL(p, vec1(dst), flag);
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3342
/**
 * Copy the scalar component of \p src selected by \p idx into all enabled
 * channels of \p dst.
 *
 * \p src must be a direct-addressed GRF.  If the source is already uniform
 * or the index is an immediate, this collapses to a single MOV; otherwise
 * register-indirect addressing (Align1) or flag-predicated SEL (Align16)
 * is used to pick the component at runtime.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         /* Compute the byte offset of the selected component into a0 and
          * fetch it with a register-indirect MOV.
          */
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle1(idx, 0), 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 0, 4, 1),
                        stride(src, 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }
}
3417
3418/**
3419 * This instruction is generated as a single-channel align1 instruction by
3420 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3421 *
3422 * We can't use the typed atomic op in the FS because that has the execution
3423 * mask ANDed with the pixel mask, but we just want to write the one dword for
3424 * all the pixels.
3425 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
3427 * one u32.  So we use the same untyped atomic write message as the pixel
3428 * shader.
3429 *
3430 * The untyped atomic operation requires a BUFFER surface type with RAW
3431 * format, and is only accessible through the legacy DATA_CACHE dataport
3432 * messages.
3433 */
3434void brw_shader_time_add(struct brw_codegen *p,
3435                         struct brw_reg payload,
3436                         uint32_t surf_index)
3437{
3438   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
3439                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3440                          GEN7_SFID_DATAPORT_DATA_CACHE);
3441   assert(p->devinfo->gen >= 7);
3442
3443   brw_push_insn_state(p);
3444   brw_set_default_access_mode(p, BRW_ALIGN_1);
3445   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3446   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3447   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3448
3449   /* We use brw_vec1_reg and unmasked because we want to increment the given
3450    * offset only once.
3451    */
3452   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3453                                      BRW_ARF_NULL, 0));
3454   brw_set_src0(p, send, brw_vec1_reg(payload.file,
3455                                      payload.nr, 0));
3456   brw_set_src1(p, send, brw_imm_ud(0));
3457   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
3458   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
3459   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
3460
3461   brw_pop_insn_state(p);
3462}
3463
3464
3465/**
3466 * Emit the SEND message for a barrier
3467 */
3468void
3469brw_barrier(struct brw_codegen *p, struct brw_reg src)
3470{
3471   const struct brw_device_info *devinfo = p->devinfo;
3472   struct brw_inst *inst;
3473
3474   assert(devinfo->gen >= 7);
3475
3476   inst = next_insn(p, BRW_OPCODE_SEND);
3477   brw_set_dest(p, inst, brw_null_reg());
3478   brw_set_src0(p, inst, src);
3479   brw_set_src1(p, inst, brw_null_reg());
3480
3481   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3482                              1 /* msg_length */,
3483                              0 /* response_length */,
3484                              false /* header_present */,
3485                              false /* end_of_thread */);
3486
3487   brw_inst_set_gateway_notify(devinfo, inst, 1);
3488   brw_inst_set_gateway_subfuncid(devinfo, inst,
3489                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3490
3491   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3492}
3493
3494
3495/**
3496 * Emit the wait instruction for a barrier
3497 */
3498void
3499brw_WAIT(struct brw_codegen *p)
3500{
3501   const struct brw_device_info *devinfo = p->devinfo;
3502   struct brw_inst *insn;
3503
3504   struct brw_reg src = brw_notification_reg();
3505
3506   insn = next_insn(p, BRW_OPCODE_WAIT);
3507   brw_set_dest(p, insn, src);
3508   brw_set_src0(p, insn, src);
3509   brw_set_src1(p, insn, brw_null_reg());
3510
3511   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3512   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3513}
3514