/* brw_eu_emit.c — revision dd7290cf59206c49f1a322d53baa9957b13d2949 */
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "util/ralloc.h"
38
39/**
40 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
41 * registers, implicitly moving the operand to a message register.
42 *
43 * On Sandybridge, this is no longer the case.  This function performs the
44 * explicit move; it should be called before emitting a SEND instruction.
45 */
46void
47gen6_resolve_implied_move(struct brw_codegen *p,
48			  struct brw_reg *src,
49			  unsigned msg_reg_nr)
50{
51   const struct brw_device_info *devinfo = p->devinfo;
52   if (devinfo->gen < 6)
53      return;
54
55   if (src->file == BRW_MESSAGE_REGISTER_FILE)
56      return;
57
58   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
59      brw_push_insn_state(p);
60      brw_set_default_exec_size(p, BRW_EXECUTE_8);
61      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64	      retype(*src, BRW_REGISTER_TYPE_UD));
65      brw_pop_insn_state(p);
66   }
67   *src = brw_message_reg(msg_reg_nr);
68}
69
70static void
71gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72{
73   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74    * "The send with EOT should use register space R112-R127 for <src>. This is
75    *  to enable loading of a new thread into the same slot while the message
76    *  with EOT for current thread is pending dispatch."
77    *
78    * Since we're pretending to have 16 MRFs anyway, we may as well use the
79    * registers required for messages with EOT.
80    */
81   const struct brw_device_info *devinfo = p->devinfo;
82   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83      reg->file = BRW_GENERAL_REGISTER_FILE;
84      reg->nr += GEN7_MRF_HACK_START;
85   }
86}
87
88/**
89 * Convert a brw_reg_type enumeration value into the hardware representation.
90 *
91 * The hardware encoding may depend on whether the value is an immediate.
92 */
unsigned
brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
                        enum brw_reg_type type, unsigned file)
{
   if (file == BRW_IMMEDIATE_VALUE) {
      /* Immediate encodings.  UB/B have no immediate form, so they map to
       * -1 and are rejected by the assert below.
       */
      static const int imm_hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UB] = -1,
         [BRW_REGISTER_TYPE_B]  = -1,
         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(imm_hw_types));
      assert(imm_hw_types[type] != -1);
      /* DF/HF/Q/UQ immediate encodings only exist on Gen8+. */
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
      return imm_hw_types[type];
   } else {
      /* Non-immediate registers.  The packed-vector types (UV/VF/V) are
       * immediate-only and map to -1.
       */
      static const int hw_types[] = {
         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
         [BRW_REGISTER_TYPE_UV] = -1,
         [BRW_REGISTER_TYPE_VF] = -1,
         [BRW_REGISTER_TYPE_V]  = -1,
         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
      };
      assert(type < ARRAY_SIZE(hw_types));
      assert(hw_types[type] != -1);
      /* DF registers require Gen7+; HF registers require Gen8+. */
      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
      return hw_types[type];
   }
}
143
/**
 * Encode the destination operand fields of \p inst from \p dest.
 *
 * Handles direct and register-indirect addressing in both Align1 and
 * Align16 access modes, and may shrink the instruction's execution size to
 * match a narrow destination register.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
   brw_inst_set_dst_reg_type(devinfo, inst,
                             brw_reg_type_to_hw_type(devinfo, dest.type,
                                                     dest.file));
   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
         /* Stride 0 is not a valid destination horizontal stride; promote
          * it to 1.
          */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
         brw_inst_set_da16_writemask(devinfo, inst, dest.dw1.bits.writemask);
         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE) {
            assert(dest.dw1.bits.writemask != 0);
         }
         /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
          *    Although Dst.HorzStride is a don't care for Align16, HW needs
          *    this to be programmed as "01".
          */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   } else {
      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

      /* These are different sizes in align1 vs align16:
       */
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                       dest.dw1.bits.indirect_offset);
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
      } else {
         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                        dest.dw1.bits.indirect_offset);
         /* even ignored in da16, still need to set as '01' */
         brw_inst_set_dst_hstride(devinfo, inst, 1);
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, we automatically reduce it to match the register size.
    */
   if (dest.width < BRW_EXECUTE_8)
      brw_inst_set_exec_size(devinfo, inst, dest.width);
}
208
209extern int reg_type_size[];
210
211static void
212validate_reg(const struct brw_device_info *devinfo,
213             brw_inst *inst, struct brw_reg reg)
214{
215   const int hstride_for_reg[] = {0, 1, 2, 4};
216   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
217   const int width_for_reg[] = {1, 2, 4, 8, 16};
218   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
219   int width, hstride, vstride, execsize;
220
221   if (reg.file == BRW_IMMEDIATE_VALUE) {
222      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
223       * mean the destination has to be 128-bit aligned and the
224       * destination horiz stride has to be a word.
225       */
226      if (reg.type == BRW_REGISTER_TYPE_V) {
227         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
228                reg_type_size[brw_inst_dst_reg_type(devinfo, inst)] == 2);
229      }
230
231      return;
232   }
233
234   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
235       reg.file == BRW_ARF_NULL)
236      return;
237
238   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
239    *
240    *    "Swizzling is not allowed when an accumulator is used as an implicit
241    *    source or an explicit source in an instruction."
242    */
243   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
244       reg.nr == BRW_ARF_ACCUMULATOR)
245      assert(reg.dw1.bits.swizzle == BRW_SWIZZLE_XYZW);
246
247   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
248   hstride = hstride_for_reg[reg.hstride];
249
250   if (reg.vstride == 0xf) {
251      vstride = -1;
252   } else {
253      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
254      vstride = vstride_for_reg[reg.vstride];
255   }
256
257   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
258   width = width_for_reg[reg.width];
259
260   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
261          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
262   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
263
264   /* Restrictions from 3.3.10: Register Region Restrictions. */
265   /* 3. */
266   assert(execsize >= width);
267
268   /* 4. */
269   if (execsize == width && hstride != 0) {
270      assert(vstride == -1 || vstride == width * hstride);
271   }
272
273   /* 5. */
274   if (execsize == width && hstride == 0) {
275      /* no restriction on vstride. */
276   }
277
278   /* 6. */
279   if (width == 1) {
280      assert(hstride == 0);
281   }
282
283   /* 7. */
284   if (execsize == 1 && width == 1) {
285      assert(hstride == 0);
286      assert(vstride == 0);
287   }
288
289   /* 8. */
290   if (vstride == 0 && hstride == 0) {
291      assert(width == 1);
292   }
293
294   /* 10. Check destination issues. */
295}
296
/* Compacted instructions store the low 12 bits of an immediate verbatim and
 * replicate a single bit through the upper 20 bits, so a value is
 * representable iff those upper 20 bits are all zeros or all ones.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned upper_bits = imm >> 12;
   return upper_bits == 0 || upper_bits == 0xfffff;
}
306
/**
 * Encode the src0 operand fields of \p inst from \p reg.
 *
 * Handles immediates (including retyping tricks that make instructions
 * eligible for compaction), direct and register-indirect addressing, and
 * Align1 region / Align16 swizzle encoding.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src0_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud);

      /* The Bspec's section titled "Non-present Operands" claims that if src0
       * is an immediate that src1's type must be the same as that of src0.
       *
       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
       * that do not follow this rule. E.g., from the IVB/HSW table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
       *
       * And from the SNB table:
       *
       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
       *
       * Neither of these cause warnings from the simulator when used,
       * compacted or otherwise. In fact, all compaction mappings that have an
       * immediate in src0 use a:ud for src1.
       *
       * The GM45 instruction compaction tables do not contain mapped meanings
       * so it's not clear whether it has the restriction. We'll assume it was
       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
       */
      brw_inst_set_src1_reg_file(devinfo, inst, BRW_ARCHITECTURE_REGISTER_FILE);
      if (devinfo->gen < 6) {
         brw_inst_set_src1_reg_type(devinfo, inst,
                                    brw_inst_src0_reg_type(devinfo, inst));
      } else {
         brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }

      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
       * for immediate values. Presumably the hardware engineers realized
       * that the only useful floating-point value that could be represented
       * in this format is 0.0, which can also be represented as a VF-typed
       * immediate, so they gave us the previously mentioned mapping on IVB+.
       *
       * Strangely, we do have a mapping for imm:f in src1, so we don't need
       * to do this there.
       *
       * If we see a 0.0:F, change the type to VF so that it can be compacted.
       */
      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
      }

      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
       * set the types to :UD so the instruction can be compacted.
       */
      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
      }
   } else {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
             brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }
      } else {
         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset);
         } else {
            /* NOTE(review): this writes the indirect offset through the
             * ia_subreg_nr setter (clobbering the subnr written above)
             * rather than an ia16 address-immediate setter — looks
             * suspicious; confirm against the Align16 indirect encoding.
             */
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.dw1.bits.indirect_offset);
         }
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A scalar (width 1, exec size 1) source uses the <0;1,0> region. */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src0_width(devinfo, inst, reg.width);
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
      }
   }
}
445
446
/**
 * Encode the src1 operand fields of \p inst from \p reg.
 *
 * src1 may be an immediate; otherwise only direct addressing is supported,
 * and MRF/accumulator sources are rejected.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct brw_device_info *devinfo = p->devinfo;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
    *
    *    "Accumulator registers may be accessed explicitly as src0
    *    operands only."
    */
   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          reg.nr != BRW_ARF_ACCUMULATOR);

   gen7_convert_mrf_to_grf(p, &reg);
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   validate_reg(devinfo, inst, reg);

   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
   brw_inst_set_src1_reg_type(devinfo, inst,
                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
   brw_inst_set_src1_negate(devinfo, inst, reg.negate);

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud);
   } else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
      } else {
         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
      }

      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
         /* A scalar (width 1, exec size 1) source uses the <0;1,0> region. */
         if (reg.width == BRW_WIDTH_1 &&
             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
         } else {
            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
            brw_inst_set_src1_width(devinfo, inst, reg.width);
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
         }
      } else {
         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
         else
            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
      }
   }
}
525
526/**
527 * Set the Message Descriptor and Extended Message Descriptor fields
528 * for SEND messages.
529 *
530 * \note This zeroes out the Function Control bits, so it must be called
531 *       \b before filling out any message-specific data.  Callers can
532 *       choose not to fill in irrelevant bits; they will be zero.
533 */
static void
brw_set_message_descriptor(struct brw_codegen *p,
			   brw_inst *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Zero src1 (the descriptor dword) so message-specific bits start clean. */
   brw_set_src1(p, inst, brw_imm_d(0));

   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
    * itself; instead, it will be a MOV/OR into the address register.
    *
    * In this case, we avoid setting the extended message descriptor bits,
    * since they go on the later SEND/SENDC instead and if set here would
    * instead clobber the conditionalmod bits.
    */
   unsigned opcode = brw_inst_opcode(devinfo, inst);
   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
      brw_inst_set_sfid(devinfo, inst, sfid);
   }

   brw_inst_set_mlen(devinfo, inst, msg_length);
   brw_inst_set_rlen(devinfo, inst, response_length);
   brw_inst_set_eot(devinfo, inst, end_of_thread);

   /* The header-present bit only exists in the Gen5+ descriptor layout. */
   if (devinfo->gen >= 5) {
      brw_inst_set_header_present(devinfo, inst, header_present);
   }
}
567
568static void brw_set_math_message( struct brw_codegen *p,
569				  brw_inst *inst,
570				  unsigned function,
571				  unsigned integer_type,
572				  bool low_precision,
573				  unsigned dataType )
574{
575   const struct brw_device_info *devinfo = p->devinfo;
576   unsigned msg_length;
577   unsigned response_length;
578
579   /* Infer message length from the function */
580   switch (function) {
581   case BRW_MATH_FUNCTION_POW:
582   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
583   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
584   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
585      msg_length = 2;
586      break;
587   default:
588      msg_length = 1;
589      break;
590   }
591
592   /* Infer response length from the function */
593   switch (function) {
594   case BRW_MATH_FUNCTION_SINCOS:
595   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
596      response_length = 2;
597      break;
598   default:
599      response_length = 1;
600      break;
601   }
602
603
604   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
605			      msg_length, response_length, false, false);
606   brw_inst_set_math_msg_function(devinfo, inst, function);
607   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
608   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
609   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
610   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
611   brw_inst_set_saturate(devinfo, inst, 0);
612}
613
614
615static void brw_set_ff_sync_message(struct brw_codegen *p,
616				    brw_inst *insn,
617				    bool allocate,
618				    unsigned response_length,
619				    bool end_of_thread)
620{
621   const struct brw_device_info *devinfo = p->devinfo;
622
623   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
624			      1, response_length, true, end_of_thread);
625   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
626   brw_inst_set_urb_allocate(devinfo, insn, allocate);
627   /* The following fields are not used by FF_SYNC: */
628   brw_inst_set_urb_global_offset(devinfo, insn, 0);
629   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
630   brw_inst_set_urb_used(devinfo, insn, 0);
631   brw_inst_set_urb_complete(devinfo, insn, 0);
632}
633
/* Fill in the descriptor for a URB write message.  Several fields are
 * generation-gated: allocate/used exist only pre-Gen7, per-slot offset only
 * on Gen7+, and complete only pre-Gen8.
 */
static void brw_set_urb_message( struct brw_codegen *p,
				 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* Reject flag/swizzle combinations the target generation cannot encode. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true,
                              flags & BRW_URB_WRITE_EOT);

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
674
675void
676brw_set_dp_write_message(struct brw_codegen *p,
677			 brw_inst *insn,
678			 unsigned binding_table_index,
679			 unsigned msg_control,
680			 unsigned msg_type,
681			 unsigned msg_length,
682			 bool header_present,
683			 unsigned last_render_target,
684			 unsigned response_length,
685			 unsigned end_of_thread,
686			 unsigned send_commit_msg)
687{
688   const struct brw_device_info *devinfo = p->devinfo;
689   unsigned sfid;
690
691   if (devinfo->gen >= 7) {
692      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
693      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
694	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
695      else
696	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
697   } else if (devinfo->gen == 6) {
698      /* Use the render cache for all write messages. */
699      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
700   } else {
701      sfid = BRW_SFID_DATAPORT_WRITE;
702   }
703
704   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
705			      header_present, end_of_thread);
706
707   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
708   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
709   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
710   brw_inst_set_rt_last(devinfo, insn, last_render_target);
711   if (devinfo->gen < 7) {
712      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
713   }
714}
715
716void
717brw_set_dp_read_message(struct brw_codegen *p,
718			brw_inst *insn,
719			unsigned binding_table_index,
720			unsigned msg_control,
721			unsigned msg_type,
722			unsigned target_cache,
723			unsigned msg_length,
724                        bool header_present,
725			unsigned response_length)
726{
727   const struct brw_device_info *devinfo = p->devinfo;
728   unsigned sfid;
729
730   if (devinfo->gen >= 7) {
731      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
732   } else if (devinfo->gen == 6) {
733      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
734	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
735      else
736	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
737   } else {
738      sfid = BRW_SFID_DATAPORT_READ;
739   }
740
741   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
742			      header_present, false);
743
744   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
745   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
746   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
747   if (devinfo->gen < 6)
748      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
749}
750
751void
752brw_set_sampler_message(struct brw_codegen *p,
753                        brw_inst *inst,
754                        unsigned binding_table_index,
755                        unsigned sampler,
756                        unsigned msg_type,
757                        unsigned response_length,
758                        unsigned msg_length,
759                        unsigned header_present,
760                        unsigned simd_mode,
761                        unsigned return_format)
762{
763   const struct brw_device_info *devinfo = p->devinfo;
764
765   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
766			      response_length, header_present, false);
767
768   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
769   brw_inst_set_sampler(devinfo, inst, sampler);
770   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
771   if (devinfo->gen >= 5) {
772      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
773   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
774      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
775   }
776}
777
/* Fill in the descriptor for a Gen7+ scratch-space block read/write through
 * the data cache.
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* Block size must be a power of two; 8 registers only on Gen8+. */
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, false);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   /* Encode num_regs as log2: 1/2/4/8 -> 0/1/2/3. */
   brw_inst_set_scratch_block_size(devinfo, inst, ffs(num_regs) - 1);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
802
#define next_insn brw_next_insn
/**
 * Reserve space for one more instruction and return a pointer to it.
 *
 * The new instruction is initialized by copying p->current (the default
 * instruction state) and then setting its opcode.  The backing store is
 * doubled when full; note that reralloc may move p->store, so pointers
 * returned by earlier calls may be invalidated by a later call.
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (p->nr_insn + 1 > p->store_size) {
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Each native instruction is 16 bytes. */
   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   brw_inst_set_opcode(devinfo, insn, opcode);
   return insn;
}
822
823static brw_inst *
824brw_alu1(struct brw_codegen *p, unsigned opcode,
825         struct brw_reg dest, struct brw_reg src)
826{
827   brw_inst *insn = next_insn(p, opcode);
828   brw_set_dest(p, insn, dest);
829   brw_set_src0(p, insn, src);
830   return insn;
831}
832
833static brw_inst *
834brw_alu2(struct brw_codegen *p, unsigned opcode,
835         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
836{
837   brw_inst *insn = next_insn(p, opcode);
838   brw_set_dest(p, insn, dest);
839   brw_set_src0(p, insn, src0);
840   brw_set_src1(p, insn, src1);
841   return insn;
842}
843
844static int
845get_3src_subreg_nr(struct brw_reg reg)
846{
847   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
848      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
849      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
850   } else {
851      return reg.subnr / 4;
852   }
853}
854
/**
 * Emit a three-source instruction (MAD, LRP, BFE, BFI2).
 *
 * Only Align16 mode is supported.  The destination must be a direct GRF
 * (or MRF, which gen7_convert_mrf_to_grf may rewrite) with type F, D, or
 * UD; all three sources must be direct GRFs.  A source with vstride 0 is
 * encoded as a replicated scalar via the rep_ctrl bit.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F ||
          dest.type == BRW_REGISTER_TYPE_D ||
          dest.type == BRW_REGISTER_TYPE_UD);
   if (devinfo->gen == 6) {
      /* Gen6 has a dedicated field selecting GRF vs. MRF destination. */
      brw_inst_set_3src_dst_reg_file(devinfo, inst,
                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
   }
   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
   /* Destination subreg is encoded in 16-byte units. */
   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.dw1.bits.writemask);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.dw1.bits.swizzle);
   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
                                   src0.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.dw1.bits.swizzle);
   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
                                   src1.vstride == BRW_VERTICAL_STRIDE_0);

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.dw1.bits.swizzle);
   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
                                   src2.vstride == BRW_VERTICAL_STRIDE_0);

   if (devinfo->gen >= 7) {
      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types.  The MAD and LRP emitters ensure
       * that all four types are float.  The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      switch (dest.type) {
      case BRW_REGISTER_TYPE_F:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
         break;
      default:
         unreachable("not reached");
      }
   }

   return inst;
}
941
942
943/***********************************************************************
944 * Convenience routines.
945 */
/* Define brw_<OP>(p, dest, src0) as a thin wrapper over brw_alu1(). */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}
953
/* Define brw_<OP>(p, dest, src0, src1) as a thin wrapper over brw_alu2(). */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
962
/* Define brw_<OP>(p, dest, src0, src1, src2) as a wrapper over brw_alu3(). */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
972
/* Like ALU3, but asserts that all four operands are float-typed.  Used for
 * MAD and LRP, whose emitters must supply float operands (brw_alu3 encodes
 * the 3-src type field from dest.type alone).
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
986
987/* Rounding operations (other than RNDD) require two instructions - the first
988 * stores a rounded value (possibly the wrong way) in the dest register, but
989 * also sets a per-channel "increment bit" in the flag register.  A predicated
990 * add of 1.0 fixes dest to contain the desired result.
991 *
992 * Sandybridge and later appear to round correctly without an ADD.
993 */
/* Define brw_<OP>(p, dest, src): emit the rounding instruction and, on
 * pre-Gen6 hardware, the predicated ADD of 1.0 driven by the
 * round-increment conditional modifier (see the comment above).
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_codegen *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   const struct brw_device_info *devinfo = p->devinfo;					      \
   brw_inst *rnd, *add;							      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (devinfo->gen < 6) {							      \
      /* turn on round-increments */					      \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
   }									      \
}
1012
1013
/* Instantiate the convenience emitters defined by the macros above. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3F(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

/* RNDZ/RNDE need the pre-Gen6 round-increment fixup, hence ROUND. */
ROUND(RNDZ)
ROUND(RNDE)
1046
1047
1048brw_inst *
1049brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1050        struct brw_reg src0, struct brw_reg src1)
1051{
1052   /* 6.2.2: add */
1053   if (src0.type == BRW_REGISTER_TYPE_F ||
1054       (src0.file == BRW_IMMEDIATE_VALUE &&
1055	src0.type == BRW_REGISTER_TYPE_VF)) {
1056      assert(src1.type != BRW_REGISTER_TYPE_UD);
1057      assert(src1.type != BRW_REGISTER_TYPE_D);
1058   }
1059
1060   if (src1.type == BRW_REGISTER_TYPE_F ||
1061       (src1.file == BRW_IMMEDIATE_VALUE &&
1062	src1.type == BRW_REGISTER_TYPE_VF)) {
1063      assert(src0.type != BRW_REGISTER_TYPE_UD);
1064      assert(src0.type != BRW_REGISTER_TYPE_D);
1065   }
1066
1067   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1068}
1069
1070brw_inst *
1071brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1072        struct brw_reg src0, struct brw_reg src1)
1073{
1074   assert(dest.type == src0.type);
1075   assert(src0.type == src1.type);
1076   switch (src0.type) {
1077   case BRW_REGISTER_TYPE_B:
1078   case BRW_REGISTER_TYPE_UB:
1079   case BRW_REGISTER_TYPE_W:
1080   case BRW_REGISTER_TYPE_UW:
1081   case BRW_REGISTER_TYPE_D:
1082   case BRW_REGISTER_TYPE_UD:
1083      break;
1084   default:
1085      unreachable("Bad type for brw_AVG");
1086   }
1087
1088   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1089}
1090
1091brw_inst *
1092brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1093        struct brw_reg src0, struct brw_reg src1)
1094{
1095   /* 6.32.38: mul */
1096   if (src0.type == BRW_REGISTER_TYPE_D ||
1097       src0.type == BRW_REGISTER_TYPE_UD ||
1098       src1.type == BRW_REGISTER_TYPE_D ||
1099       src1.type == BRW_REGISTER_TYPE_UD) {
1100      assert(dest.type != BRW_REGISTER_TYPE_F);
1101   }
1102
1103   if (src0.type == BRW_REGISTER_TYPE_F ||
1104       (src0.file == BRW_IMMEDIATE_VALUE &&
1105	src0.type == BRW_REGISTER_TYPE_VF)) {
1106      assert(src1.type != BRW_REGISTER_TYPE_UD);
1107      assert(src1.type != BRW_REGISTER_TYPE_D);
1108   }
1109
1110   if (src1.type == BRW_REGISTER_TYPE_F ||
1111       (src1.file == BRW_IMMEDIATE_VALUE &&
1112	src1.type == BRW_REGISTER_TYPE_VF)) {
1113      assert(src0.type != BRW_REGISTER_TYPE_UD);
1114      assert(src0.type != BRW_REGISTER_TYPE_D);
1115   }
1116
1117   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1118	  src0.nr != BRW_ARF_ACCUMULATOR);
1119   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1120	  src1.nr != BRW_ARF_ACCUMULATOR);
1121
1122   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1123}
1124
1125brw_inst *
1126brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1127         struct brw_reg src0, struct brw_reg src1)
1128{
1129   src0.vstride = BRW_VERTICAL_STRIDE_0;
1130   src0.width = BRW_WIDTH_1;
1131   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1132   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1133}
1134
1135brw_inst *
1136brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1137        struct brw_reg src0, struct brw_reg src1)
1138{
1139   src0.vstride = BRW_VERTICAL_STRIDE_0;
1140   src0.width = BRW_WIDTH_1;
1141   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1142   src1.vstride = BRW_VERTICAL_STRIDE_8;
1143   src1.width = BRW_WIDTH_8;
1144   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1145   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1146}
1147
/**
 * Emit a float-to-half conversion into dst.
 *
 * On Gen8+ this is a converting MOV to an HF view of dst; on Gen7 it is
 * the dedicated F32TO16 instruction.  When dst is UD and the hardware
 * does not zero the upper 16 bits itself (Align1, or Gen8+), a second
 * MOV writes 0 to the odd W subwords, with NoDDClr/NoDDChk set on the
 * pair to suppress the dependency between the two writes.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* View the UD destination as W elements with stride 2 so the
       * conversion lands in the even subwords only.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Zero the odd subwords; dependency control lets the pair overlap. */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_ud(0u));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1194
/**
 * Emit a half-to-float conversion into dst.
 *
 * On Gen8+ this is a converting MOV from an HF view of src; on Gen7 it
 * is the dedicated F16TO32 instruction.  In Align1 mode a UD source is
 * reinterpreted as W with stride 2 per the IVB PRM note quoted below.
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct brw_device_info *devinfo = p->devinfo;
   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1225
1226
1227void brw_NOP(struct brw_codegen *p)
1228{
1229   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1230   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1231   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1232   brw_set_src1(p, insn, brw_imm_ud(0x0));
1233}
1234
1235
1236
1237
1238
1239/***********************************************************************
1240 * Comparisons, if/else/endif
1241 */
1242
/**
 * Emit a JMPI: IP = IP + index, optionally predicated.
 *
 * The jump distance comes from the 'index' operand (src1 of the ALU2
 * form).  The instruction is emitted with execution size 2, compression
 * off, and masking disabled; predication is whatever the caller passes.
 */
brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}
1258
/**
 * Push an IF/ELSE instruction onto the if stack.
 *
 * The instruction is stored as an offset from p->store rather than a
 * pointer, because next_insn() may reallocate the store.  The stack
 * array is grown after the write, which guarantees capacity for the
 * next push (array_size > depth holds on exit, hence on every entry).
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1271
1272static brw_inst *
1273pop_if_stack(struct brw_codegen *p)
1274{
1275   p->if_stack_depth--;
1276   return &p->store[p->if_stack[p->if_stack_depth]];
1277}
1278
1279static void
1280push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1281{
1282   if (p->loop_stack_array_size < p->loop_stack_depth) {
1283      p->loop_stack_array_size *= 2;
1284      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1285			       p->loop_stack_array_size);
1286      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1287				     p->loop_stack_array_size);
1288   }
1289
1290   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1291   p->loop_stack_depth++;
1292   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1293}
1294
1295static brw_inst *
1296get_inner_do_insn(struct brw_codegen *p)
1297{
1298   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1299}
1300
1301/* EU takes the value from the flag register and pushes it onto some
1302 * sort of a stack (presumably merging with any flag value already on
1303 * the stack).  Within an if block, the flags at the top of the stack
1304 * control execution on each channel of the unit, eg. on each of the
1305 * 16 pixel values in our wm programs.
1306 *
1307 * When the matching 'else' instruction is reached (presumably by
1308 * countdown of the instruction count patched in by our ELSE/ENDIF
1309 * functions), the relevant flags are inverted.
1310 *
1311 * When the matching 'endif' instruction is reached, the flags are
1312 * popped off.  If the stack is now empty, normal execution resumes.
1313 */
/**
 * Emit an IF with the per-generation operand encoding: pre-Gen6 uses an
 * IP-relative form with an immediate src1; Gen6 stores the jump count in
 * an immediate destination; Gen7 carries JIP/UIP with an immediate src1;
 * Gen8+ carries JIP/UIP with an immediate src0.  All jump fields start
 * at zero and are patched later by brw_ENDIF via patch_IF_ELSE.  The
 * instruction is pushed onto the if stack so brw_ENDIF can find it.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-Gen6 flow control requires a thread switch unless we are in
    * single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1357
1358/* This function is only used for gen6-style IF instructions with an
1359 * embedded comparison (conditional modifier).  It is not used on gen7.
1360 */
/**
 * Emit a Gen6 IF with an embedded comparison: src0 <conditional> src1.
 *
 * The jump count starts at zero and is patched later by brw_ENDIF via
 * patch_IF_ELSE.  The instruction is pushed onto the if stack (note: no
 * if_depth_in_loop bookkeeping here, unlike brw_IF).
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                   : BRW_EXECUTE_8);
   /* Patched later once the matching ENDIF is known. */
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1384
1385/**
1386 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1387 */
/*
 * if_inst must be an IF with execution size 1; else_inst may be NULL.
 * The jump distances are written as immediates in bytes (16 bytes per
 * instruction).
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Distances in bytes: 16 bytes per instruction. */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1425
1426/**
1427 * Patch IF and ELSE instructions with appropriate jump targets.
1428 */
/*
 * if_inst must be an IF; else_inst may be NULL; endif_inst must be an
 * ENDIF.  Jump distances are instruction counts scaled by
 * brw_jump_scale(), which converts them to the units the jump fields use.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct brw_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1516
/**
 * Emit an ELSE, using the same per-generation operand encoding as brw_IF.
 * Jump fields are left zero here and patched later by brw_ENDIF via
 * patch_IF_ELSE.  The instruction is pushed onto the if stack on top of
 * the matching IF so brw_ENDIF can recover both.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-Gen6 flow control requires a thread switch unless we are in
    * single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1554
/**
 * Close the innermost IF/ELSE block: emit an ENDIF and patch the jump
 * targets of the matching IF (and ELSE, if present) — or, pre-Gen6 in
 * single-program-flow mode, skip the ENDIF and convert the IF/ELSE into
 * conditional ADDs on IP instead.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encoding, mirroring brw_IF/brw_ELSE. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1635
1636brw_inst *
1637brw_BREAK(struct brw_codegen *p)
1638{
1639   const struct brw_device_info *devinfo = p->devinfo;
1640   brw_inst *insn;
1641
1642   insn = next_insn(p, BRW_OPCODE_BREAK);
1643   if (devinfo->gen >= 8) {
1644      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1645      brw_set_src0(p, insn, brw_imm_d(0x0));
1646   } else if (devinfo->gen >= 6) {
1647      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1648      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1649      brw_set_src1(p, insn, brw_imm_d(0x0));
1650   } else {
1651      brw_set_dest(p, insn, brw_ip_reg());
1652      brw_set_src0(p, insn, brw_ip_reg());
1653      brw_set_src1(p, insn, brw_imm_d(0x0));
1654      brw_inst_set_gen4_pop_count(devinfo, insn,
1655                                  p->if_depth_in_loop[p->loop_stack_depth]);
1656   }
1657   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1658   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
1659                                                   : BRW_EXECUTE_8);
1660
1661   return insn;
1662}
1663
1664brw_inst *
1665brw_CONT(struct brw_codegen *p)
1666{
1667   const struct brw_device_info *devinfo = p->devinfo;
1668   brw_inst *insn;
1669
1670   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1671   brw_set_dest(p, insn, brw_ip_reg());
1672   if (devinfo->gen >= 8) {
1673      brw_set_src0(p, insn, brw_imm_d(0x0));
1674   } else {
1675      brw_set_src0(p, insn, brw_ip_reg());
1676      brw_set_src1(p, insn, brw_imm_d(0x0));
1677   }
1678
1679   if (devinfo->gen < 6) {
1680      brw_inst_set_gen4_pop_count(devinfo, insn,
1681                                  p->if_depth_in_loop[p->loop_stack_depth]);
1682   }
1683   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1684   brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
1685                                                   : BRW_EXECUTE_8);
1686   return insn;
1687}
1688
1689brw_inst *
1690gen6_HALT(struct brw_codegen *p)
1691{
1692   const struct brw_device_info *devinfo = p->devinfo;
1693   brw_inst *insn;
1694
1695   insn = next_insn(p, BRW_OPCODE_HALT);
1696   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1697   if (devinfo->gen >= 8) {
1698      brw_set_src0(p, insn, brw_imm_d(0x0));
1699   } else {
1700      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1701      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1702   }
1703
1704   if (p->compressed) {
1705      brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_16);
1706   } else {
1707      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1708      brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_8);
1709   }
1710   return insn;
1711}
1712
1713/* DO/WHILE loop:
1714 *
1715 * The DO/WHILE is just an unterminated loop -- break or continue are
1716 * used for control within the loop.  We have a few ways they can be
1717 * done.
1718 *
1719 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1720 * jip and no DO instruction.
1721 *
1722 * For non-uniform control flow pre-gen6, there's a DO instruction to
1723 * push the mask, and a WHILE to jump back, and BREAK to get out and
1724 * pop the mask.
1725 *
1726 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1727 * just points back to the first instruction of the loop.
1728 */
1729brw_inst *
1730brw_DO(struct brw_codegen *p, unsigned execute_size)
1731{
1732   const struct brw_device_info *devinfo = p->devinfo;
1733
1734   if (devinfo->gen >= 6 || p->single_program_flow) {
1735      push_loop_stack(p, &p->store[p->nr_insn]);
1736      return &p->store[p->nr_insn];
1737   } else {
1738      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1739
1740      push_loop_stack(p, insn);
1741
1742      /* Override the defaults for this instruction:
1743       */
1744      brw_set_dest(p, insn, brw_null_reg());
1745      brw_set_src0(p, insn, brw_null_reg());
1746      brw_set_src1(p, insn, brw_null_reg());
1747
1748      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1749      brw_inst_set_exec_size(devinfo, insn, execute_size);
1750      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1751
1752      return insn;
1753   }
1754}
1755
1756/**
1757 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1758 * instruction here.
1759 *
1760 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1761 * nesting, since it can always just point to the end of the block/current loop.
1762 */
1763static void
1764brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1765{
1766   const struct brw_device_info *devinfo = p->devinfo;
1767   brw_inst *do_inst = get_inner_do_insn(p);
1768   brw_inst *inst;
1769   unsigned br = brw_jump_scale(devinfo);
1770
1771   assert(devinfo->gen < 6);
1772
1773   for (inst = while_inst - 1; inst != do_inst; inst--) {
1774      /* If the jump count is != 0, that means that this instruction has already
1775       * been patched because it's part of a loop inside of the one we're
1776       * patching.
1777       */
1778      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1779          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1780         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1781      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1782                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1783         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1784      }
1785   }
1786}
1787
/**
 * Emit the WHILE that closes the innermost DO/WHILE loop, fix up its jump
 * target(s), and pop the loop stack.  On pre-gen6 this also patches the
 * BREAK/CONT instructions inside the loop (see brw_patch_break_cont()).
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      /* Gen6+ has no DO instruction; the loop-stack entry points at the
       * first instruction of the loop body, and the backward jump goes into
       * JIP (gen7+) or the gen6 jump count field.
       */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, p->compressed ? BRW_EXECUTE_16
                                                      : BRW_EXECUTE_8);
   } else {
      if (p->single_program_flow) {
         /* Uniform control flow: the WHILE degenerates to an unconditional
          * backward jump, i.e. an ADD to the IP register.  The immediate is
          * in bytes: (do_insn - insn) instructions * 16 bytes each.
          */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

         /* WHILE inherits the DO's execution size; the +1 jumps to the
          * instruction just after the DO marker.
          */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

	 brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1849
1850/* FORWARD JUMPS:
1851 */
1852void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1853{
1854   const struct brw_device_info *devinfo = p->devinfo;
1855   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1856   unsigned jmpi = 1;
1857
1858   if (devinfo->gen >= 5)
1859      jmpi = 2;
1860
1861   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1862   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1863
1864   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1865                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
1866}
1867
1868/* To integrate with the above, it makes sense that the comparison
1869 * instruction should populate the flag register.  It might be simpler
1870 * just to use the flag reg for most WM tasks?
1871 */
1872void brw_CMP(struct brw_codegen *p,
1873	     struct brw_reg dest,
1874	     unsigned conditional,
1875	     struct brw_reg src0,
1876	     struct brw_reg src1)
1877{
1878   const struct brw_device_info *devinfo = p->devinfo;
1879   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1880
1881   brw_inst_set_cond_modifier(devinfo, insn, conditional);
1882   brw_set_dest(p, insn, dest);
1883   brw_set_src0(p, insn, src0);
1884   brw_set_src1(p, insn, src1);
1885
1886   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1887    * page says:
1888    *    "Any CMP instruction with a null destination must use a {switch}."
1889    *
1890    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1891    * mentioned on their work-arounds pages.
1892    */
1893   if (devinfo->gen == 7) {
1894      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1895          dest.nr == BRW_ARF_NULL) {
1896         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1897      }
1898   }
1899}
1900
1901/***********************************************************************
1902 * Helpers for the various SEND message types:
1903 */
1904
1905/** Extended math function, float[8].
1906 */
1907void gen4_math(struct brw_codegen *p,
1908	       struct brw_reg dest,
1909	       unsigned function,
1910	       unsigned msg_reg_nr,
1911	       struct brw_reg src,
1912	       unsigned precision )
1913{
1914   const struct brw_device_info *devinfo = p->devinfo;
1915   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1916   unsigned data_type;
1917   if (has_scalar_region(src)) {
1918      data_type = BRW_MATH_DATA_SCALAR;
1919   } else {
1920      data_type = BRW_MATH_DATA_VECTOR;
1921   }
1922
1923   assert(devinfo->gen < 6);
1924
1925   /* Example code doesn't set predicate_control for send
1926    * instructions.
1927    */
1928   brw_inst_set_pred_control(devinfo, insn, 0);
1929   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1930
1931   brw_set_dest(p, insn, dest);
1932   brw_set_src0(p, insn, src);
1933   brw_set_math_message(p,
1934                        insn,
1935                        function,
1936                        src.type == BRW_REGISTER_TYPE_D,
1937                        precision,
1938                        data_type);
1939}
1940
/**
 * Extended math function for Gen6+, where math is a native EU instruction
 * (BRW_OPCODE_MATH) rather than a message to a shared unit.
 *
 * The asserts encode the per-generation operand restrictions visible in the
 * branches below: allowed register files, required horizontal strides on
 * gen6, integer-only sources for the INT_DIV variants and float-only sources
 * for everything else.
 */
void gen6_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->gen >= 6);

   /* Destination must be a GRF (or an MRF on gen7+); src0 may be an
    * immediate only on gen8+.
    */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
   assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->gen >= 8 && src0.file == BRW_IMMEDIATE_VALUE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division variants take integer sources only. */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
   } else {
      /* All other math functions are float-only; of them, only POW reads
       * src1 — the rest must pass the null register.
       */
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
      if (function == BRW_MATH_FUNCTION_POW) {
         assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
                (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
      } else {
         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
                src1.nr == BRW_ARF_NULL);
      }
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (devinfo->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1996
1997
1998/**
1999 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2000 * using a constant offset per channel.
2001 *
2002 * The offset must be aligned to oword size (16 bytes).  Used for
2003 * register spilling.
2004 */
2005void brw_oword_block_write_scratch(struct brw_codegen *p,
2006				   struct brw_reg mrf,
2007				   int num_regs,
2008				   unsigned offset)
2009{
2010   const struct brw_device_info *devinfo = p->devinfo;
2011   uint32_t msg_control, msg_type;
2012   int mlen;
2013
2014   if (devinfo->gen >= 6)
2015      offset /= 16;
2016
2017   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2018
2019   if (num_regs == 1) {
2020      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2021      mlen = 2;
2022   } else {
2023      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2024      mlen = 3;
2025   }
2026
2027   /* Set up the message header.  This is g0, with g0.2 filled with
2028    * the offset.  We don't want to leave our offset around in g0 or
2029    * it'll screw up texture samples, so set it up inside the message
2030    * reg.
2031    */
2032   {
2033      brw_push_insn_state(p);
2034      brw_set_default_exec_size(p, BRW_EXECUTE_8);
2035      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2036      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2037
2038      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2039
2040      /* set message header global offset field (reg 0, element 2) */
2041      brw_MOV(p,
2042	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2043				  mrf.nr,
2044				  2), BRW_REGISTER_TYPE_UD),
2045	      brw_imm_ud(offset));
2046
2047      brw_pop_insn_state(p);
2048   }
2049
2050   {
2051      struct brw_reg dest;
2052      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2053      int send_commit_msg;
2054      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2055					 BRW_REGISTER_TYPE_UW);
2056
2057      if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_NONE) {
2058         brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2059	 src_header = vec16(src_header);
2060      }
2061      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2062      if (devinfo->gen < 6)
2063         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2064
2065      /* Until gen6, writes followed by reads from the same location
2066       * are not guaranteed to be ordered unless write_commit is set.
2067       * If set, then a no-op write is issued to the destination
2068       * register to set a dependency, and a read from the destination
2069       * can be used to ensure the ordering.
2070       *
2071       * For gen6, only writes between different threads need ordering
2072       * protection.  Our use of DP writes is all about register
2073       * spilling within a thread.
2074       */
2075      if (devinfo->gen >= 6) {
2076	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2077	 send_commit_msg = 0;
2078      } else {
2079	 dest = src_header;
2080	 send_commit_msg = 1;
2081      }
2082
2083      brw_set_dest(p, insn, dest);
2084      if (devinfo->gen >= 6) {
2085	 brw_set_src0(p, insn, mrf);
2086      } else {
2087	 brw_set_src0(p, insn, brw_null_reg());
2088      }
2089
2090      if (devinfo->gen >= 6)
2091	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2092      else
2093	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2094
2095      brw_set_dp_write_message(p,
2096			       insn,
2097			       255, /* binding table index (255=stateless) */
2098			       msg_control,
2099			       msg_type,
2100			       mlen,
2101			       true, /* header_present */
2102			       0, /* not a render target */
2103			       send_commit_msg, /* response_length */
2104			       0, /* eot */
2105			       send_commit_msg);
2106   }
2107}
2108
2109
2110/**
2111 * Read a block of owords (half a GRF each) from the scratch buffer
2112 * using a constant index per channel.
2113 *
2114 * Offset must be aligned to oword size (16 bytes).  Used for register
2115 * spilling.
2116 */
2117void
2118brw_oword_block_read_scratch(struct brw_codegen *p,
2119			     struct brw_reg dest,
2120			     struct brw_reg mrf,
2121			     int num_regs,
2122			     unsigned offset)
2123{
2124   const struct brw_device_info *devinfo = p->devinfo;
2125   uint32_t msg_control;
2126   int rlen;
2127
2128   if (devinfo->gen >= 6)
2129      offset /= 16;
2130
2131   if (p->devinfo->gen >= 7) {
2132      /* On gen 7 and above, we no longer have message registers and we can
2133       * send from any register we want.  By using the destination register
2134       * for the message, we guarantee that the implied message write won't
2135       * accidentally overwrite anything.  This has been a problem because
2136       * the MRF registers and source for the final FB write are both fixed
2137       * and may overlap.
2138       */
2139      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2140   } else {
2141      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2142   }
2143   dest = retype(dest, BRW_REGISTER_TYPE_UW);
2144
2145   if (num_regs == 1) {
2146      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2147      rlen = 1;
2148   } else {
2149      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2150      rlen = 2;
2151   }
2152
2153   {
2154      brw_push_insn_state(p);
2155      brw_set_default_exec_size(p, BRW_EXECUTE_8);
2156      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2157      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2158
2159      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2160
2161      /* set message header global offset field (reg 0, element 2) */
2162      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2163
2164      brw_pop_insn_state(p);
2165   }
2166
2167   {
2168      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2169
2170      assert(brw_inst_pred_control(devinfo, insn) == 0);
2171      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2172
2173      brw_set_dest(p, insn, dest);	/* UW? */
2174      if (devinfo->gen >= 6) {
2175	 brw_set_src0(p, insn, mrf);
2176      } else {
2177	 brw_set_src0(p, insn, brw_null_reg());
2178         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2179      }
2180
2181      brw_set_dp_read_message(p,
2182			      insn,
2183			      255, /* binding table index (255=stateless) */
2184			      msg_control,
2185			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2186			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2187			      1, /* msg_length */
2188                              true, /* header_present */
2189			      rlen);
2190   }
2191}
2192
2193void
2194gen7_block_read_scratch(struct brw_codegen *p,
2195                        struct brw_reg dest,
2196                        int num_regs,
2197                        unsigned offset)
2198{
2199   const struct brw_device_info *devinfo = p->devinfo;
2200   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2201   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2202
2203   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2204   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2205
2206   /* The HW requires that the header is present; this is to get the g0.5
2207    * scratch offset.
2208    */
2209   brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2210
2211   /* According to the docs, offset is "A 12-bit HWord offset into the memory
2212    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2213    * is 32 bytes, which happens to be the size of a register.
2214    */
2215   offset /= REG_SIZE;
2216   assert(offset < (1 << 12));
2217
2218   gen7_set_dp_scratch_message(p, insn,
2219                               false, /* scratch read */
2220                               false, /* OWords */
2221                               false, /* invalidate after read */
2222                               num_regs,
2223                               offset,
2224                               1,        /* mlen: just g0 */
2225                               num_regs, /* rlen */
2226                               true);    /* header present */
2227}
2228
2229/**
2230 * Read a float[4] vector from the data port Data Cache (const buffer).
2231 * Location (in buffer) should be a multiple of 16.
2232 * Used for fetching shader constants.
2233 */
2234void brw_oword_block_read(struct brw_codegen *p,
2235			  struct brw_reg dest,
2236			  struct brw_reg mrf,
2237			  uint32_t offset,
2238			  uint32_t bind_table_index)
2239{
2240   const struct brw_device_info *devinfo = p->devinfo;
2241
2242   /* On newer hardware, offset is in units of owords. */
2243   if (devinfo->gen >= 6)
2244      offset /= 16;
2245
2246   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2247
2248   brw_push_insn_state(p);
2249   brw_set_default_exec_size(p, BRW_EXECUTE_8);
2250   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2251   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2252   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2253
2254   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2255
2256   /* set message header global offset field (reg 0, element 2) */
2257   brw_MOV(p,
2258	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2259			       mrf.nr,
2260			       2), BRW_REGISTER_TYPE_UD),
2261	   brw_imm_ud(offset));
2262
2263   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2264
2265   /* cast dest to a uword[8] vector */
2266   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2267
2268   brw_set_dest(p, insn, dest);
2269   if (devinfo->gen >= 6) {
2270      brw_set_src0(p, insn, mrf);
2271   } else {
2272      brw_set_src0(p, insn, brw_null_reg());
2273      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2274   }
2275
2276   brw_set_dp_read_message(p,
2277			   insn,
2278			   bind_table_index,
2279			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2280			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2281			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2282			   1, /* msg_length */
2283                           true, /* header_present */
2284			   1); /* response_length (1 reg, 2 owords!) */
2285
2286   brw_pop_insn_state(p);
2287}
2288
2289
2290void brw_fb_WRITE(struct brw_codegen *p,
2291		  int dispatch_width,
2292                  struct brw_reg payload,
2293                  struct brw_reg implied_header,
2294                  unsigned msg_control,
2295                  unsigned binding_table_index,
2296                  unsigned msg_length,
2297                  unsigned response_length,
2298                  bool eot,
2299                  bool last_render_target,
2300                  bool header_present)
2301{
2302   const struct brw_device_info *devinfo = p->devinfo;
2303   brw_inst *insn;
2304   unsigned msg_type;
2305   struct brw_reg dest, src0;
2306
2307   if (dispatch_width == 16)
2308      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2309   else
2310      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2311
2312   if (devinfo->gen >= 6) {
2313      insn = next_insn(p, BRW_OPCODE_SENDC);
2314   } else {
2315      insn = next_insn(p, BRW_OPCODE_SEND);
2316   }
2317   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2318
2319   if (devinfo->gen >= 6) {
2320      /* headerless version, just submit color payload */
2321      src0 = payload;
2322
2323      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2324   } else {
2325      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2326      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2327      src0 = implied_header;
2328
2329      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2330   }
2331
2332   brw_set_dest(p, insn, dest);
2333   brw_set_src0(p, insn, src0);
2334   brw_set_dp_write_message(p,
2335			    insn,
2336			    binding_table_index,
2337			    msg_control,
2338			    msg_type,
2339			    msg_length,
2340			    header_present,
2341			    last_render_target,
2342			    response_length,
2343			    eot,
2344			    0 /* send_commit_msg */);
2345}
2346
2347
2348/**
2349 * Texture sample instruction.
2350 * Note: the msg_type plus msg_length values determine exactly what kind
2351 * of sampling operation is performed.  See volume 4, page 161 of docs.
2352 */
2353void brw_SAMPLE(struct brw_codegen *p,
2354		struct brw_reg dest,
2355		unsigned msg_reg_nr,
2356		struct brw_reg src0,
2357		unsigned binding_table_index,
2358		unsigned sampler,
2359		unsigned msg_type,
2360		unsigned response_length,
2361		unsigned msg_length,
2362		unsigned header_present,
2363		unsigned simd_mode,
2364		unsigned return_format)
2365{
2366   const struct brw_device_info *devinfo = p->devinfo;
2367   brw_inst *insn;
2368
2369   if (msg_reg_nr != -1)
2370      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2371
2372   insn = next_insn(p, BRW_OPCODE_SEND);
2373   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2374
2375   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2376    *
2377    *    "Instruction compression is not allowed for this instruction (that
2378    *     is, send). The hardware behavior is undefined if this instruction is
2379    *     set as compressed. However, compress control can be set to "SecHalf"
2380    *     to affect the EMask generation."
2381    *
2382    * No similar wording is found in later PRMs, but there are examples
2383    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2384    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2385    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2386    */
2387   if (brw_inst_qtr_control(devinfo, insn) != BRW_COMPRESSION_2NDHALF)
2388      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
2389
2390   if (devinfo->gen < 6)
2391      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2392
2393   brw_set_dest(p, insn, dest);
2394   brw_set_src0(p, insn, src0);
2395   brw_set_sampler_message(p, insn,
2396                           binding_table_index,
2397                           sampler,
2398                           msg_type,
2399                           response_length,
2400                           msg_length,
2401                           header_present,
2402                           simd_mode,
2403                           return_format);
2404}
2405
/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct brw_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.dw1.ud;

      /* Only sampler indices >= 16 need the pointer adjusted; that is only
       * legal on Haswell and gen8+.
       */
      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* (sampler & 0xf0) << 4 computes the same byte offset as the
       * immediate path above: 16 * (sampler / 16) * 16.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2451
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
/**
 * Emit a URB write SEND.
 *
 * \param msg_reg_nr      base MRF of the payload (src0 is moved there if
 *                        needed via gen6_resolve_implied_move())
 * \param flags           brw_urb_write_flags controlling the message header
 * \param offset, swizzle passed through to brw_set_urb_message()
 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2502
/**
 * Emit a SEND whose message descriptor is either an immediate or comes from
 * a register.
 *
 * For a register descriptor, the value is first OR'd into address register
 * a0 so the caller can still fold extra descriptor bits in afterwards.
 *
 * \return the instruction carrying the descriptor (the OR for the indirect
 *         case, the SEND itself for the immediate case) so callers can apply
 *         the brw_set_*_message() helpers to it.
 */
struct brw_inst *
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *send, *setup;

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      setup = send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, desc);

   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the usual
       * brw_set_*_message() helper functions.
       */
      setup = brw_OR(p, addr, desc, brw_imm_ud(0));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
   brw_inst_set_sfid(devinfo, send, sfid);

   return setup;
}
2545
/**
 * Emit a surface data port SEND with a possibly-indirect surface index.
 *
 * A non-immediate \p surface is ANDed with 0xff into address register a0.0
 * and the message descriptor is then taken indirectly from a0.0 (via
 * brw_send_indirect_message()).
 *
 * Returns the instruction carrying the message descriptor, with message
 * length, response length and header-present already set; the caller fills
 * in the message-specific type and control bits.
 */
static struct brw_inst *
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned message_len,
                                  unsigned response_len,
                                  bool header_present)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      insn = brw_AND(p, addr,
                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                               BRW_GET_SWZ(surface.dw1.bits.swizzle, 0)),
                     brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
   brw_inst_set_mlen(devinfo, insn, message_len);
   brw_inst_set_rlen(devinfo, insn, response_len);
   brw_inst_set_header_present(devinfo, insn, header_present);

   return insn;
}
2587
2588static int
2589brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2590{
2591   int offset;
2592   void *store = p->store;
2593   const struct brw_device_info *devinfo = p->devinfo;
2594
2595   for (offset = next_offset(devinfo, store, start_offset);
2596        offset < p->next_insn_offset;
2597        offset = next_offset(devinfo, store, offset)) {
2598      brw_inst *insn = store + offset;
2599
2600      switch (brw_inst_opcode(devinfo, insn)) {
2601      case BRW_OPCODE_ENDIF:
2602      case BRW_OPCODE_ELSE:
2603      case BRW_OPCODE_WHILE:
2604      case BRW_OPCODE_HALT:
2605	 return offset;
2606      }
2607   }
2608
2609   return 0;
2610}
2611
2612/* There is no DO instruction on gen6, so to find the end of the loop
2613 * we have to see if the loop is jumping back before our start
2614 * instruction.
2615 */
2616static int
2617brw_find_loop_end(struct brw_codegen *p, int start_offset)
2618{
2619   const struct brw_device_info *devinfo = p->devinfo;
2620   int offset;
2621   int scale = 16 / brw_jump_scale(devinfo);
2622   void *store = p->store;
2623
2624   assert(devinfo->gen >= 6);
2625
2626   /* Always start after the instruction (such as a WHILE) we're trying to fix
2627    * up.
2628    */
2629   for (offset = next_offset(devinfo, store, start_offset);
2630        offset < p->next_insn_offset;
2631        offset = next_offset(devinfo, store, offset)) {
2632      brw_inst *insn = store + offset;
2633
2634      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2635         int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2636                                     : brw_inst_jip(devinfo, insn);
2637	 if (offset + jip * scale <= start_offset)
2638	    return offset;
2639      }
2640   }
2641   assert(!"not reached");
2642   return start_offset;
2643}
2644
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p)
{
   const struct brw_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   /* Bytes of instruction store per unit of JIP/UIP. */
   int scale = 16 / br;
   void *store = p->store;

   /* Gen4-5 use different control-flow fixups; nothing to do here. */
   if (devinfo->gen < 6)
      return;

   for (offset = 0; offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_cmpt_control(devinfo, insn)) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(brw_inst_opcode(devinfo, insn) != BRW_OPCODE_BREAK &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_CONTINUE &&
                brw_inst_opcode(devinfo, insn) != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         /* JIP: end of the innermost enclosing block. */
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
	    (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* UIP: the WHILE closing the loop being continued. */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      case BRW_OPCODE_ENDIF: {
         /* With no following block end, jump one instruction forward. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
	 break;
      }

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
	 } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;
      }
   }
}
2725
/**
 * Emit a SEND carrying an FF_SYNC URB message.
 *
 * \param dest            destination register for the message response
 * \param msg_reg_nr      first MRF for the payload; on gen6 the payload is
 *                        moved there by gen6_resolve_implied_move()
 * \param src0            message payload source
 * \param allocate        "allocate" flag forwarded to brw_set_ff_sync_message()
 * \param response_length response length in registers
 * \param eot             end-of-thread flag forwarded to
 *                        brw_set_ff_sync_message()
 */
void brw_ff_sync(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   unsigned response_length,
		   bool eot)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6 SEND takes the payload from MRF space; record the base MRF. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2753
2754/**
2755 * Emit the SEND instruction necessary to generate stream output data on Gen6
2756 * (for transform feedback).
2757 *
2758 * If send_commit_msg is true, this is the last piece of stream output data
2759 * from this thread, so send the data as a committed write.  According to the
2760 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2761 *
2762 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2763 *   writes are complete by sending the final write as a committed write."
2764 */
2765void
2766brw_svb_write(struct brw_codegen *p,
2767              struct brw_reg dest,
2768              unsigned msg_reg_nr,
2769              struct brw_reg src0,
2770              unsigned binding_table_index,
2771              bool   send_commit_msg)
2772{
2773   brw_inst *insn;
2774
2775   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2776
2777   insn = next_insn(p, BRW_OPCODE_SEND);
2778   brw_set_dest(p, insn, dest);
2779   brw_set_src0(p, insn, src0);
2780   brw_set_src1(p, insn, brw_imm_d(0));
2781   brw_set_dp_write_message(p, insn,
2782                            binding_table_index,
2783                            0, /* msg_control: ignored */
2784                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2785                            1, /* msg_length */
2786                            true, /* header_present */
2787                            0, /* last_render_target: ignored */
2788                            send_commit_msg, /* response_length */
2789                            0, /* end_of_thread */
2790                            send_commit_msg); /* send_commit_msg */
2791}
2792
2793static unsigned
2794brw_surface_payload_size(struct brw_codegen *p,
2795                         unsigned num_channels,
2796                         bool has_simd4x2,
2797                         bool has_simd16)
2798{
2799   if (has_simd4x2 && brw_inst_access_mode(p->devinfo, p->current) == BRW_ALIGN_16)
2800      return 1;
2801   else if (has_simd16 && p->compressed)
2802      return 2 * num_channels;
2803   else
2804      return num_channels;
2805}
2806
2807static void
2808brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
2809                                  brw_inst *insn,
2810                                  unsigned atomic_op,
2811                                  bool response_expected)
2812{
2813   const struct brw_device_info *devinfo = p->devinfo;
2814   unsigned msg_control =
2815      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2816      (response_expected ? 1 << 5 : 0); /* Return data expected */
2817
2818   if (devinfo->gen >= 8 || devinfo->is_haswell) {
2819      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2820         if (!p->compressed)
2821            msg_control |= 1 << 4; /* SIMD8 mode */
2822
2823         brw_inst_set_dp_msg_type(devinfo, insn,
2824                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2825      } else {
2826         brw_inst_set_dp_msg_type(devinfo, insn,
2827            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2828      }
2829   } else {
2830      brw_inst_set_dp_msg_type(devinfo, insn,
2831                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2832
2833      if (!p->compressed)
2834         msg_control |= 1 << 4; /* SIMD8 mode */
2835   }
2836
2837   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2838}
2839
/**
 * Emit an untyped surface atomic operation.
 *
 * \param dst               destination register for the message response
 * \param payload           message payload source
 * \param surface           surface index, immediate or indirect
 * \param atomic_op         BRW_AOP_* atomic operation type
 * \param msg_length        payload length in registers
 * \param response_expected whether return data is requested
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* Note: response_expected (0 or 1) doubles as the channel count for the
    * response-length computation.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, true),
      align1);

   brw_set_dp_untyped_atomic_message(
      p, insn, atomic_op, response_expected);
}
2870
2871static void
2872brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
2873                                        struct brw_inst *insn,
2874                                        unsigned num_channels)
2875{
2876   const struct brw_device_info *devinfo = p->devinfo;
2877   /* Set mask of 32-bit channels to drop. */
2878   unsigned msg_control = 0xf & (0xf << num_channels);
2879
2880   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2881      if (p->compressed)
2882         msg_control |= 1 << 4; /* SIMD16 mode */
2883      else
2884         msg_control |= 2 << 4; /* SIMD8 mode */
2885   }
2886
2887   brw_inst_set_dp_msg_type(devinfo, insn,
2888                            (devinfo->gen >= 8 || devinfo->is_haswell ?
2889                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
2890                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
2891   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2892}
2893
2894void
2895brw_untyped_surface_read(struct brw_codegen *p,
2896                         struct brw_reg dst,
2897                         struct brw_reg payload,
2898                         struct brw_reg surface,
2899                         unsigned msg_length,
2900                         unsigned num_channels)
2901{
2902   const struct brw_device_info *devinfo = p->devinfo;
2903   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2904                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
2905                          GEN7_SFID_DATAPORT_DATA_CACHE);
2906   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
2907   struct brw_inst *insn = brw_send_indirect_surface_message(
2908      p, sfid, dst, payload, surface, msg_length,
2909      brw_surface_payload_size(p, num_channels, true, true),
2910      align1);
2911
2912   brw_set_dp_untyped_surface_read_message(
2913      p, insn, num_channels);
2914}
2915
2916static void
2917brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
2918                                         struct brw_inst *insn,
2919                                         unsigned num_channels)
2920{
2921   const struct brw_device_info *devinfo = p->devinfo;
2922   /* Set mask of 32-bit channels to drop. */
2923   unsigned msg_control = 0xf & (0xf << num_channels);
2924
2925   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2926      if (p->compressed)
2927         msg_control |= 1 << 4; /* SIMD16 mode */
2928      else
2929         msg_control |= 2 << 4; /* SIMD8 mode */
2930   } else {
2931      if (devinfo->gen >= 8 || devinfo->is_haswell)
2932         msg_control |= 0 << 4; /* SIMD4x2 mode */
2933      else
2934         msg_control |= 2 << 4; /* SIMD8 mode */
2935   }
2936
2937   brw_inst_set_dp_msg_type(devinfo, insn,
2938                            devinfo->gen >= 8 || devinfo->is_haswell ?
2939                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
2940                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
2941   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2942}
2943
2944void
2945brw_untyped_surface_write(struct brw_codegen *p,
2946                          struct brw_reg payload,
2947                          struct brw_reg surface,
2948                          unsigned msg_length,
2949                          unsigned num_channels)
2950{
2951   const struct brw_device_info *devinfo = p->devinfo;
2952   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
2953                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
2954                          GEN7_SFID_DATAPORT_DATA_CACHE);
2955   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
2956   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
2957   const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
2958                          WRITEMASK_X : WRITEMASK_XYZW;
2959   struct brw_inst *insn = brw_send_indirect_surface_message(
2960      p, sfid, brw_writemask(brw_null_reg(), mask),
2961      payload, surface, msg_length, 0, align1);
2962
2963   brw_set_dp_untyped_surface_write_message(
2964      p, insn, num_channels);
2965}
2966
2967static void
2968brw_set_dp_typed_atomic_message(struct brw_codegen *p,
2969                                struct brw_inst *insn,
2970                                unsigned atomic_op,
2971                                bool response_expected)
2972{
2973   const struct brw_device_info *devinfo = p->devinfo;
2974   unsigned msg_control =
2975      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2976      (response_expected ? 1 << 5 : 0); /* Return data expected */
2977
2978   if (devinfo->gen >= 8 || devinfo->is_haswell) {
2979      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
2980         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
2981            msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
2982
2983         brw_inst_set_dp_msg_type(devinfo, insn,
2984                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
2985      } else {
2986         brw_inst_set_dp_msg_type(devinfo, insn,
2987                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
2988      }
2989
2990   } else {
2991      brw_inst_set_dp_msg_type(devinfo, insn,
2992                               GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
2993
2994      if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
2995         msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
2996   }
2997
2998   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
2999}
3000
/**
 * Emit a typed surface atomic operation.
 *
 * \param dst               destination register for the message response
 * \param payload           message payload source
 * \param surface           surface index, immediate or indirect
 * \param atomic_op         BRW_AOP_* atomic operation type
 * \param msg_length        payload length in registers
 * \param response_expected whether return data is requested
 */
void
brw_typed_atomic(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg payload,
                 struct brw_reg surface,
                 unsigned atomic_op,
                 unsigned msg_length,
                 bool response_expected) {
   const struct brw_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN6_SFID_DATAPORT_RENDER_CACHE);
   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
   /* Note: response_expected (0 or 1) doubles as the channel count for the
    * response-length computation.
    */
   struct brw_inst *insn = brw_send_indirect_surface_message(
      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
      brw_surface_payload_size(p, response_expected,
                               devinfo->gen >= 8 || devinfo->is_haswell, false),
      true);

   brw_set_dp_typed_atomic_message(
      p, insn, atomic_op, response_expected);
}
3025
3026static void
3027brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
3028                                      struct brw_inst *insn,
3029                                      unsigned num_channels)
3030{
3031   const struct brw_device_info *devinfo = p->devinfo;
3032   /* Set mask of unused channels. */
3033   unsigned msg_control = 0xf & (0xf << num_channels);
3034
3035   if (devinfo->gen >= 8 || devinfo->is_haswell) {
3036      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3037         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3038            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3039         else
3040            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3041      }
3042
3043      brw_inst_set_dp_msg_type(devinfo, insn,
3044                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
3045   } else {
3046      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3047         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3048            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3049      }
3050
3051      brw_inst_set_dp_msg_type(devinfo, insn,
3052                               GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
3053   }
3054
3055   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3056}
3057
3058void
3059brw_typed_surface_read(struct brw_codegen *p,
3060                       struct brw_reg dst,
3061                       struct brw_reg payload,
3062                       struct brw_reg surface,
3063                       unsigned msg_length,
3064                       unsigned num_channels)
3065{
3066   const struct brw_device_info *devinfo = p->devinfo;
3067   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3068                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3069                          GEN6_SFID_DATAPORT_RENDER_CACHE);
3070   struct brw_inst *insn = brw_send_indirect_surface_message(
3071      p, sfid, dst, payload, surface, msg_length,
3072      brw_surface_payload_size(p, num_channels,
3073                               devinfo->gen >= 8 || devinfo->is_haswell, false),
3074      true);
3075
3076   brw_set_dp_typed_surface_read_message(
3077      p, insn, num_channels);
3078}
3079
3080static void
3081brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
3082                                       struct brw_inst *insn,
3083                                       unsigned num_channels)
3084{
3085   const struct brw_device_info *devinfo = p->devinfo;
3086   /* Set mask of unused channels. */
3087   unsigned msg_control = 0xf & (0xf << num_channels);
3088
3089   if (devinfo->gen >= 8 || devinfo->is_haswell) {
3090      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3091         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3092            msg_control |= 2 << 4; /* Use high 8 slots of the sample mask */
3093         else
3094            msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
3095      }
3096
3097      brw_inst_set_dp_msg_type(devinfo, insn,
3098                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
3099
3100   } else {
3101      if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
3102         if (brw_inst_qtr_control(devinfo, p->current) == GEN6_COMPRESSION_2Q)
3103            msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
3104      }
3105
3106      brw_inst_set_dp_msg_type(devinfo, insn,
3107                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
3108   }
3109
3110   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
3111}
3112
3113void
3114brw_typed_surface_write(struct brw_codegen *p,
3115                        struct brw_reg payload,
3116                        struct brw_reg surface,
3117                        unsigned msg_length,
3118                        unsigned num_channels)
3119{
3120   const struct brw_device_info *devinfo = p->devinfo;
3121   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3122                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3123                          GEN6_SFID_DATAPORT_RENDER_CACHE);
3124   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
3125   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3126   const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
3127                          WRITEMASK_X : WRITEMASK_XYZW);
3128   struct brw_inst *insn = brw_send_indirect_surface_message(
3129      p, sfid, brw_writemask(brw_null_reg(), mask),
3130      payload, surface, msg_length, 0, true);
3131
3132   brw_set_dp_typed_surface_write_message(
3133      p, insn, num_channels);
3134}
3135
/**
 * Configure \p insn as a memory fence message to \p sfid, which must be
 * either the render cache or the data cache data port.
 *
 * \param commit_enable request a commit write-back: response length becomes
 *                      1 and bit 5 of msg_control is set.
 */
static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable)
{
   const struct brw_device_info *devinfo = p->devinfo;

   brw_set_message_descriptor(p, insn, sfid,
                              1 /* message length */,
                              (commit_enable ? 1 : 0) /* response length */,
                              true /* header present */,
                              false);

   switch (sfid) {
   case GEN6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GEN7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
}
3164
/**
 * Emit a memory fence flushing the data cache and, on Ivybridge, the render
 * cache as well, followed by a MOV that stalls until both fences complete.
 *
 * \param dst scratch register used for dependency tracking and fence
 *            responses; on IVB offset(dst, 1) is used for the second fence.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   /* IVB (gen7, non-Haswell) is the only case needing a commit write-back. */
   const bool commit_enable = devinfo->gen == 7 && !devinfo->is_haswell;
   struct brw_inst *insn;

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, dst);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, offset(dst, 1));
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, dst, offset(dst, 1));
      brw_pop_insn_state(p);
   }
}
3205
/**
 * Emit a SEND to the pixel interpolator shared function.
 *
 * \param dest            destination register for the interpolation result
 * \param mrf             message payload source
 * \param noperspective   select non-perspective barycentric interpolation
 * \param mode            PI message type (GEN7_PIXEL_INTERPOLATOR_LOC_*)
 * \param data            message-specific data field
 * \param msg_length      payload length in registers
 * \param response_length writeback length in registers
 */
void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             unsigned mode,
                             unsigned data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct brw_device_info *devinfo = p->devinfo;
   struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, mrf);
   brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
                              msg_length, response_length,
                              false /* header is never present for PI */,
                              false);

   /* SIMD mode is derived from the instruction's own execution size. */
   brw_inst_set_pi_simd_mode(
         devinfo, insn, brw_inst_exec_size(devinfo, insn) == BRW_EXECUTE_16);
   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
   brw_inst_set_pi_message_type(devinfo, insn, mode);
   brw_inst_set_pi_message_data(devinfo, insn, data);
}
3233
/**
 * Emit code that writes the index of the first active execution channel
 * into \p dst, using a different strategy per generation and access mode.
 * Only available on gen7+.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst)
{
   const struct brw_device_info *devinfo = p->devinfo;
   brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);

   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the mask register.  The same register exists
          * on HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         inst = brw_FBL(p, vec1(dst),
                        retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * this register.  Make sure it's set to zero.
          */
         brw_inst_set_qtr_control(devinfo, inst, GEN6_COMPRESSION_1Q);
      } else {
         const struct brw_reg flag = retype(brw_flag_reg(1, 0),
                                            BRW_REGISTER_TYPE_UD);

         /* Clear f1.0 before accumulating the execution mask into it. */
         brw_MOV(p, flag, brw_imm_ud(0));

         /* Run a 16-wide instruction returning zero with execution masking
          * and a conditional modifier enabled in order to get the current
          * execution mask in f1.0.
          */
         inst = brw_MOV(p, brw_null_reg(), brw_imm_ud(0));
         brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_16);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* The first live channel is the first bit set in the flag. */
         brw_FBL(p, vec1(dst), flag);
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3305
/**
 * Copy the scalar value contained in channel \p idx of the (possibly
 * non-uniform) vector register \p src into \p dst.
 *
 * \p src must be a directly-addressed GRF.  \p idx may be either an
 * immediate or a register.  The access mode of the current default
 * instruction state selects between the align1 (SIMD) and align16
 * (SIMD4x2) implementations.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct brw_device_info *devinfo = p->devinfo;
   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
   brw_inst *inst;

   /* The indirect-addressing path below can only handle this case. */
   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      if (align1) {
         /* Compute the byte offset of the selected channel into the address
          * register, then fetch it with a scalar indirect MOV.
          */
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         const unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride.
          *
          * NOTE(review): vstride/hstride/width here appear to be the encoded
          * (logarithmic) brw_reg region fields, so the shift amount is
          * log2(type size) + log2(actual horizontal stride), and the assert
          * presumably restricts us to regions whose rows are contiguous in
          * memory — confirm against the brw_reg encoding.
          */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit)
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         brw_MOV(p, dst,
                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
                        src.type));
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle1(idx, 0), 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 0, 4, 1),
                        stride(src, 0, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }
}
3380
3381/**
3382 * This instruction is generated as a single-channel align1 instruction by
3383 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3384 *
3385 * We can't use the typed atomic op in the FS because that has the execution
3386 * mask ANDed with the pixel mask, but we just want to write the one dword for
3387 * all the pixels.
3388 *
3389 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3390 * one u32.  So we use the same untyped atomic write message as the pixel
3391 * shader.
3392 *
3393 * The untyped atomic operation requires a BUFFER surface type with RAW
3394 * format, and is only accessible through the legacy DATA_CACHE dataport
3395 * messages.
3396 */
3397void brw_shader_time_add(struct brw_codegen *p,
3398                         struct brw_reg payload,
3399                         uint32_t surf_index)
3400{
3401   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
3402                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3403                          GEN7_SFID_DATAPORT_DATA_CACHE);
3404   assert(p->devinfo->gen >= 7);
3405
3406   brw_push_insn_state(p);
3407   brw_set_default_access_mode(p, BRW_ALIGN_1);
3408   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3409   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3410   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3411
3412   /* We use brw_vec1_reg and unmasked because we want to increment the given
3413    * offset only once.
3414    */
3415   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3416                                      BRW_ARF_NULL, 0));
3417   brw_set_src0(p, send, brw_vec1_reg(payload.file,
3418                                      payload.nr, 0));
3419   brw_set_src1(p, send, brw_imm_ud(0));
3420   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
3421   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
3422   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
3423
3424   brw_pop_insn_state(p);
3425}
3426
3427
3428/**
3429 * Emit the SEND message for a barrier
3430 */
3431void
3432brw_barrier(struct brw_codegen *p, struct brw_reg src)
3433{
3434   const struct brw_device_info *devinfo = p->devinfo;
3435   struct brw_inst *inst;
3436
3437   assert(devinfo->gen >= 7);
3438
3439   inst = next_insn(p, BRW_OPCODE_SEND);
3440   brw_set_dest(p, inst, brw_null_reg());
3441   brw_set_src0(p, inst, src);
3442   brw_set_src1(p, inst, brw_null_reg());
3443
3444   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
3445                              1 /* msg_length */,
3446                              0 /* response_length */,
3447                              false /* header_present */,
3448                              false /* end_of_thread */);
3449
3450   brw_inst_set_gateway_notify(devinfo, inst, 1);
3451   brw_inst_set_gateway_subfuncid(devinfo, inst,
3452                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3453
3454   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3455}
3456
3457
3458/**
3459 * Emit the wait instruction for a barrier
3460 */
3461void
3462brw_WAIT(struct brw_codegen *p)
3463{
3464   const struct brw_device_info *devinfo = p->devinfo;
3465   struct brw_inst *insn;
3466
3467   struct brw_reg src = brw_notification_reg();
3468
3469   insn = next_insn(p, BRW_OPCODE_WAIT);
3470   brw_set_dest(p, insn, src);
3471   brw_set_src0(p, insn, src);
3472   brw_set_src1(p, insn, brw_null_reg());
3473
3474   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3475   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3476}
3477