brw_eu_emit.c revision 0457464c3343b3809048249fa5c1c0867ef499dc
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "util/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 brw_inst *insn,
45				 struct brw_reg reg)
46{
47   const struct brw_context *brw = p->brw;
48
49   if (reg.width == BRW_WIDTH_8 && p->compressed) {
50      brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_16);
51   } else {
52      /* Register width definitions are compatible with BRW_EXECUTE_* enums. */
53      brw_inst_set_exec_size(brw, insn, reg.width);
54   }
55}
56
57
58/**
59 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
60 * registers, implicitly moving the operand to a message register.
61 *
62 * On Sandybridge, this is no longer the case.  This function performs the
63 * explicit move; it should be called before emitting a SEND instruction.
64 */
65void
66gen6_resolve_implied_move(struct brw_compile *p,
67			  struct brw_reg *src,
68			  unsigned msg_reg_nr)
69{
70   struct brw_context *brw = p->brw;
71   if (brw->gen < 6)
72      return;
73
74   if (src->file == BRW_MESSAGE_REGISTER_FILE)
75      return;
76
77   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
78      brw_push_insn_state(p);
79      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
80      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
81      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
82	      retype(*src, BRW_REGISTER_TYPE_UD));
83      brw_pop_insn_state(p);
84   }
85   *src = brw_message_reg(msg_reg_nr);
86}
87
88static void
89gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
90{
91   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
92    * "The send with EOT should use register space R112-R127 for <src>. This is
93    *  to enable loading of a new thread into the same slot while the message
94    *  with EOT for current thread is pending dispatch."
95    *
96    * Since we're pretending to have 16 MRFs anyway, we may as well use the
97    * registers required for messages with EOT.
98    */
99   struct brw_context *brw = p->brw;
100   if (brw->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
101      reg->file = BRW_GENERAL_REGISTER_FILE;
102      reg->nr += GEN7_MRF_HACK_START;
103   }
104}
105
106/**
107 * Convert a brw_reg_type enumeration value into the hardware representation.
108 *
109 * The hardware encoding may depend on whether the value is an immediate.
110 */
111unsigned
112brw_reg_type_to_hw_type(const struct brw_context *brw,
113                        enum brw_reg_type type, unsigned file)
114{
115   if (file == BRW_IMMEDIATE_VALUE) {
116      const static int imm_hw_types[] = {
117         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
118         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
119         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
120         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
121         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
122         [BRW_REGISTER_TYPE_UB] = -1,
123         [BRW_REGISTER_TYPE_B]  = -1,
124         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
125         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
126         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
127         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
128         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
129         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
130         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
131      };
132      assert(type < ARRAY_SIZE(imm_hw_types));
133      assert(imm_hw_types[type] != -1);
134      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
135      return imm_hw_types[type];
136   } else {
137      /* Non-immediate registers */
138      const static int hw_types[] = {
139         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
140         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
141         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
142         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
143         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
144         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
145         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
146         [BRW_REGISTER_TYPE_UV] = -1,
147         [BRW_REGISTER_TYPE_VF] = -1,
148         [BRW_REGISTER_TYPE_V]  = -1,
149         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
150         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
151         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
152         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
153      };
154      assert(type < ARRAY_SIZE(hw_types));
155      assert(hw_types[type] != -1);
156      assert(brw->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
157      assert(brw->gen >= 8 || type < BRW_REGISTER_TYPE_HF);
158      return hw_types[type];
159   }
160}
161
162void
163brw_set_dest(struct brw_compile *p, brw_inst *inst, struct brw_reg dest)
164{
165   const struct brw_context *brw = p->brw;
166
167   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
168       dest.file != BRW_MESSAGE_REGISTER_FILE)
169      assert(dest.nr < 128);
170
171   gen7_convert_mrf_to_grf(p, &dest);
172
173   brw_inst_set_dst_reg_file(brw, inst, dest.file);
174   brw_inst_set_dst_reg_type(brw, inst, brw_reg_type_to_hw_type(brw, dest.type,
175                                                                dest.file));
176   brw_inst_set_dst_address_mode(brw, inst, dest.address_mode);
177
178   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
179      brw_inst_set_dst_da_reg_nr(brw, inst, dest.nr);
180
181      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
182         brw_inst_set_dst_da1_subreg_nr(brw, inst, dest.subnr);
183	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
184	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
185         brw_inst_set_dst_hstride(brw, inst, dest.hstride);
186      } else {
187         brw_inst_set_dst_da16_subreg_nr(brw, inst, dest.subnr / 16);
188         brw_inst_set_da16_writemask(brw, inst, dest.dw1.bits.writemask);
189         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
190             dest.file == BRW_MESSAGE_REGISTER_FILE) {
191            assert(dest.dw1.bits.writemask != 0);
192         }
193	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
194	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
195	  *    this to be programmed as "01".
196	  */
197         brw_inst_set_dst_hstride(brw, inst, 1);
198      }
199   } else {
200      brw_inst_set_dst_ia_subreg_nr(brw, inst, dest.subnr);
201
202      /* These are different sizes in align1 vs align16:
203       */
204      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
205         brw_inst_set_dst_ia1_addr_imm(brw, inst,
206                                       dest.dw1.bits.indirect_offset);
207	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
208	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
209         brw_inst_set_dst_hstride(brw, inst, dest.hstride);
210      } else {
211         brw_inst_set_dst_ia16_addr_imm(brw, inst,
212                                        dest.dw1.bits.indirect_offset);
213	 /* even ignored in da16, still need to set as '01' */
214         brw_inst_set_dst_hstride(brw, inst, 1);
215      }
216   }
217
218   /* NEW: Set the execution size based on dest.width and
219    * inst->compression_control:
220    */
221   guess_execution_size(p, inst, dest);
222}
223
/* Table defined elsewhere; indexed below by a hardware register type to
 * scale the destination stride (presumably bytes per element -- confirm
 * against its definition).
 */
extern int reg_type_size[];

/**
 * Assert that the region description of source operand \p reg is consistent
 * with the execution size already programmed into \p inst.
 *
 * The function only asserts; it never modifies \p inst.  Immediate operands
 * get a single check (for vector immediates of type V) and then return.
 */
static void
validate_reg(const struct brw_context *brw, brw_inst *inst, struct brw_reg reg)
{
   /* Lookup tables translating encoded field values into element counts. */
   int hstride_for_reg[] = {0, 1, 2, 4};
   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
   int width_for_reg[] = {1, 2, 4, 8, 16};
   int execsize_for_reg[] = {1, 2, 4, 8, 16};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[brw_inst_dst_hstride(brw, inst)] *
                reg_type_size[brw_inst_dst_reg_type(brw, inst)] == 2);
      }

      return;
   }

   /* NOTE(review): the second comparison tests reg.file against
    * BRW_ARF_NULL, which elsewhere in this file is compared against a
    * register number (src->nr in gen6_resolve_implied_move); presumably
    * reg.nr was intended.  Left unchanged because correcting it would
    * change which architecture registers skip the checks below -- confirm
    * before fixing.
    */
   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.file == BRW_ARF_NULL)
      return;

   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   /* An encoded vstride of 0xf is given the sentinel -1, which rule 4 below
    * accepts without comparing it to width * hstride.
    */
   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
   width = width_for_reg[reg.width];

   assert(brw_inst_exec_size(brw, inst) >= 0 &&
          brw_inst_exec_size(brw, inst) < Elements(execsize_for_reg));
   execsize = execsize_for_reg[brw_inst_exec_size(brw, inst)];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}
301
/**
 * Report whether a 32-bit immediate fits the compacted encoding.
 *
 * The low 12 bits are kept as-is and a single bit is replicated through the
 * top 20 bits, so the value qualifies only when its upper 20 bits are all
 * zeros or all ones.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   const unsigned upper_bits = imm & ~0xfffu;

   return upper_bits == 0 || upper_bits == 0xfffff000;
}
311
312void
313brw_set_src0(struct brw_compile *p, brw_inst *inst, struct brw_reg reg)
314{
315   struct brw_context *brw = p->brw;
316
317   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
318      assert(reg.nr < 128);
319
320   gen7_convert_mrf_to_grf(p, &reg);
321
322   if (brw->gen >= 6 && (brw_inst_opcode(brw, inst) == BRW_OPCODE_SEND ||
323                         brw_inst_opcode(brw, inst) == BRW_OPCODE_SENDC)) {
324      /* Any source modifiers or regions will be ignored, since this just
325       * identifies the MRF/GRF to start reading the message contents from.
326       * Check for some likely failures.
327       */
328      assert(!reg.negate);
329      assert(!reg.abs);
330      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
331   }
332
333   validate_reg(brw, inst, reg);
334
335   brw_inst_set_src0_reg_file(brw, inst, reg.file);
336   brw_inst_set_src0_reg_type(brw, inst,
337                              brw_reg_type_to_hw_type(brw, reg.type, reg.file));
338   brw_inst_set_src0_abs(brw, inst, reg.abs);
339   brw_inst_set_src0_negate(brw, inst, reg.negate);
340   brw_inst_set_src0_address_mode(brw, inst, reg.address_mode);
341
342   if (reg.file == BRW_IMMEDIATE_VALUE) {
343      brw_inst_set_imm_ud(brw, inst, reg.dw1.ud);
344
345      /* The Bspec's section titled "Non-present Operands" claims that if src0
346       * is an immediate that src1's type must be the same as that of src0.
347       *
348       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
349       * that do not follow this rule. E.g., from the IVB/HSW table:
350       *
351       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
352       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
353       *
354       * And from the SNB table:
355       *
356       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
357       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
358       *
359       * Neither of these cause warnings from the simulator when used,
360       * compacted or otherwise. In fact, all compaction mappings that have an
361       * immediate in src0 use a:ud for src1.
362       *
363       * The GM45 instruction compaction tables do not contain mapped meanings
364       * so it's not clear whether it has the restriction. We'll assume it was
365       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
366       */
367      brw_inst_set_src1_reg_file(brw, inst, BRW_ARCHITECTURE_REGISTER_FILE);
368      if (brw->gen < 6) {
369         brw_inst_set_src1_reg_type(brw, inst,
370                                    brw_inst_src0_reg_type(brw, inst));
371      } else {
372         brw_inst_set_src1_reg_type(brw, inst, BRW_HW_REG_TYPE_UD);
373      }
374
375      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
376       * for immediate values. Presumably the hardware engineers realized
377       * that the only useful floating-point value that could be represented
378       * in this format is 0.0, which can also be represented as a VF-typed
379       * immediate, so they gave us the previously mentioned mapping on IVB+.
380       *
381       * Strangely, we do have a mapping for imm:f in src1, so we don't need
382       * to do this there.
383       *
384       * If we see a 0.0:F, change the type to VF so that it can be compacted.
385       */
386      if (brw_inst_imm_ud(brw, inst) == 0x0 &&
387          brw_inst_src0_reg_type(brw, inst) == BRW_HW_REG_TYPE_F) {
388         brw_inst_set_src0_reg_type(brw, inst, BRW_HW_REG_IMM_TYPE_VF);
389      }
390
391      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
392       * set the types to :UD so the instruction can be compacted.
393       */
394      if (is_compactable_immediate(brw_inst_imm_ud(brw, inst)) &&
395          brw_inst_cond_modifier(brw, inst) == BRW_CONDITIONAL_NONE &&
396          brw_inst_src0_reg_type(brw, inst) == BRW_HW_REG_TYPE_D &&
397          brw_inst_dst_reg_type(brw, inst) == BRW_HW_REG_TYPE_D) {
398         brw_inst_set_src0_reg_type(brw, inst, BRW_HW_REG_TYPE_UD);
399         brw_inst_set_dst_reg_type(brw, inst, BRW_HW_REG_TYPE_UD);
400      }
401   } else {
402      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
403         brw_inst_set_src0_da_reg_nr(brw, inst, reg.nr);
404         if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
405             brw_inst_set_src0_da1_subreg_nr(brw, inst, reg.subnr);
406	 } else {
407            brw_inst_set_src0_da16_subreg_nr(brw, inst, reg.subnr / 16);
408	 }
409      } else {
410         brw_inst_set_src0_ia_subreg_nr(brw, inst, reg.subnr);
411
412         if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
413            brw_inst_set_src0_ia1_addr_imm(brw, inst, reg.dw1.bits.indirect_offset);
414	 } else {
415            brw_inst_set_src0_ia_subreg_nr(brw, inst, reg.dw1.bits.indirect_offset);
416	 }
417      }
418
419      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
420	 if (reg.width == BRW_WIDTH_1 &&
421             brw_inst_exec_size(brw, inst) == BRW_EXECUTE_1) {
422            brw_inst_set_src0_hstride(brw, inst, BRW_HORIZONTAL_STRIDE_0);
423            brw_inst_set_src0_width(brw, inst, BRW_WIDTH_1);
424            brw_inst_set_src0_vstride(brw, inst, BRW_VERTICAL_STRIDE_0);
425	 } else {
426            brw_inst_set_src0_hstride(brw, inst, reg.hstride);
427            brw_inst_set_src0_width(brw, inst, reg.width);
428            brw_inst_set_src0_vstride(brw, inst, reg.vstride);
429	 }
430      } else {
431         brw_inst_set_src0_da16_swiz_x(brw, inst,
432            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
433         brw_inst_set_src0_da16_swiz_y(brw, inst,
434            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
435         brw_inst_set_src0_da16_swiz_z(brw, inst,
436            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
437         brw_inst_set_src0_da16_swiz_w(brw, inst,
438            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));
439
440	 /* This is an oddity of the fact we're using the same
441	  * descriptions for registers in align_16 as align_1:
442	  */
443	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
444            brw_inst_set_src0_vstride(brw, inst, BRW_VERTICAL_STRIDE_4);
445	 else
446            brw_inst_set_src0_vstride(brw, inst, reg.vstride);
447      }
448   }
449}
450
451
452void
453brw_set_src1(struct brw_compile *p, brw_inst *inst, struct brw_reg reg)
454{
455   const struct brw_context *brw = p->brw;
456   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
457
458   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
459      assert(reg.nr < 128);
460
461   gen7_convert_mrf_to_grf(p, &reg);
462
463   validate_reg(brw, inst, reg);
464
465   brw_inst_set_src1_reg_file(brw, inst, reg.file);
466   brw_inst_set_src1_reg_type(brw, inst,
467                              brw_reg_type_to_hw_type(brw, reg.type, reg.file));
468   brw_inst_set_src1_abs(brw, inst, reg.abs);
469   brw_inst_set_src1_negate(brw, inst, reg.negate);
470
471   /* Only src1 can be immediate in two-argument instructions.
472    */
473   assert(brw_inst_src0_reg_file(brw, inst) != BRW_IMMEDIATE_VALUE);
474
475   if (reg.file == BRW_IMMEDIATE_VALUE) {
476      brw_inst_set_imm_ud(brw, inst, reg.dw1.ud);
477   } else {
478      /* This is a hardware restriction, which may or may not be lifted
479       * in the future:
480       */
481      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
482      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
483
484      brw_inst_set_src1_da_reg_nr(brw, inst, reg.nr);
485      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
486         brw_inst_set_src1_da1_subreg_nr(brw, inst, reg.subnr);
487      } else {
488         brw_inst_set_src1_da16_subreg_nr(brw, inst, reg.subnr / 16);
489      }
490
491      if (brw_inst_access_mode(brw, inst) == BRW_ALIGN_1) {
492	 if (reg.width == BRW_WIDTH_1 &&
493             brw_inst_exec_size(brw, inst) == BRW_EXECUTE_1) {
494            brw_inst_set_src1_hstride(brw, inst, BRW_HORIZONTAL_STRIDE_0);
495            brw_inst_set_src1_width(brw, inst, BRW_WIDTH_1);
496            brw_inst_set_src1_vstride(brw, inst, BRW_VERTICAL_STRIDE_0);
497	 } else {
498            brw_inst_set_src1_hstride(brw, inst, reg.hstride);
499            brw_inst_set_src1_width(brw, inst, reg.width);
500            brw_inst_set_src1_vstride(brw, inst, reg.vstride);
501	 }
502      } else {
503         brw_inst_set_src1_da16_swiz_x(brw, inst,
504            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
505         brw_inst_set_src1_da16_swiz_y(brw, inst,
506            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
507         brw_inst_set_src1_da16_swiz_z(brw, inst,
508            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
509         brw_inst_set_src1_da16_swiz_w(brw, inst,
510            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));
511
512	 /* This is an oddity of the fact we're using the same
513	  * descriptions for registers in align_16 as align_1:
514	  */
515	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
516            brw_inst_set_src1_vstride(brw, inst, BRW_VERTICAL_STRIDE_4);
517	 else
518            brw_inst_set_src1_vstride(brw, inst, reg.vstride);
519      }
520   }
521}
522
523/**
524 * Set the Message Descriptor and Extended Message Descriptor fields
525 * for SEND messages.
526 *
527 * \note This zeroes out the Function Control bits, so it must be called
528 *       \b before filling out any message-specific data.  Callers can
529 *       choose not to fill in irrelevant bits; they will be zero.
530 */
531static void
532brw_set_message_descriptor(struct brw_compile *p,
533			   brw_inst *inst,
534			   enum brw_message_target sfid,
535			   unsigned msg_length,
536			   unsigned response_length,
537			   bool header_present,
538			   bool end_of_thread)
539{
540   struct brw_context *brw = p->brw;
541
542   brw_set_src1(p, inst, brw_imm_d(0));
543   brw_inst_set_sfid(brw, inst, sfid);
544   brw_inst_set_mlen(brw, inst, msg_length);
545   brw_inst_set_rlen(brw, inst, response_length);
546   brw_inst_set_eot(brw, inst, end_of_thread);
547
548   if (brw->gen >= 5) {
549      brw_inst_set_header_present(brw, inst, header_present);
550   }
551}
552
553static void brw_set_math_message( struct brw_compile *p,
554				  brw_inst *inst,
555				  unsigned function,
556				  unsigned integer_type,
557				  bool low_precision,
558				  unsigned dataType )
559{
560   struct brw_context *brw = p->brw;
561   unsigned msg_length;
562   unsigned response_length;
563
564   /* Infer message length from the function */
565   switch (function) {
566   case BRW_MATH_FUNCTION_POW:
567   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
568   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
569   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
570      msg_length = 2;
571      break;
572   default:
573      msg_length = 1;
574      break;
575   }
576
577   /* Infer response length from the function */
578   switch (function) {
579   case BRW_MATH_FUNCTION_SINCOS:
580   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
581      response_length = 2;
582      break;
583   default:
584      response_length = 1;
585      break;
586   }
587
588
589   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
590			      msg_length, response_length, false, false);
591   brw_inst_set_math_msg_function(brw, inst, function);
592   brw_inst_set_math_msg_signed_int(brw, inst, integer_type);
593   brw_inst_set_math_msg_precision(brw, inst, low_precision);
594   brw_inst_set_math_msg_saturate(brw, inst, brw_inst_saturate(brw, inst));
595   brw_inst_set_math_msg_data_type(brw, inst, dataType);
596   brw_inst_set_saturate(brw, inst, 0);
597}
598
599
600static void brw_set_ff_sync_message(struct brw_compile *p,
601				    brw_inst *insn,
602				    bool allocate,
603				    unsigned response_length,
604				    bool end_of_thread)
605{
606   const struct brw_context *brw = p->brw;
607
608   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
609			      1, response_length, true, end_of_thread);
610   brw_inst_set_urb_opcode(brw, insn, 1); /* FF_SYNC */
611   brw_inst_set_urb_allocate(brw, insn, allocate);
612   /* The following fields are not used by FF_SYNC: */
613   brw_inst_set_urb_global_offset(brw, insn, 0);
614   brw_inst_set_urb_swizzle_control(brw, insn, 0);
615   brw_inst_set_urb_used(brw, insn, 0);
616   brw_inst_set_urb_complete(brw, insn, 0);
617}
618
619static void brw_set_urb_message( struct brw_compile *p,
620				 brw_inst *insn,
621                                 enum brw_urb_write_flags flags,
622				 unsigned msg_length,
623				 unsigned response_length,
624				 unsigned offset,
625				 unsigned swizzle_control )
626{
627   struct brw_context *brw = p->brw;
628
629   assert(brw->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
630   assert(brw->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
631   assert(brw->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
632
633   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
634			      msg_length, response_length, true,
635                              flags & BRW_URB_WRITE_EOT);
636
637   if (flags & BRW_URB_WRITE_OWORD) {
638      assert(msg_length == 2); /* header + one OWORD of data */
639      brw_inst_set_urb_opcode(brw, insn, BRW_URB_OPCODE_WRITE_OWORD);
640   } else {
641      brw_inst_set_urb_opcode(brw, insn, BRW_URB_OPCODE_WRITE_HWORD);
642   }
643
644   brw_inst_set_urb_global_offset(brw, insn, offset);
645   brw_inst_set_urb_swizzle_control(brw, insn, swizzle_control);
646
647   if (brw->gen < 8) {
648      brw_inst_set_urb_complete(brw, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
649   }
650
651   if (brw->gen < 7) {
652      brw_inst_set_urb_allocate(brw, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
653      brw_inst_set_urb_used(brw, insn, !(flags & BRW_URB_WRITE_UNUSED));
654   } else {
655      brw_inst_set_urb_per_slot_offset(brw, insn,
656         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
657   }
658}
659
660void
661brw_set_dp_write_message(struct brw_compile *p,
662			 brw_inst *insn,
663			 unsigned binding_table_index,
664			 unsigned msg_control,
665			 unsigned msg_type,
666			 unsigned msg_length,
667			 bool header_present,
668			 unsigned last_render_target,
669			 unsigned response_length,
670			 unsigned end_of_thread,
671			 unsigned send_commit_msg)
672{
673   struct brw_context *brw = p->brw;
674   unsigned sfid;
675
676   if (brw->gen >= 7) {
677      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
678      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
679	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
680      else
681	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
682   } else if (brw->gen == 6) {
683      /* Use the render cache for all write messages. */
684      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
685   } else {
686      sfid = BRW_SFID_DATAPORT_WRITE;
687   }
688
689   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
690			      header_present, end_of_thread);
691
692   brw_inst_set_binding_table_index(brw, insn, binding_table_index);
693   brw_inst_set_dp_write_msg_type(brw, insn, msg_type);
694   brw_inst_set_dp_write_msg_control(brw, insn, msg_control);
695   brw_inst_set_rt_last(brw, insn, last_render_target);
696   if (brw->gen < 7) {
697      brw_inst_set_dp_write_commit(brw, insn, send_commit_msg);
698   }
699}
700
701void
702brw_set_dp_read_message(struct brw_compile *p,
703			brw_inst *insn,
704			unsigned binding_table_index,
705			unsigned msg_control,
706			unsigned msg_type,
707			unsigned target_cache,
708			unsigned msg_length,
709                        bool header_present,
710			unsigned response_length)
711{
712   struct brw_context *brw = p->brw;
713   unsigned sfid;
714
715   if (brw->gen >= 7) {
716      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
717   } else if (brw->gen == 6) {
718      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
719	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
720      else
721	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
722   } else {
723      sfid = BRW_SFID_DATAPORT_READ;
724   }
725
726   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
727			      header_present, false);
728
729   brw_inst_set_binding_table_index(brw, insn, binding_table_index);
730   brw_inst_set_dp_read_msg_type(brw, insn, msg_type);
731   brw_inst_set_dp_read_msg_control(brw, insn, msg_control);
732   if (brw->gen < 6)
733      brw_inst_set_dp_read_target_cache(brw, insn, target_cache);
734}
735
736void
737brw_set_sampler_message(struct brw_compile *p,
738                        brw_inst *inst,
739                        unsigned binding_table_index,
740                        unsigned sampler,
741                        unsigned msg_type,
742                        unsigned response_length,
743                        unsigned msg_length,
744                        unsigned header_present,
745                        unsigned simd_mode,
746                        unsigned return_format)
747{
748   struct brw_context *brw = p->brw;
749
750   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
751			      response_length, header_present, false);
752
753   brw_inst_set_binding_table_index(brw, inst, binding_table_index);
754   brw_inst_set_sampler(brw, inst, sampler);
755   brw_inst_set_sampler_msg_type(brw, inst, msg_type);
756   if (brw->gen >= 5) {
757      brw_inst_set_sampler_simd_mode(brw, inst, simd_mode);
758   } else if (brw->gen == 4 && !brw->is_g4x) {
759      brw_inst_set_sampler_return_format(brw, inst, return_format);
760   }
761}
762
763static void
764gen7_set_dp_scratch_message(struct brw_compile *p,
765                            brw_inst *inst,
766                            bool write,
767                            bool dword,
768                            bool invalidate_after_read,
769                            unsigned num_regs,
770                            unsigned addr_offset,
771                            unsigned mlen,
772                            unsigned rlen,
773                            bool header_present)
774{
775   const struct brw_context *brw = p->brw;
776   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
777          (brw->gen >= 8 && num_regs == 8));
778   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
779                              mlen, rlen, header_present, false);
780   brw_inst_set_dp_category(brw, inst, 1); /* Scratch Block Read/Write msgs */
781   brw_inst_set_scratch_read_write(brw, inst, write);
782   brw_inst_set_scratch_type(brw, inst, dword);
783   brw_inst_set_scratch_invalidate_after_read(brw, inst, invalidate_after_read);
784   brw_inst_set_scratch_block_size(brw, inst, ffs(num_regs) - 1);
785   brw_inst_set_scratch_addr_offset(brw, inst, addr_offset);
786}
787
788#define next_insn brw_next_insn
789brw_inst *
790brw_next_insn(struct brw_compile *p, unsigned opcode)
791{
792   const struct brw_context *brw = p->brw;
793   brw_inst *insn;
794
795   if (p->nr_insn + 1 > p->store_size) {
796      p->store_size <<= 1;
797      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
798   }
799
800   p->next_insn_offset += 16;
801   insn = &p->store[p->nr_insn++];
802   memcpy(insn, p->current, sizeof(*insn));
803
804   brw_inst_set_opcode(brw, insn, opcode);
805   return insn;
806}
807
808static brw_inst *
809brw_alu1(struct brw_compile *p, unsigned opcode,
810         struct brw_reg dest, struct brw_reg src)
811{
812   brw_inst *insn = next_insn(p, opcode);
813   brw_set_dest(p, insn, dest);
814   brw_set_src0(p, insn, src);
815   return insn;
816}
817
818static brw_inst *
819brw_alu2(struct brw_compile *p, unsigned opcode,
820         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
821{
822   brw_inst *insn = next_insn(p, opcode);
823   brw_set_dest(p, insn, dest);
824   brw_set_src0(p, insn, src0);
825   brw_set_src1(p, insn, src1);
826   return insn;
827}
828
829static int
830get_3src_subreg_nr(struct brw_reg reg)
831{
832   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
833      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
834      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
835   } else {
836      return reg.subnr / 4;
837   }
838}
839
840static brw_inst *
841brw_alu3(struct brw_compile *p, unsigned opcode, struct brw_reg dest,
842         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
843{
844   struct brw_context *brw = p->brw;
845   brw_inst *inst = next_insn(p, opcode);
846
847   gen7_convert_mrf_to_grf(p, &dest);
848
849   assert(brw_inst_access_mode(brw, inst) == BRW_ALIGN_16);
850
851   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
852	  dest.file == BRW_MESSAGE_REGISTER_FILE);
853   assert(dest.nr < 128);
854   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
855   assert(dest.type == BRW_REGISTER_TYPE_F ||
856          dest.type == BRW_REGISTER_TYPE_D ||
857          dest.type == BRW_REGISTER_TYPE_UD);
858   if (brw->gen == 6) {
859      brw_inst_set_3src_dst_reg_file(brw, inst,
860                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
861   }
862   brw_inst_set_3src_dst_reg_nr(brw, inst, dest.nr);
863   brw_inst_set_3src_dst_subreg_nr(brw, inst, dest.subnr / 16);
864   brw_inst_set_3src_dst_writemask(brw, inst, dest.dw1.bits.writemask);
865   guess_execution_size(p, inst, dest);
866
867   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
868   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
869   assert(src0.nr < 128);
870   brw_inst_set_3src_src0_swizzle(brw, inst, src0.dw1.bits.swizzle);
871   brw_inst_set_3src_src0_subreg_nr(brw, inst, get_3src_subreg_nr(src0));
872   brw_inst_set_3src_src0_reg_nr(brw, inst, src0.nr);
873   brw_inst_set_3src_src0_abs(brw, inst, src0.abs);
874   brw_inst_set_3src_src0_negate(brw, inst, src0.negate);
875   brw_inst_set_3src_src0_rep_ctrl(brw, inst,
876                                   src0.vstride == BRW_VERTICAL_STRIDE_0);
877
878   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
879   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
880   assert(src1.nr < 128);
881   brw_inst_set_3src_src1_swizzle(brw, inst, src1.dw1.bits.swizzle);
882   brw_inst_set_3src_src1_subreg_nr(brw, inst, get_3src_subreg_nr(src1));
883   brw_inst_set_3src_src1_reg_nr(brw, inst, src1.nr);
884   brw_inst_set_3src_src1_abs(brw, inst, src1.abs);
885   brw_inst_set_3src_src1_negate(brw, inst, src1.negate);
886   brw_inst_set_3src_src1_rep_ctrl(brw, inst,
887                                   src1.vstride == BRW_VERTICAL_STRIDE_0);
888
889   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
890   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
891   assert(src2.nr < 128);
892   brw_inst_set_3src_src2_swizzle(brw, inst, src2.dw1.bits.swizzle);
893   brw_inst_set_3src_src2_subreg_nr(brw, inst, get_3src_subreg_nr(src2));
894   brw_inst_set_3src_src2_reg_nr(brw, inst, src2.nr);
895   brw_inst_set_3src_src2_abs(brw, inst, src2.abs);
896   brw_inst_set_3src_src2_negate(brw, inst, src2.negate);
897   brw_inst_set_3src_src2_rep_ctrl(brw, inst,
898                                   src2.vstride == BRW_VERTICAL_STRIDE_0);
899
900   if (brw->gen >= 7) {
901      /* Set both the source and destination types based on dest.type,
902       * ignoring the source register types.  The MAD and LRP emitters ensure
903       * that all four types are float.  The BFE and BFI2 emitters, however,
904       * may send us mixed D and UD types and want us to ignore that and use
905       * the destination type.
906       */
907      switch (dest.type) {
908      case BRW_REGISTER_TYPE_F:
909         brw_inst_set_3src_src_type(brw, inst, BRW_3SRC_TYPE_F);
910         brw_inst_set_3src_dst_type(brw, inst, BRW_3SRC_TYPE_F);
911         break;
912      case BRW_REGISTER_TYPE_D:
913         brw_inst_set_3src_src_type(brw, inst, BRW_3SRC_TYPE_D);
914         brw_inst_set_3src_dst_type(brw, inst, BRW_3SRC_TYPE_D);
915         break;
916      case BRW_REGISTER_TYPE_UD:
917         brw_inst_set_3src_src_type(brw, inst, BRW_3SRC_TYPE_UD);
918         brw_inst_set_3src_dst_type(brw, inst, BRW_3SRC_TYPE_UD);
919         break;
920      }
921   }
922
923   return inst;
924}
925
926
927/***********************************************************************
928 * Convenience routines.
929 */
/* Defines brw_<OP>(p, dest, src0): a one-source emitter that forwards to
 * brw_alu1() with BRW_OPCODE_<OP>.
 */
#define ALU1(OP)                                                \
brw_inst *brw_##OP(struct brw_compile *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0)                         \
{                                                               \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);             \
}
937
/* Defines brw_<OP>(p, dest, src0, src1): a two-source emitter that forwards
 * to brw_alu2() with BRW_OPCODE_<OP>.
 */
#define ALU2(OP)                                                \
brw_inst *brw_##OP(struct brw_compile *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1)                         \
{                                                               \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
}
946
/* Defines brw_<OP>(p, dest, src0, src1, src2): a three-source emitter that
 * forwards to brw_alu3() with BRW_OPCODE_<OP>, without checking operand types.
 */
#define ALU3(OP)                                                \
brw_inst *brw_##OP(struct brw_compile *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
956
/* Defines brw_<OP>(p, dest, src0, src1, src2): a three-source emitter like
 * ALU3, but asserting that the destination and all three sources are of
 * float type before forwarding to brw_alu3().
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_compile *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F);                    \
   assert(src0.type == BRW_REGISTER_TYPE_F);                    \
   assert(src1.type == BRW_REGISTER_TYPE_F);                    \
   assert(src2.type == BRW_REGISTER_TYPE_F);                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
970
/* Defines brw_<OP>(p, dest, src) for rounding operations other than RNDD.
 *
 * Before Sandybridge these need two instructions: the round itself stores a
 * (possibly wrongly) rounded value in dest and sets a per-channel "increment
 * bit" in the flag register, and a predicated ADD of 1.0 then fixes up dest.
 *
 * Sandybridge and later appear to round correctly without the ADD, so only
 * the round instruction is emitted there.
 */
#define ROUND(OP)                                                          \
void brw_##OP(struct brw_compile *p,                                       \
              struct brw_reg dest,                                         \
              struct brw_reg src)                                          \
{                                                                          \
   struct brw_context *brw = p->brw;                                       \
   brw_inst *round_inst = next_insn(p, BRW_OPCODE_##OP);                   \
                                                                           \
   brw_set_dest(p, round_inst, dest);                                      \
   brw_set_src0(p, round_inst, src);                                       \
                                                                           \
   if (brw->gen >= 6)                                                      \
      return;                                                              \
                                                                           \
   /* turn on round-increments (before emitting the fix-up ADD) */         \
   brw_inst_set_cond_modifier(brw, round_inst, BRW_CONDITIONAL_R);         \
                                                                           \
   brw_inst *fixup = brw_ADD(p, dest, dest, brw_imm_f(1.0f));              \
   brw_inst_set_pred_control(brw, fixup, BRW_PREDICATE_NORMAL);            \
}
996
997
998ALU1(MOV)
999ALU2(SEL)
1000ALU1(NOT)
1001ALU2(AND)
1002ALU2(OR)
1003ALU2(XOR)
1004ALU2(SHR)
1005ALU2(SHL)
1006ALU2(ASR)
1007ALU1(F32TO16)
1008ALU1(F16TO32)
1009ALU1(FRC)
1010ALU1(RNDD)
1011ALU2(MAC)
1012ALU2(MACH)
1013ALU1(LZD)
1014ALU2(DP4)
1015ALU2(DPH)
1016ALU2(DP3)
1017ALU2(DP2)
1018ALU2(LINE)
1019ALU2(PLN)
1020ALU3F(MAD)
1021ALU3F(LRP)
1022ALU1(BFREV)
1023ALU3(BFE)
1024ALU2(BFI1)
1025ALU3(BFI2)
1026ALU1(FBH)
1027ALU1(FBL)
1028ALU1(CBIT)
1029ALU2(ADDC)
1030ALU2(SUBB)
1031
1032ROUND(RNDZ)
1033ROUND(RNDE)
1034
1035
1036brw_inst *
1037brw_ADD(struct brw_compile *p, struct brw_reg dest,
1038        struct brw_reg src0, struct brw_reg src1)
1039{
1040   /* 6.2.2: add */
1041   if (src0.type == BRW_REGISTER_TYPE_F ||
1042       (src0.file == BRW_IMMEDIATE_VALUE &&
1043	src0.type == BRW_REGISTER_TYPE_VF)) {
1044      assert(src1.type != BRW_REGISTER_TYPE_UD);
1045      assert(src1.type != BRW_REGISTER_TYPE_D);
1046   }
1047
1048   if (src1.type == BRW_REGISTER_TYPE_F ||
1049       (src1.file == BRW_IMMEDIATE_VALUE &&
1050	src1.type == BRW_REGISTER_TYPE_VF)) {
1051      assert(src0.type != BRW_REGISTER_TYPE_UD);
1052      assert(src0.type != BRW_REGISTER_TYPE_D);
1053   }
1054
1055   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1056}
1057
1058brw_inst *
1059brw_AVG(struct brw_compile *p, struct brw_reg dest,
1060        struct brw_reg src0, struct brw_reg src1)
1061{
1062   assert(dest.type == src0.type);
1063   assert(src0.type == src1.type);
1064   switch (src0.type) {
1065   case BRW_REGISTER_TYPE_B:
1066   case BRW_REGISTER_TYPE_UB:
1067   case BRW_REGISTER_TYPE_W:
1068   case BRW_REGISTER_TYPE_UW:
1069   case BRW_REGISTER_TYPE_D:
1070   case BRW_REGISTER_TYPE_UD:
1071      break;
1072   default:
1073      unreachable("Bad type for brw_AVG");
1074   }
1075
1076   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1077}
1078
1079brw_inst *
1080brw_MUL(struct brw_compile *p, struct brw_reg dest,
1081        struct brw_reg src0, struct brw_reg src1)
1082{
1083   /* 6.32.38: mul */
1084   if (src0.type == BRW_REGISTER_TYPE_D ||
1085       src0.type == BRW_REGISTER_TYPE_UD ||
1086       src1.type == BRW_REGISTER_TYPE_D ||
1087       src1.type == BRW_REGISTER_TYPE_UD) {
1088      assert(dest.type != BRW_REGISTER_TYPE_F);
1089   }
1090
1091   if (src0.type == BRW_REGISTER_TYPE_F ||
1092       (src0.file == BRW_IMMEDIATE_VALUE &&
1093	src0.type == BRW_REGISTER_TYPE_VF)) {
1094      assert(src1.type != BRW_REGISTER_TYPE_UD);
1095      assert(src1.type != BRW_REGISTER_TYPE_D);
1096   }
1097
1098   if (src1.type == BRW_REGISTER_TYPE_F ||
1099       (src1.file == BRW_IMMEDIATE_VALUE &&
1100	src1.type == BRW_REGISTER_TYPE_VF)) {
1101      assert(src0.type != BRW_REGISTER_TYPE_UD);
1102      assert(src0.type != BRW_REGISTER_TYPE_D);
1103   }
1104
1105   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1106	  src0.nr != BRW_ARF_ACCUMULATOR);
1107   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1108	  src1.nr != BRW_ARF_ACCUMULATOR);
1109
1110   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1111}
1112
1113
1114void brw_NOP(struct brw_compile *p)
1115{
1116   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1117   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1118   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1119   brw_set_src1(p, insn, brw_imm_ud(0x0));
1120}
1121
1122
1123
1124
1125
1126/***********************************************************************
1127 * Comparisons, if/else/endif
1128 */
1129
1130brw_inst *
1131brw_JMPI(struct brw_compile *p, struct brw_reg index,
1132         unsigned predicate_control)
1133{
1134   const struct brw_context *brw = p->brw;
1135   struct brw_reg ip = brw_ip_reg();
1136   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1137
1138   brw_inst_set_exec_size(brw, inst, BRW_EXECUTE_2);
1139   brw_inst_set_qtr_control(brw, inst, BRW_COMPRESSION_NONE);
1140   brw_inst_set_mask_control(brw, inst, BRW_MASK_DISABLE);
1141   brw_inst_set_pred_control(brw, inst, predicate_control);
1142
1143   return inst;
1144}
1145
1146static void
1147push_if_stack(struct brw_compile *p, brw_inst *inst)
1148{
1149   p->if_stack[p->if_stack_depth] = inst - p->store;
1150
1151   p->if_stack_depth++;
1152   if (p->if_stack_array_size <= p->if_stack_depth) {
1153      p->if_stack_array_size *= 2;
1154      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1155			     p->if_stack_array_size);
1156   }
1157}
1158
1159static brw_inst *
1160pop_if_stack(struct brw_compile *p)
1161{
1162   p->if_stack_depth--;
1163   return &p->store[p->if_stack[p->if_stack_depth]];
1164}
1165
1166static void
1167push_loop_stack(struct brw_compile *p, brw_inst *inst)
1168{
1169   if (p->loop_stack_array_size < p->loop_stack_depth) {
1170      p->loop_stack_array_size *= 2;
1171      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1172			       p->loop_stack_array_size);
1173      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1174				     p->loop_stack_array_size);
1175   }
1176
1177   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1178   p->loop_stack_depth++;
1179   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1180}
1181
1182static brw_inst *
1183get_inner_do_insn(struct brw_compile *p)
1184{
1185   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1186}
1187
1188/* EU takes the value from the flag register and pushes it onto some
1189 * sort of a stack (presumably merging with any flag value already on
1190 * the stack).  Within an if block, the flags at the top of the stack
1191 * control execution on each channel of the unit, eg. on each of the
1192 * 16 pixel values in our wm programs.
1193 *
1194 * When the matching 'else' instruction is reached (presumably by
1195 * countdown of the instruction count patched in by our ELSE/ENDIF
1196 * functions), the relevent flags are inverted.
1197 *
1198 * When the matching 'endif' instruction is reached, the flags are
1199 * popped off.  If the stack is now empty, normal execution resumes.
1200 */
1201brw_inst *
1202brw_IF(struct brw_compile *p, unsigned execute_size)
1203{
1204   struct brw_context *brw = p->brw;
1205   brw_inst *insn;
1206
1207   insn = next_insn(p, BRW_OPCODE_IF);
1208
1209   /* Override the defaults for this instruction:
1210    */
1211   if (brw->gen < 6) {
1212      brw_set_dest(p, insn, brw_ip_reg());
1213      brw_set_src0(p, insn, brw_ip_reg());
1214      brw_set_src1(p, insn, brw_imm_d(0x0));
1215   } else if (brw->gen == 6) {
1216      brw_set_dest(p, insn, brw_imm_w(0));
1217      brw_inst_set_gen6_jump_count(brw, insn, 0);
1218      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1219      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1220   } else {
1221      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1222      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1223      brw_set_src1(p, insn, brw_imm_ud(0));
1224      brw_inst_set_jip(brw, insn, 0);
1225      brw_inst_set_uip(brw, insn, 0);
1226   }
1227
1228   brw_inst_set_exec_size(brw, insn, execute_size);
1229   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1230   brw_inst_set_pred_control(brw, insn, BRW_PREDICATE_NORMAL);
1231   brw_inst_set_mask_control(brw, insn, BRW_MASK_ENABLE);
1232   if (!p->single_program_flow && brw->gen < 6)
1233      brw_inst_set_thread_control(brw, insn, BRW_THREAD_SWITCH);
1234
1235   push_if_stack(p, insn);
1236   p->if_depth_in_loop[p->loop_stack_depth]++;
1237   return insn;
1238}
1239
1240/* This function is only used for gen6-style IF instructions with an
1241 * embedded comparison (conditional modifier).  It is not used on gen7.
1242 */
1243brw_inst *
1244gen6_IF(struct brw_compile *p, enum brw_conditional_mod conditional,
1245	struct brw_reg src0, struct brw_reg src1)
1246{
1247   const struct brw_context *brw = p->brw;
1248   brw_inst *insn;
1249
1250   insn = next_insn(p, BRW_OPCODE_IF);
1251
1252   brw_set_dest(p, insn, brw_imm_w(0));
1253   brw_inst_set_exec_size(brw, insn, p->compressed ? BRW_EXECUTE_16
1254                                                   : BRW_EXECUTE_8);
1255   brw_inst_set_gen6_jump_count(brw, insn, 0);
1256   brw_set_src0(p, insn, src0);
1257   brw_set_src1(p, insn, src1);
1258
1259   assert(brw_inst_qtr_control(brw, insn) == BRW_COMPRESSION_NONE);
1260   assert(brw_inst_pred_control(brw, insn) == BRW_PREDICATE_NONE);
1261   brw_inst_set_cond_modifier(brw, insn, conditional);
1262
1263   push_if_stack(p, insn);
1264   return insn;
1265}
1266
1267/**
1268 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1269 */
1270static void
1271convert_IF_ELSE_to_ADD(struct brw_compile *p,
1272                       brw_inst *if_inst, brw_inst *else_inst)
1273{
1274   const struct brw_context *brw = p->brw;
1275
1276   /* The next instruction (where the ENDIF would be, if it existed) */
1277   brw_inst *next_inst = &p->store[p->nr_insn];
1278
1279   assert(p->single_program_flow);
1280   assert(if_inst != NULL && brw_inst_opcode(brw, if_inst) == BRW_OPCODE_IF);
1281   assert(else_inst == NULL || brw_inst_opcode(brw, else_inst) == BRW_OPCODE_ELSE);
1282   assert(brw_inst_exec_size(brw, if_inst) == BRW_EXECUTE_1);
1283
1284   /* Convert IF to an ADD instruction that moves the instruction pointer
1285    * to the first instruction of the ELSE block.  If there is no ELSE
1286    * block, point to where ENDIF would be.  Reverse the predicate.
1287    *
1288    * There's no need to execute an ENDIF since we don't need to do any
1289    * stack operations, and if we're currently executing, we just want to
1290    * continue normally.
1291    */
1292   brw_inst_set_opcode(brw, if_inst, BRW_OPCODE_ADD);
1293   brw_inst_set_pred_inv(brw, if_inst, true);
1294
1295   if (else_inst != NULL) {
1296      /* Convert ELSE to an ADD instruction that points where the ENDIF
1297       * would be.
1298       */
1299      brw_inst_set_opcode(brw, else_inst, BRW_OPCODE_ADD);
1300
1301      brw_inst_set_imm_ud(brw, if_inst, (else_inst - if_inst + 1) * 16);
1302      brw_inst_set_imm_ud(brw, else_inst, (next_inst - else_inst) * 16);
1303   } else {
1304      brw_inst_set_imm_ud(brw, if_inst, (next_inst - if_inst) * 16);
1305   }
1306}
1307
1308/**
1309 * Patch IF and ELSE instructions with appropriate jump targets.
1310 */
1311static void
1312patch_IF_ELSE(struct brw_compile *p,
1313              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1314{
1315   struct brw_context *brw = p->brw;
1316
1317   /* We shouldn't be patching IF and ELSE instructions in single program flow
1318    * mode when gen < 6, because in single program flow mode on those
1319    * platforms, we convert flow control instructions to conditional ADDs that
1320    * operate on IP (see brw_ENDIF).
1321    *
1322    * However, on Gen6, writing to IP doesn't work in single program flow mode
1323    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1324    * not be updated by non-flow control instructions.").  And on later
1325    * platforms, there is no significant benefit to converting control flow
1326    * instructions to conditional ADDs.  So we do patch IF and ELSE
1327    * instructions in single program flow mode on those platforms.
1328    */
1329   if (brw->gen < 6)
1330      assert(!p->single_program_flow);
1331
1332   assert(if_inst != NULL && brw_inst_opcode(brw, if_inst) == BRW_OPCODE_IF);
1333   assert(endif_inst != NULL);
1334   assert(else_inst == NULL || brw_inst_opcode(brw, else_inst) == BRW_OPCODE_ELSE);
1335
1336   unsigned br = 1;
1337   /* Jump count is for 64bit data chunk each, so one 128bit instruction
1338    * requires 2 chunks.
1339    */
1340   if (brw->gen >= 5)
1341      br = 2;
1342
1343   assert(brw_inst_opcode(brw, endif_inst) == BRW_OPCODE_ENDIF);
1344   brw_inst_set_exec_size(brw, endif_inst, brw_inst_exec_size(brw, if_inst));
1345
1346   if (else_inst == NULL) {
1347      /* Patch IF -> ENDIF */
1348      if (brw->gen < 6) {
1349	 /* Turn it into an IFF, which means no mask stack operations for
1350	  * all-false and jumping past the ENDIF.
1351	  */
1352         brw_inst_set_opcode(brw, if_inst, BRW_OPCODE_IFF);
1353         brw_inst_set_gen4_jump_count(brw, if_inst,
1354                                      br * (endif_inst - if_inst + 1));
1355         brw_inst_set_gen4_pop_count(brw, if_inst, 0);
1356      } else if (brw->gen == 6) {
1357	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1358         brw_inst_set_gen6_jump_count(brw, if_inst, br*(endif_inst - if_inst));
1359      } else {
1360         brw_inst_set_uip(brw, if_inst, br * (endif_inst - if_inst));
1361         brw_inst_set_jip(brw, if_inst, br * (endif_inst - if_inst));
1362      }
1363   } else {
1364      brw_inst_set_exec_size(brw, else_inst, brw_inst_exec_size(brw, if_inst));
1365
1366      /* Patch IF -> ELSE */
1367      if (brw->gen < 6) {
1368         brw_inst_set_gen4_jump_count(brw, if_inst,
1369                                      br * (else_inst - if_inst));
1370         brw_inst_set_gen4_pop_count(brw, if_inst, 0);
1371      } else if (brw->gen == 6) {
1372         brw_inst_set_gen6_jump_count(brw, if_inst,
1373                                      br * (else_inst - if_inst + 1));
1374      }
1375
1376      /* Patch ELSE -> ENDIF */
1377      if (brw->gen < 6) {
1378	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1379	  * matching ENDIF.
1380	  */
1381         brw_inst_set_gen4_jump_count(brw, else_inst,
1382                                      br * (endif_inst - else_inst + 1));
1383         brw_inst_set_gen4_pop_count(brw, else_inst, 1);
1384      } else if (brw->gen == 6) {
1385	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1386         brw_inst_set_gen6_jump_count(brw, else_inst,
1387                                      br * (endif_inst - else_inst));
1388      } else {
1389	 /* The IF instruction's JIP should point just past the ELSE */
1390         brw_inst_set_jip(brw, if_inst, br * (else_inst - if_inst + 1));
1391	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1392         brw_inst_set_uip(brw, if_inst, br * (endif_inst - if_inst));
1393         brw_inst_set_jip(brw, else_inst, br * (endif_inst - else_inst));
1394      }
1395   }
1396}
1397
1398void
1399brw_ELSE(struct brw_compile *p)
1400{
1401   struct brw_context *brw = p->brw;
1402   brw_inst *insn;
1403
1404   insn = next_insn(p, BRW_OPCODE_ELSE);
1405
1406   if (brw->gen < 6) {
1407      brw_set_dest(p, insn, brw_ip_reg());
1408      brw_set_src0(p, insn, brw_ip_reg());
1409      brw_set_src1(p, insn, brw_imm_d(0x0));
1410   } else if (brw->gen == 6) {
1411      brw_set_dest(p, insn, brw_imm_w(0));
1412      brw_inst_set_gen6_jump_count(brw, insn, 0);
1413      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1414      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1415   } else {
1416      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1417      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1418      brw_set_src1(p, insn, brw_imm_ud(0));
1419      brw_inst_set_jip(brw, insn, 0);
1420      brw_inst_set_uip(brw, insn, 0);
1421   }
1422
1423   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1424   brw_inst_set_mask_control(brw, insn, BRW_MASK_ENABLE);
1425   if (!p->single_program_flow && brw->gen < 6)
1426      brw_inst_set_thread_control(brw, insn, BRW_THREAD_SWITCH);
1427
1428   push_if_stack(p, insn);
1429}
1430
1431void
1432brw_ENDIF(struct brw_compile *p)
1433{
1434   struct brw_context *brw = p->brw;
1435   brw_inst *insn = NULL;
1436   brw_inst *else_inst = NULL;
1437   brw_inst *if_inst = NULL;
1438   brw_inst *tmp;
1439   bool emit_endif = true;
1440
1441   /* In single program flow mode, we can express IF and ELSE instructions
1442    * equivalently as ADD instructions that operate on IP.  On platforms prior
1443    * to Gen6, flow control instructions cause an implied thread switch, so
1444    * this is a significant savings.
1445    *
1446    * However, on Gen6, writing to IP doesn't work in single program flow mode
1447    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1448    * not be updated by non-flow control instructions.").  And on later
1449    * platforms, there is no significant benefit to converting control flow
1450    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
1451    * Gen5.
1452    */
1453   if (brw->gen < 6 && p->single_program_flow)
1454      emit_endif = false;
1455
1456   /*
1457    * A single next_insn() may change the base adress of instruction store
1458    * memory(p->store), so call it first before referencing the instruction
1459    * store pointer from an index
1460    */
1461   if (emit_endif)
1462      insn = next_insn(p, BRW_OPCODE_ENDIF);
1463
1464   /* Pop the IF and (optional) ELSE instructions from the stack */
1465   p->if_depth_in_loop[p->loop_stack_depth]--;
1466   tmp = pop_if_stack(p);
1467   if (brw_inst_opcode(brw, tmp) == BRW_OPCODE_ELSE) {
1468      else_inst = tmp;
1469      tmp = pop_if_stack(p);
1470   }
1471   if_inst = tmp;
1472
1473   if (!emit_endif) {
1474      /* ENDIF is useless; don't bother emitting it. */
1475      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1476      return;
1477   }
1478
1479   if (brw->gen < 6) {
1480      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1481      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1482      brw_set_src1(p, insn, brw_imm_d(0x0));
1483   } else if (brw->gen == 6) {
1484      brw_set_dest(p, insn, brw_imm_w(0));
1485      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1486      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1487   } else {
1488      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1489      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1490      brw_set_src1(p, insn, brw_imm_ud(0));
1491   }
1492
1493   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1494   brw_inst_set_mask_control(brw, insn, BRW_MASK_ENABLE);
1495   if (brw->gen < 6)
1496      brw_inst_set_thread_control(brw, insn, BRW_THREAD_SWITCH);
1497
1498   /* Also pop item off the stack in the endif instruction: */
1499   if (brw->gen < 6) {
1500      brw_inst_set_gen4_jump_count(brw, insn, 0);
1501      brw_inst_set_gen4_pop_count(brw, insn, 1);
1502   } else if (brw->gen == 6) {
1503      brw_inst_set_gen6_jump_count(brw, insn, 2);
1504   } else {
1505      brw_inst_set_jip(brw, insn, 2);
1506   }
1507   patch_IF_ELSE(p, if_inst, else_inst, insn);
1508}
1509
1510brw_inst *
1511brw_BREAK(struct brw_compile *p)
1512{
1513   struct brw_context *brw = p->brw;
1514   brw_inst *insn;
1515
1516   insn = next_insn(p, BRW_OPCODE_BREAK);
1517   if (brw->gen >= 6) {
1518      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1519      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1520      brw_set_src1(p, insn, brw_imm_d(0x0));
1521   } else {
1522      brw_set_dest(p, insn, brw_ip_reg());
1523      brw_set_src0(p, insn, brw_ip_reg());
1524      brw_set_src1(p, insn, brw_imm_d(0x0));
1525      brw_inst_set_gen4_pop_count(brw, insn,
1526                                  p->if_depth_in_loop[p->loop_stack_depth]);
1527   }
1528   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1529   brw_inst_set_exec_size(brw, insn, p->compressed ? BRW_EXECUTE_16
1530                                                   : BRW_EXECUTE_8);
1531
1532   return insn;
1533}
1534
1535brw_inst *
1536brw_CONT(struct brw_compile *p)
1537{
1538   const struct brw_context *brw = p->brw;
1539   brw_inst *insn;
1540
1541   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1542   brw_set_dest(p, insn, brw_ip_reg());
1543   brw_set_src0(p, insn, brw_ip_reg());
1544   brw_set_src1(p, insn, brw_imm_d(0x0));
1545
1546   if (brw->gen < 6) {
1547      brw_inst_set_gen4_pop_count(brw, insn,
1548                                  p->if_depth_in_loop[p->loop_stack_depth]);
1549   }
1550   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1551   brw_inst_set_exec_size(brw, insn, p->compressed ? BRW_EXECUTE_16
1552                                                   : BRW_EXECUTE_8);
1553   return insn;
1554}
1555
1556brw_inst *
1557gen6_HALT(struct brw_compile *p)
1558{
1559   const struct brw_context *brw = p->brw;
1560   brw_inst *insn;
1561
1562   insn = next_insn(p, BRW_OPCODE_HALT);
1563   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1564   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1565   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1566
1567   if (p->compressed) {
1568      brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_16);
1569   } else {
1570      brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1571      brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_8);
1572   }
1573   return insn;
1574}
1575
1576/* DO/WHILE loop:
1577 *
1578 * The DO/WHILE is just an unterminated loop -- break or continue are
1579 * used for control within the loop.  We have a few ways they can be
1580 * done.
1581 *
1582 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1583 * jip and no DO instruction.
1584 *
1585 * For non-uniform control flow pre-gen6, there's a DO instruction to
1586 * push the mask, and a WHILE to jump back, and BREAK to get out and
1587 * pop the mask.
1588 *
1589 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1590 * just points back to the first instruction of the loop.
1591 */
1592brw_inst *
1593brw_DO(struct brw_compile *p, unsigned execute_size)
1594{
1595   struct brw_context *brw = p->brw;
1596
1597   if (brw->gen >= 6 || p->single_program_flow) {
1598      push_loop_stack(p, &p->store[p->nr_insn]);
1599      return &p->store[p->nr_insn];
1600   } else {
1601      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1602
1603      push_loop_stack(p, insn);
1604
1605      /* Override the defaults for this instruction:
1606       */
1607      brw_set_dest(p, insn, brw_null_reg());
1608      brw_set_src0(p, insn, brw_null_reg());
1609      brw_set_src1(p, insn, brw_null_reg());
1610
1611      brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1612      brw_inst_set_exec_size(brw, insn, execute_size);
1613      brw_inst_set_pred_control(brw, insn, BRW_PREDICATE_NONE);
1614
1615      return insn;
1616   }
1617}
1618
1619/**
1620 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1621 * instruction here.
1622 *
1623 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1624 * nesting, since it can always just point to the end of the block/current loop.
1625 */
1626static void
1627brw_patch_break_cont(struct brw_compile *p, brw_inst *while_inst)
1628{
1629   struct brw_context *brw = p->brw;
1630   brw_inst *do_inst = get_inner_do_insn(p);
1631   brw_inst *inst;
1632   int br = (brw->gen == 5) ? 2 : 1;
1633
1634   for (inst = while_inst - 1; inst != do_inst; inst--) {
1635      /* If the jump count is != 0, that means that this instruction has already
1636       * been patched because it's part of a loop inside of the one we're
1637       * patching.
1638       */
1639      if (brw_inst_opcode(brw, inst) == BRW_OPCODE_BREAK &&
1640          brw_inst_gen4_jump_count(brw, inst) == 0) {
1641         brw_inst_set_gen4_jump_count(brw, inst, br*((while_inst - inst) + 1));
1642      } else if (brw_inst_opcode(brw, inst) == BRW_OPCODE_CONTINUE &&
1643                 brw_inst_gen4_jump_count(brw, inst) == 0) {
1644         brw_inst_set_gen4_jump_count(brw, inst, br * (while_inst - inst));
1645      }
1646   }
1647}
1648
1649brw_inst *
1650brw_WHILE(struct brw_compile *p)
1651{
1652   struct brw_context *brw = p->brw;
1653   brw_inst *insn, *do_insn;
1654   unsigned br = 1;
1655
1656   if (brw->gen >= 5)
1657      br = 2;
1658
1659   if (brw->gen >= 7) {
1660      insn = next_insn(p, BRW_OPCODE_WHILE);
1661      do_insn = get_inner_do_insn(p);
1662
1663      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1664      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1665      brw_set_src1(p, insn, brw_imm_ud(0));
1666      brw_inst_set_jip(brw, insn, br * (do_insn - insn));
1667
1668      brw_inst_set_exec_size(brw, insn, p->compressed ? BRW_EXECUTE_16
1669                                                      : BRW_EXECUTE_8);
1670   } else if (brw->gen == 6) {
1671      insn = next_insn(p, BRW_OPCODE_WHILE);
1672      do_insn = get_inner_do_insn(p);
1673
1674      brw_set_dest(p, insn, brw_imm_w(0));
1675      brw_inst_set_gen6_jump_count(brw, insn, br * (do_insn - insn));
1676      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1677      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1678
1679      brw_inst_set_exec_size(brw, insn, p->compressed ? BRW_EXECUTE_16
1680                                                      : BRW_EXECUTE_8);
1681   } else {
1682      if (p->single_program_flow) {
1683	 insn = next_insn(p, BRW_OPCODE_ADD);
1684         do_insn = get_inner_do_insn(p);
1685
1686	 brw_set_dest(p, insn, brw_ip_reg());
1687	 brw_set_src0(p, insn, brw_ip_reg());
1688	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1689         brw_inst_set_exec_size(brw, insn, BRW_EXECUTE_1);
1690      } else {
1691	 insn = next_insn(p, BRW_OPCODE_WHILE);
1692         do_insn = get_inner_do_insn(p);
1693
1694         assert(brw_inst_opcode(brw, do_insn) == BRW_OPCODE_DO);
1695
1696	 brw_set_dest(p, insn, brw_ip_reg());
1697	 brw_set_src0(p, insn, brw_ip_reg());
1698	 brw_set_src1(p, insn, brw_imm_d(0));
1699
1700         brw_inst_set_exec_size(brw, insn, brw_inst_exec_size(brw, do_insn));
1701         brw_inst_set_gen4_jump_count(brw, insn, br * (do_insn - insn + 1));
1702         brw_inst_set_gen4_pop_count(brw, insn, 0);
1703
1704	 brw_patch_break_cont(p, insn);
1705      }
1706   }
1707   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1708
1709   p->loop_stack_depth--;
1710
1711   return insn;
1712}
1713
1714/* FORWARD JUMPS:
1715 */
1716void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1717{
1718   struct brw_context *brw = p->brw;
1719   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1720   unsigned jmpi = 1;
1721
1722   if (brw->gen >= 5)
1723      jmpi = 2;
1724
1725   assert(brw_inst_opcode(brw, jmp_insn) == BRW_OPCODE_JMPI);
1726   assert(brw_inst_src1_reg_file(brw, jmp_insn) == BRW_IMMEDIATE_VALUE);
1727
1728   brw_inst_set_gen4_jump_count(brw, jmp_insn,
1729                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
1730}
1731
1732/* To integrate with the above, it makes sense that the comparison
1733 * instruction should populate the flag register.  It might be simpler
1734 * just to use the flag reg for most WM tasks?
1735 */
1736void brw_CMP(struct brw_compile *p,
1737	     struct brw_reg dest,
1738	     unsigned conditional,
1739	     struct brw_reg src0,
1740	     struct brw_reg src1)
1741{
1742   struct brw_context *brw = p->brw;
1743   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1744
1745   if (brw->gen >= 8) {
1746      /* The CMP instruction appears to behave erratically for floating point
1747       * sources unless the destination type is also float.  Overriding it to
1748       * match src0 makes it work in all cases.
1749       */
1750      dest.type = src0.type;
1751   }
1752
1753   brw_inst_set_cond_modifier(brw, insn, conditional);
1754   brw_set_dest(p, insn, dest);
1755   brw_set_src0(p, insn, src0);
1756   brw_set_src1(p, insn, src1);
1757
1758   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1759    * page says:
1760    *    "Any CMP instruction with a null destination must use a {switch}."
1761    *
1762    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1763    * mentioned on their work-arounds pages.
1764    */
1765   if (brw->gen == 7) {
1766      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1767          dest.nr == BRW_ARF_NULL) {
1768         brw_inst_set_thread_control(brw, insn, BRW_THREAD_SWITCH);
1769      }
1770   }
1771}
1772
1773/***********************************************************************
1774 * Helpers for the various SEND message types:
1775 */
1776
1777/** Extended math function, float[8].
1778 */
1779void gen4_math(struct brw_compile *p,
1780	       struct brw_reg dest,
1781	       unsigned function,
1782	       unsigned msg_reg_nr,
1783	       struct brw_reg src,
1784	       unsigned data_type,
1785	       unsigned precision )
1786{
1787   struct brw_context *brw = p->brw;
1788   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1789
1790   assert(brw->gen < 6);
1791
1792   /* Example code doesn't set predicate_control for send
1793    * instructions.
1794    */
1795   brw_inst_set_pred_control(brw, insn, 0);
1796   brw_inst_set_base_mrf(brw, insn, msg_reg_nr);
1797
1798   brw_set_dest(p, insn, dest);
1799   brw_set_src0(p, insn, src);
1800   brw_set_math_message(p,
1801                        insn,
1802                        function,
1803                        src.type == BRW_REGISTER_TYPE_D,
1804                        precision,
1805                        data_type);
1806}
1807
1808void gen6_math(struct brw_compile *p,
1809	       struct brw_reg dest,
1810	       unsigned function,
1811	       struct brw_reg src0,
1812	       struct brw_reg src1)
1813{
1814   struct brw_context *brw = p->brw;
1815   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1816
1817   assert(brw->gen >= 6);
1818
1819   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1820          (brw->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1821   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1822
1823   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1824   if (brw->gen == 6) {
1825      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1826      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1827   }
1828
1829   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1830       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1831       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1832      assert(src0.type != BRW_REGISTER_TYPE_F);
1833      assert(src1.type != BRW_REGISTER_TYPE_F);
1834      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1835   } else {
1836      assert(src0.type == BRW_REGISTER_TYPE_F);
1837      assert(src1.type == BRW_REGISTER_TYPE_F);
1838      if (function == BRW_MATH_FUNCTION_POW) {
1839         assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1840      } else {
1841         assert(src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1842                src1.nr == BRW_ARF_NULL);
1843      }
1844   }
1845
1846   /* Source modifiers are ignored for extended math instructions on Gen6. */
1847   if (brw->gen == 6) {
1848      assert(!src0.negate);
1849      assert(!src0.abs);
1850      assert(!src1.negate);
1851      assert(!src1.abs);
1852   }
1853
1854   brw_inst_set_math_function(brw, insn, function);
1855
1856   brw_set_dest(p, insn, dest);
1857   brw_set_src0(p, insn, src0);
1858   brw_set_src1(p, insn, src1);
1859}
1860
1861
1862/**
1863 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1864 * using a constant offset per channel.
1865 *
1866 * The offset must be aligned to oword size (16 bytes).  Used for
1867 * register spilling.
1868 */
1869void brw_oword_block_write_scratch(struct brw_compile *p,
1870				   struct brw_reg mrf,
1871				   int num_regs,
1872				   unsigned offset)
1873{
1874   struct brw_context *brw = p->brw;
1875   uint32_t msg_control, msg_type;
1876   int mlen;
1877
1878   if (brw->gen >= 6)
1879      offset /= 16;
1880
1881   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1882
1883   if (num_regs == 1) {
1884      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1885      mlen = 2;
1886   } else {
1887      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1888      mlen = 3;
1889   }
1890
1891   /* Set up the message header.  This is g0, with g0.2 filled with
1892    * the offset.  We don't want to leave our offset around in g0 or
1893    * it'll screw up texture samples, so set it up inside the message
1894    * reg.
1895    */
1896   {
1897      brw_push_insn_state(p);
1898      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1899      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1900
1901      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1902
1903      /* set message header global offset field (reg 0, element 2) */
1904      brw_MOV(p,
1905	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1906				  mrf.nr,
1907				  2), BRW_REGISTER_TYPE_UD),
1908	      brw_imm_ud(offset));
1909
1910      brw_pop_insn_state(p);
1911   }
1912
1913   {
1914      struct brw_reg dest;
1915      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1916      int send_commit_msg;
1917      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1918					 BRW_REGISTER_TYPE_UW);
1919
1920      if (brw_inst_qtr_control(brw, insn) != BRW_COMPRESSION_NONE) {
1921         brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
1922	 src_header = vec16(src_header);
1923      }
1924      assert(brw_inst_pred_control(brw, insn) == BRW_PREDICATE_NONE);
1925      if (brw->gen < 6)
1926         brw_inst_set_base_mrf(brw, insn, mrf.nr);
1927
1928      /* Until gen6, writes followed by reads from the same location
1929       * are not guaranteed to be ordered unless write_commit is set.
1930       * If set, then a no-op write is issued to the destination
1931       * register to set a dependency, and a read from the destination
1932       * can be used to ensure the ordering.
1933       *
1934       * For gen6, only writes between different threads need ordering
1935       * protection.  Our use of DP writes is all about register
1936       * spilling within a thread.
1937       */
1938      if (brw->gen >= 6) {
1939	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1940	 send_commit_msg = 0;
1941      } else {
1942	 dest = src_header;
1943	 send_commit_msg = 1;
1944      }
1945
1946      brw_set_dest(p, insn, dest);
1947      if (brw->gen >= 6) {
1948	 brw_set_src0(p, insn, mrf);
1949      } else {
1950	 brw_set_src0(p, insn, brw_null_reg());
1951      }
1952
1953      if (brw->gen >= 6)
1954	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1955      else
1956	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1957
1958      brw_set_dp_write_message(p,
1959			       insn,
1960			       255, /* binding table index (255=stateless) */
1961			       msg_control,
1962			       msg_type,
1963			       mlen,
1964			       true, /* header_present */
1965			       0, /* not a render target */
1966			       send_commit_msg, /* response_length */
1967			       0, /* eot */
1968			       send_commit_msg);
1969   }
1970}
1971
1972
1973/**
1974 * Read a block of owords (half a GRF each) from the scratch buffer
1975 * using a constant index per channel.
1976 *
1977 * Offset must be aligned to oword size (16 bytes).  Used for register
1978 * spilling.
1979 */
1980void
1981brw_oword_block_read_scratch(struct brw_compile *p,
1982			     struct brw_reg dest,
1983			     struct brw_reg mrf,
1984			     int num_regs,
1985			     unsigned offset)
1986{
1987   struct brw_context *brw = p->brw;
1988   uint32_t msg_control;
1989   int rlen;
1990
1991   if (brw->gen >= 6)
1992      offset /= 16;
1993
1994   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1995   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1996
1997   if (num_regs == 1) {
1998      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1999      rlen = 1;
2000   } else {
2001      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2002      rlen = 2;
2003   }
2004
2005   {
2006      brw_push_insn_state(p);
2007      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2008      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2009
2010      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2011
2012      /* set message header global offset field (reg 0, element 2) */
2013      brw_MOV(p,
2014	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2015				  mrf.nr,
2016				  2), BRW_REGISTER_TYPE_UD),
2017	      brw_imm_ud(offset));
2018
2019      brw_pop_insn_state(p);
2020   }
2021
2022   {
2023      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2024
2025      assert(brw_inst_pred_control(brw, insn) == 0);
2026      brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
2027
2028      brw_set_dest(p, insn, dest);	/* UW? */
2029      if (brw->gen >= 6) {
2030	 brw_set_src0(p, insn, mrf);
2031      } else {
2032	 brw_set_src0(p, insn, brw_null_reg());
2033         brw_inst_set_base_mrf(brw, insn, mrf.nr);
2034      }
2035
2036      brw_set_dp_read_message(p,
2037			      insn,
2038			      255, /* binding table index (255=stateless) */
2039			      msg_control,
2040			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2041			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2042			      1, /* msg_length */
2043                              true, /* header_present */
2044			      rlen);
2045   }
2046}
2047
2048void
2049gen7_block_read_scratch(struct brw_compile *p,
2050                        struct brw_reg dest,
2051                        int num_regs,
2052                        unsigned offset)
2053{
2054   const struct brw_context *brw = p->brw;
2055   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2056   assert(brw_inst_pred_control(brw, insn) == BRW_PREDICATE_NONE);
2057
2058   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
2059   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2060
2061   /* The HW requires that the header is present; this is to get the g0.5
2062    * scratch offset.
2063    */
2064   brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2065
2066   /* According to the docs, offset is "A 12-bit HWord offset into the memory
2067    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2068    * is 32 bytes, which happens to be the size of a register.
2069    */
2070   offset /= REG_SIZE;
2071   assert(offset < (1 << 12));
2072
2073   gen7_set_dp_scratch_message(p, insn,
2074                               false, /* scratch read */
2075                               false, /* OWords */
2076                               false, /* invalidate after read */
2077                               num_regs,
2078                               offset,
2079                               1,        /* mlen: just g0 */
2080                               num_regs, /* rlen */
2081                               true);    /* header present */
2082}
2083
2084/**
2085 * Read a float[4] vector from the data port Data Cache (const buffer).
2086 * Location (in buffer) should be a multiple of 16.
2087 * Used for fetching shader constants.
2088 */
2089void brw_oword_block_read(struct brw_compile *p,
2090			  struct brw_reg dest,
2091			  struct brw_reg mrf,
2092			  uint32_t offset,
2093			  uint32_t bind_table_index)
2094{
2095   struct brw_context *brw = p->brw;
2096
2097   /* On newer hardware, offset is in units of owords. */
2098   if (brw->gen >= 6)
2099      offset /= 16;
2100
2101   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2102
2103   brw_push_insn_state(p);
2104   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2105   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2106   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2107
2108   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2109
2110   /* set message header global offset field (reg 0, element 2) */
2111   brw_MOV(p,
2112	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2113			       mrf.nr,
2114			       2), BRW_REGISTER_TYPE_UD),
2115	   brw_imm_ud(offset));
2116
2117   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2118
2119   /* cast dest to a uword[8] vector */
2120   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2121
2122   brw_set_dest(p, insn, dest);
2123   if (brw->gen >= 6) {
2124      brw_set_src0(p, insn, mrf);
2125   } else {
2126      brw_set_src0(p, insn, brw_null_reg());
2127      brw_inst_set_base_mrf(brw, insn, mrf.nr);
2128   }
2129
2130   brw_set_dp_read_message(p,
2131			   insn,
2132			   bind_table_index,
2133			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2134			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2135			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2136			   1, /* msg_length */
2137                           true, /* header_present */
2138			   1); /* response_length (1 reg, 2 owords!) */
2139
2140   brw_pop_insn_state(p);
2141}
2142
2143
2144void brw_fb_WRITE(struct brw_compile *p,
2145		  int dispatch_width,
2146                  unsigned msg_reg_nr,
2147                  struct brw_reg src0,
2148                  unsigned msg_control,
2149                  unsigned binding_table_index,
2150                  unsigned msg_length,
2151                  unsigned response_length,
2152                  bool eot,
2153                  bool header_present)
2154{
2155   struct brw_context *brw = p->brw;
2156   brw_inst *insn;
2157   unsigned msg_type;
2158   struct brw_reg dest;
2159
2160   if (dispatch_width == 16)
2161      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2162   else
2163      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2164
2165   if (brw->gen >= 6) {
2166      insn = next_insn(p, BRW_OPCODE_SENDC);
2167   } else {
2168      insn = next_insn(p, BRW_OPCODE_SEND);
2169   }
2170   brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
2171
2172   if (brw->gen >= 6) {
2173      /* headerless version, just submit color payload */
2174      src0 = brw_message_reg(msg_reg_nr);
2175
2176      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2177   } else {
2178      brw_inst_set_base_mrf(brw, insn, msg_reg_nr);
2179
2180      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2181   }
2182
2183   brw_set_dest(p, insn, dest);
2184   brw_set_src0(p, insn, src0);
2185   brw_set_dp_write_message(p,
2186			    insn,
2187			    binding_table_index,
2188			    msg_control,
2189			    msg_type,
2190			    msg_length,
2191			    header_present,
2192			    eot, /* last render target write */
2193			    response_length,
2194			    eot,
2195			    0 /* send_commit_msg */);
2196}
2197
2198
2199/**
2200 * Texture sample instruction.
2201 * Note: the msg_type plus msg_length values determine exactly what kind
2202 * of sampling operation is performed.  See volume 4, page 161 of docs.
2203 */
2204void brw_SAMPLE(struct brw_compile *p,
2205		struct brw_reg dest,
2206		unsigned msg_reg_nr,
2207		struct brw_reg src0,
2208		unsigned binding_table_index,
2209		unsigned sampler,
2210		unsigned msg_type,
2211		unsigned response_length,
2212		unsigned msg_length,
2213		unsigned header_present,
2214		unsigned simd_mode,
2215		unsigned return_format)
2216{
2217   struct brw_context *brw = p->brw;
2218   brw_inst *insn;
2219
2220   if (msg_reg_nr != -1)
2221      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2222
2223   insn = next_insn(p, BRW_OPCODE_SEND);
2224   brw_inst_set_pred_control(brw, insn, BRW_PREDICATE_NONE); /* XXX */
2225
2226   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2227    *
2228    *    "Instruction compression is not allowed for this instruction (that
2229    *     is, send). The hardware behavior is undefined if this instruction is
2230    *     set as compressed. However, compress control can be set to "SecHalf"
2231    *     to affect the EMask generation."
2232    *
2233    * No similar wording is found in later PRMs, but there are examples
2234    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2235    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2236    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2237    */
2238   if (brw_inst_qtr_control(brw, insn) != BRW_COMPRESSION_2NDHALF)
2239      brw_inst_set_qtr_control(brw, insn, BRW_COMPRESSION_NONE);
2240
2241   if (brw->gen < 6)
2242      brw_inst_set_base_mrf(brw, insn, msg_reg_nr);
2243
2244   brw_set_dest(p, insn, dest);
2245   brw_set_src0(p, insn, src0);
2246   brw_set_sampler_message(p, insn,
2247                           binding_table_index,
2248                           sampler,
2249                           msg_type,
2250                           response_length,
2251                           msg_length,
2252                           header_present,
2253                           simd_mode,
2254                           return_format);
2255}
2256
2257/* All these variables are pretty confusing - we might be better off
2258 * using bitmasks and macros for this, in the old style.  Or perhaps
2259 * just having the caller instantiate the fields in dword3 itself.
2260 */
2261void brw_urb_WRITE(struct brw_compile *p,
2262		   struct brw_reg dest,
2263		   unsigned msg_reg_nr,
2264		   struct brw_reg src0,
2265                   enum brw_urb_write_flags flags,
2266		   unsigned msg_length,
2267		   unsigned response_length,
2268		   unsigned offset,
2269		   unsigned swizzle)
2270{
2271   struct brw_context *brw = p->brw;
2272   brw_inst *insn;
2273
2274   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2275
2276   if (brw->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2277      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2278      brw_push_insn_state(p);
2279      brw_set_default_access_mode(p, BRW_ALIGN_1);
2280      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2281      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2282		       BRW_REGISTER_TYPE_UD),
2283	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2284		brw_imm_ud(0xff00));
2285      brw_pop_insn_state(p);
2286   }
2287
2288   insn = next_insn(p, BRW_OPCODE_SEND);
2289
2290   assert(msg_length < BRW_MAX_MRF);
2291
2292   brw_set_dest(p, insn, dest);
2293   brw_set_src0(p, insn, src0);
2294   brw_set_src1(p, insn, brw_imm_d(0));
2295
2296   if (brw->gen < 6)
2297      brw_inst_set_base_mrf(brw, insn, msg_reg_nr);
2298
2299   brw_set_urb_message(p,
2300		       insn,
2301		       flags,
2302		       msg_length,
2303		       response_length,
2304		       offset,
2305		       swizzle);
2306}
2307
2308static int
2309brw_find_next_block_end(struct brw_compile *p, int start_offset)
2310{
2311   int offset;
2312   void *store = p->store;
2313   const struct brw_context *brw = p->brw;
2314
2315   for (offset = next_offset(brw, store, start_offset);
2316        offset < p->next_insn_offset;
2317        offset = next_offset(brw, store, offset)) {
2318      brw_inst *insn = store + offset;
2319
2320      switch (brw_inst_opcode(brw, insn)) {
2321      case BRW_OPCODE_ENDIF:
2322      case BRW_OPCODE_ELSE:
2323      case BRW_OPCODE_WHILE:
2324      case BRW_OPCODE_HALT:
2325	 return offset;
2326      }
2327   }
2328
2329   return 0;
2330}
2331
2332/* There is no DO instruction on gen6, so to find the end of the loop
2333 * we have to see if the loop is jumping back before our start
2334 * instruction.
2335 */
2336static int
2337brw_find_loop_end(struct brw_compile *p, int start_offset)
2338{
2339   struct brw_context *brw = p->brw;
2340   int offset;
2341   int scale = 8;
2342   void *store = p->store;
2343
2344   assert(brw->gen >= 6);
2345
2346   /* Always start after the instruction (such as a WHILE) we're trying to fix
2347    * up.
2348    */
2349   for (offset = next_offset(brw, store, start_offset);
2350        offset < p->next_insn_offset;
2351        offset = next_offset(brw, store, offset)) {
2352      brw_inst *insn = store + offset;
2353
2354      if (brw_inst_opcode(brw, insn) == BRW_OPCODE_WHILE) {
2355         int jip = brw->gen == 6 ? brw_inst_gen6_jump_count(brw, insn)
2356                                 : brw_inst_jip(brw, insn);
2357	 if (offset + jip * scale <= start_offset)
2358	    return offset;
2359      }
2360   }
2361   assert(!"not reached");
2362   return start_offset;
2363}
2364
2365/* After program generation, go back and update the UIP and JIP of
2366 * BREAK, CONT, and HALT instructions to their correct locations.
2367 */
2368void
2369brw_set_uip_jip(struct brw_compile *p)
2370{
2371   struct brw_context *brw = p->brw;
2372   int offset;
2373   int scale = 8;
2374   void *store = p->store;
2375
2376   if (brw->gen < 6)
2377      return;
2378
2379   for (offset = 0; offset < p->next_insn_offset;
2380        offset = next_offset(brw, store, offset)) {
2381      brw_inst *insn = store + offset;
2382
2383      if (brw_inst_cmpt_control(brw, insn)) {
2384	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
2385         assert(brw_inst_opcode(brw, insn) != BRW_OPCODE_BREAK &&
2386                brw_inst_opcode(brw, insn) != BRW_OPCODE_CONTINUE &&
2387                brw_inst_opcode(brw, insn) != BRW_OPCODE_HALT);
2388	 continue;
2389      }
2390
2391      int block_end_offset = brw_find_next_block_end(p, offset);
2392      switch (brw_inst_opcode(brw, insn)) {
2393      case BRW_OPCODE_BREAK:
2394         assert(block_end_offset != 0);
2395         brw_inst_set_jip(brw, insn, (block_end_offset - offset) / scale);
2396	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2397         brw_inst_set_uip(brw, insn,
2398	    (brw_find_loop_end(p, offset) - offset +
2399             (brw->gen == 6 ? 16 : 0)) / scale);
2400	 break;
2401      case BRW_OPCODE_CONTINUE:
2402         assert(block_end_offset != 0);
2403         brw_inst_set_jip(brw, insn, (block_end_offset - offset) / scale);
2404         brw_inst_set_uip(brw, insn,
2405            (brw_find_loop_end(p, offset) - offset) / scale);
2406
2407         assert(brw_inst_uip(brw, insn) != 0);
2408         assert(brw_inst_jip(brw, insn) != 0);
2409	 break;
2410
2411      case BRW_OPCODE_ENDIF:
2412         if (block_end_offset == 0)
2413            brw_inst_set_jip(brw, insn, 2);
2414         else
2415            brw_inst_set_jip(brw, insn, (block_end_offset - offset) / scale);
2416	 break;
2417
2418      case BRW_OPCODE_HALT:
2419	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2420	  *
2421	  *    "In case of the halt instruction not inside any conditional
2422	  *     code block, the value of <JIP> and <UIP> should be the
2423	  *     same. In case of the halt instruction inside conditional code
2424	  *     block, the <UIP> should be the end of the program, and the
2425	  *     <JIP> should be end of the most inner conditional code block."
2426	  *
2427	  * The uip will have already been set by whoever set up the
2428	  * instruction.
2429	  */
2430	 if (block_end_offset == 0) {
2431            brw_inst_set_jip(brw, insn, brw_inst_uip(brw, insn));
2432	 } else {
2433            brw_inst_set_jip(brw, insn, (block_end_offset - offset) / scale);
2434	 }
2435         assert(brw_inst_uip(brw, insn) != 0);
2436         assert(brw_inst_jip(brw, insn) != 0);
2437	 break;
2438      }
2439   }
2440}
2441
2442void brw_ff_sync(struct brw_compile *p,
2443		   struct brw_reg dest,
2444		   unsigned msg_reg_nr,
2445		   struct brw_reg src0,
2446		   bool allocate,
2447		   unsigned response_length,
2448		   bool eot)
2449{
2450   struct brw_context *brw = p->brw;
2451   brw_inst *insn;
2452
2453   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2454
2455   insn = next_insn(p, BRW_OPCODE_SEND);
2456   brw_set_dest(p, insn, dest);
2457   brw_set_src0(p, insn, src0);
2458   brw_set_src1(p, insn, brw_imm_d(0));
2459
2460   if (brw->gen < 6)
2461      brw_inst_set_base_mrf(brw, insn, msg_reg_nr);
2462
2463   brw_set_ff_sync_message(p,
2464			   insn,
2465			   allocate,
2466			   response_length,
2467			   eot);
2468}
2469
2470/**
2471 * Emit the SEND instruction necessary to generate stream output data on Gen6
2472 * (for transform feedback).
2473 *
2474 * If send_commit_msg is true, this is the last piece of stream output data
2475 * from this thread, so send the data as a committed write.  According to the
2476 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2477 *
2478 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2479 *   writes are complete by sending the final write as a committed write."
2480 */
2481void
2482brw_svb_write(struct brw_compile *p,
2483              struct brw_reg dest,
2484              unsigned msg_reg_nr,
2485              struct brw_reg src0,
2486              unsigned binding_table_index,
2487              bool   send_commit_msg)
2488{
2489   brw_inst *insn;
2490
2491   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2492
2493   insn = next_insn(p, BRW_OPCODE_SEND);
2494   brw_set_dest(p, insn, dest);
2495   brw_set_src0(p, insn, src0);
2496   brw_set_src1(p, insn, brw_imm_d(0));
2497   brw_set_dp_write_message(p, insn,
2498                            binding_table_index,
2499                            0, /* msg_control: ignored */
2500                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2501                            1, /* msg_length */
2502                            true, /* header_present */
2503                            0, /* last_render_target: ignored */
2504                            send_commit_msg, /* response_length */
2505                            0, /* end_of_thread */
2506                            send_commit_msg); /* send_commit_msg */
2507}
2508
2509static void
2510brw_set_dp_untyped_atomic_message(struct brw_compile *p,
2511                                  brw_inst *insn,
2512                                  unsigned atomic_op,
2513                                  unsigned bind_table_index,
2514                                  unsigned msg_length,
2515                                  unsigned response_length,
2516                                  bool header_present)
2517{
2518   const struct brw_context *brw = p->brw;
2519
2520   unsigned msg_control =
2521      atomic_op | /* Atomic Operation Type: BRW_AOP_* */
2522      (response_length ? 1 << 5 : 0); /* Return data expected */
2523
2524   if (brw->gen >= 8 || brw->is_haswell) {
2525      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2526                                 msg_length, response_length,
2527                                 header_present, false);
2528
2529
2530      if (brw_inst_access_mode(brw, insn) == BRW_ALIGN_1) {
2531         if (brw_inst_exec_size(brw, insn) != BRW_EXECUTE_16)
2532            msg_control |= 1 << 4; /* SIMD8 mode */
2533
2534         brw_inst_set_dp_msg_type(brw, insn,
2535                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
2536      } else {
2537         brw_inst_set_dp_msg_type(brw, insn,
2538            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
2539      }
2540   } else {
2541      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2542                                 msg_length, response_length,
2543                                 header_present, false);
2544
2545      brw_inst_set_dp_msg_type(brw, insn, GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
2546
2547      if (brw_inst_exec_size(brw, insn) != BRW_EXECUTE_16)
2548         msg_control |= 1 << 4; /* SIMD8 mode */
2549   }
2550
2551   brw_inst_set_binding_table_index(brw, insn, bind_table_index);
2552   brw_inst_set_dp_msg_control(brw, insn, msg_control);
2553}
2554
2555void
2556brw_untyped_atomic(struct brw_compile *p,
2557                   struct brw_reg dest,
2558                   struct brw_reg mrf,
2559                   unsigned atomic_op,
2560                   unsigned bind_table_index,
2561                   unsigned msg_length,
2562                   unsigned response_length) {
2563   const struct brw_context *brw = p->brw;
2564   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
2565
2566   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2567   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2568   brw_set_src1(p, insn, brw_imm_d(0));
2569   brw_set_dp_untyped_atomic_message(
2570      p, insn, atomic_op, bind_table_index, msg_length, response_length,
2571      brw_inst_access_mode(brw, insn) == BRW_ALIGN_1);
2572}
2573
2574static void
2575brw_set_dp_untyped_surface_read_message(struct brw_compile *p,
2576                                        brw_inst *insn,
2577                                        unsigned bind_table_index,
2578                                        unsigned msg_length,
2579                                        unsigned response_length,
2580                                        bool header_present)
2581{
2582   const struct brw_context *brw = p->brw;
2583   const unsigned dispatch_width =
2584      (brw_inst_exec_size(brw, insn) == BRW_EXECUTE_16 ? 16 : 8);
2585   const unsigned num_channels = response_length / (dispatch_width / 8);
2586
2587   if (brw->gen >= 8 || brw->is_haswell) {
2588      brw_set_message_descriptor(p, insn, HSW_SFID_DATAPORT_DATA_CACHE_1,
2589                                 msg_length, response_length,
2590                                 header_present, false);
2591
2592      brw_inst_set_dp_msg_type(brw, insn,
2593                               HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ);
2594   } else {
2595      brw_set_message_descriptor(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
2596                                 msg_length, response_length,
2597                                 header_present, false);
2598
2599      brw_inst_set_dp_msg_type(brw, insn,
2600                               GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ);
2601   }
2602
2603   /* Set mask of 32-bit channels to drop. */
2604   unsigned msg_control = (0xf & (0xf << num_channels));
2605
2606   if (brw_inst_access_mode(brw, insn) == BRW_ALIGN_1) {
2607      if (dispatch_width == 16)
2608         msg_control |= 1 << 4; /* SIMD16 mode */
2609      else
2610         msg_control |= 2 << 4; /* SIMD8 mode */
2611   }
2612
2613   brw_inst_set_binding_table_index(brw, insn, bind_table_index);
2614   brw_inst_set_dp_msg_control(brw, insn, msg_control);
2615}
2616
2617void
2618brw_untyped_surface_read(struct brw_compile *p,
2619                         struct brw_reg dest,
2620                         struct brw_reg mrf,
2621                         unsigned bind_table_index,
2622                         unsigned msg_length,
2623                         unsigned response_length)
2624{
2625   const struct brw_context *brw = p->brw;
2626   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2627
2628   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UD));
2629   brw_set_src0(p, insn, retype(mrf, BRW_REGISTER_TYPE_UD));
2630   brw_set_dp_untyped_surface_read_message(
2631      p, insn, bind_table_index, msg_length, response_length,
2632      brw_inst_access_mode(brw, insn) == BRW_ALIGN_1);
2633}
2634
2635void
2636brw_pixel_interpolator_query(struct brw_compile *p,
2637                             struct brw_reg dest,
2638                             struct brw_reg mrf,
2639                             bool noperspective,
2640                             unsigned mode,
2641                             unsigned data,
2642                             unsigned msg_length,
2643                             unsigned response_length)
2644{
2645   const struct brw_context *brw = p->brw;
2646   struct brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2647
2648   brw_set_dest(p, insn, dest);
2649   brw_set_src0(p, insn, mrf);
2650   brw_set_message_descriptor(p, insn, GEN7_SFID_PIXEL_INTERPOLATOR,
2651                              msg_length, response_length,
2652                              false /* header is never present for PI */,
2653                              false);
2654
2655   brw_inst_set_pi_simd_mode(
2656         brw, insn, brw_inst_exec_size(brw, insn) == BRW_EXECUTE_16);
2657   brw_inst_set_pi_slot_group(brw, insn, 0); /* zero unless 32/64px dispatch */
2658   brw_inst_set_pi_nopersp(brw, insn, noperspective);
2659   brw_inst_set_pi_message_type(brw, insn, mode);
2660   brw_inst_set_pi_message_data(brw, insn, data);
2661}
2662
2663/**
2664 * This instruction is generated as a single-channel align1 instruction by
2665 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2666 *
2667 * We can't use the typed atomic op in the FS because that has the execution
2668 * mask ANDed with the pixel mask, but we just want to write the one dword for
2669 * all the pixels.
2670 *
2671 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2672 * one u32.  So we use the same untyped atomic write message as the pixel
2673 * shader.
2674 *
2675 * The untyped atomic operation requires a BUFFER surface type with RAW
2676 * format, and is only accessible through the legacy DATA_CACHE dataport
2677 * messages.
2678 */
2679void brw_shader_time_add(struct brw_compile *p,
2680                         struct brw_reg payload,
2681                         uint32_t surf_index)
2682{
2683   assert(p->brw->gen >= 7);
2684
2685   brw_push_insn_state(p);
2686   brw_set_default_access_mode(p, BRW_ALIGN_1);
2687   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2688   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
2689   brw_pop_insn_state(p);
2690
2691   /* We use brw_vec1_reg and unmasked because we want to increment the given
2692    * offset only once.
2693    */
2694   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2695                                      BRW_ARF_NULL, 0));
2696   brw_set_src0(p, send, brw_vec1_reg(payload.file,
2697                                      payload.nr, 0));
2698   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, surf_index,
2699                                     2 /* message length */,
2700                                     0 /* response length */,
2701                                     false /* header present */);
2702}
2703