brw_eu_emit.c revision 5c5218ea6163f694a256562df1d73a108396e40d
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   /* Pre-gen6 hardware performs the implied move itself; nothing to do. */
   if (intel->gen < 6)
      return;

   /* Already a message register: no explicit move needed. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Emit an uncompressed MOV with the execution mask disabled, copying
       * the operand into m<msg_reg_nr>.  Both sides are retyped to UD so
       * the copy is a raw bit move regardless of the operand's type.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Redirect the caller's source at the message register (done even for
    * the null register, which skips the MOV above).
    */
   *src = brw_message_reg(msg_reg_nr);
}
83
84static void
85gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
86{
87   /* From the BSpec / ISA Reference / send - [DevIVB+]:
88    * "The send with EOT should use register space R112-R127 for <src>. This is
89    *  to enable loading of a new thread into the same slot while the message
90    *  with EOT for current thread is pending dispatch."
91    *
92    * Since we're pretending to have 16 MRFs anyway, we may as well use the
93    * registers required for messages with EOT.
94    */
95   struct intel_context *intel = &p->brw->intel;
96   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
97      reg->file = BRW_GENERAL_REGISTER_FILE;
98      reg->nr += GEN7_MRF_HACK_START;
99   }
100}
101
102
/**
 * Encode \p dest as the destination operand of \p insn, and set the
 * instruction's execution size from the destination width.
 *
 * Handles all four combinations of direct/register-indirect addressing
 * and align1/align16 access mode; each combination has a different bit
 * layout in the instruction word.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
	     struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* On gen7, MRFs are emulated using the top of the GRF file. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* Stride 0 is not a valid destination stride; promote it to 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16 subregister numbers are in units of 16 bytes. */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
	  *    this to be programmed as "01".
	  */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
159
160extern int reg_type_size[];
161
162static void
163validate_reg(struct brw_instruction *insn, struct brw_reg reg)
164{
165   int hstride_for_reg[] = {0, 1, 2, 4};
166   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
167   int width_for_reg[] = {1, 2, 4, 8, 16};
168   int execsize_for_reg[] = {1, 2, 4, 8, 16};
169   int width, hstride, vstride, execsize;
170
171   if (reg.file == BRW_IMMEDIATE_VALUE) {
172      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
173       * mean the destination has to be 128-bit aligned and the
174       * destination horiz stride has to be a word.
175       */
176      if (reg.type == BRW_REGISTER_TYPE_V) {
177	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
178		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
179      }
180
181      return;
182   }
183
184   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
185       reg.file == BRW_ARF_NULL)
186      return;
187
188   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
189   hstride = hstride_for_reg[reg.hstride];
190
191   if (reg.vstride == 0xf) {
192      vstride = -1;
193   } else {
194      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
195      vstride = vstride_for_reg[reg.vstride];
196   }
197
198   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
199   width = width_for_reg[reg.width];
200
201   assert(insn->header.execution_size >= 0 &&
202	  insn->header.execution_size < Elements(execsize_for_reg));
203   execsize = execsize_for_reg[insn->header.execution_size];
204
205   /* Restrictions from 3.3.10: Register Region Restrictions. */
206   /* 3. */
207   assert(execsize >= width);
208
209   /* 4. */
210   if (execsize == width && hstride != 0) {
211      assert(vstride == -1 || vstride == width * hstride);
212   }
213
214   /* 5. */
215   if (execsize == width && hstride == 0) {
216      /* no restriction on vstride. */
217   }
218
219   /* 6. */
220   if (width == 1) {
221      assert(hstride == 0);
222   }
223
224   /* 7. */
225   if (execsize == 1 && width == 1) {
226      assert(hstride == 0);
227      assert(vstride == 0);
228   }
229
230   /* 8. */
231   if (vstride == 0 && hstride == 0) {
232      assert(width == 1);
233   }
234
235   /* 10. Check destination issues. */
236}
237
238void
239brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
240	     struct brw_reg reg)
241{
242   struct brw_context *brw = p->brw;
243   struct intel_context *intel = &brw->intel;
244
245   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
246      assert(reg.nr < 128);
247
248   gen7_convert_mrf_to_grf(p, &reg);
249
250   if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
251                           insn->header.opcode == BRW_OPCODE_SENDC)) {
252      /* Any source modifiers or regions will be ignored, since this just
253       * identifies the MRF/GRF to start reading the message contents from.
254       * Check for some likely failures.
255       */
256      assert(!reg.negate);
257      assert(!reg.abs);
258      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
259   }
260
261   validate_reg(insn, reg);
262
263   insn->bits1.da1.src0_reg_file = reg.file;
264   insn->bits1.da1.src0_reg_type = reg.type;
265   insn->bits2.da1.src0_abs = reg.abs;
266   insn->bits2.da1.src0_negate = reg.negate;
267   insn->bits2.da1.src0_address_mode = reg.address_mode;
268
269   if (reg.file == BRW_IMMEDIATE_VALUE) {
270      insn->bits3.ud = reg.dw1.ud;
271
272      /* Required to set some fields in src1 as well:
273       */
274      insn->bits1.da1.src1_reg_file = 0; /* arf */
275      insn->bits1.da1.src1_reg_type = reg.type;
276   }
277   else
278   {
279      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
280	 if (insn->header.access_mode == BRW_ALIGN_1) {
281	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
282	    insn->bits2.da1.src0_reg_nr = reg.nr;
283	 }
284	 else {
285	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
286	    insn->bits2.da16.src0_reg_nr = reg.nr;
287	 }
288      }
289      else {
290	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
291
292	 if (insn->header.access_mode == BRW_ALIGN_1) {
293	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
294	 }
295	 else {
296	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
297	 }
298      }
299
300      if (insn->header.access_mode == BRW_ALIGN_1) {
301	 if (reg.width == BRW_WIDTH_1 &&
302	     insn->header.execution_size == BRW_EXECUTE_1) {
303	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
304	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
305	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
306	 }
307	 else {
308	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
309	    insn->bits2.da1.src0_width = reg.width;
310	    insn->bits2.da1.src0_vert_stride = reg.vstride;
311	 }
312      }
313      else {
314	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
315	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
316	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
317	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
318
319	 /* This is an oddity of the fact we're using the same
320	  * descriptions for registers in align_16 as align_1:
321	  */
322	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
323	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
324	 else
325	    insn->bits2.da16.src0_vert_stride = reg.vstride;
326      }
327   }
328}
329
330
331void brw_set_src1(struct brw_compile *p,
332		  struct brw_instruction *insn,
333		  struct brw_reg reg)
334{
335   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
336
337   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
338      assert(reg.nr < 128);
339
340   gen7_convert_mrf_to_grf(p, &reg);
341
342   validate_reg(insn, reg);
343
344   insn->bits1.da1.src1_reg_file = reg.file;
345   insn->bits1.da1.src1_reg_type = reg.type;
346   insn->bits3.da1.src1_abs = reg.abs;
347   insn->bits3.da1.src1_negate = reg.negate;
348
349   /* Only src1 can be immediate in two-argument instructions.
350    */
351   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
352
353   if (reg.file == BRW_IMMEDIATE_VALUE) {
354      insn->bits3.ud = reg.dw1.ud;
355   }
356   else {
357      /* This is a hardware restriction, which may or may not be lifted
358       * in the future:
359       */
360      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
361      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
362
363      if (insn->header.access_mode == BRW_ALIGN_1) {
364	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
365	 insn->bits3.da1.src1_reg_nr = reg.nr;
366      }
367      else {
368	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
369	 insn->bits3.da16.src1_reg_nr = reg.nr;
370      }
371
372      if (insn->header.access_mode == BRW_ALIGN_1) {
373	 if (reg.width == BRW_WIDTH_1 &&
374	     insn->header.execution_size == BRW_EXECUTE_1) {
375	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
376	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
377	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
378	 }
379	 else {
380	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
381	    insn->bits3.da1.src1_width = reg.width;
382	    insn->bits3.da1.src1_vert_stride = reg.vstride;
383	 }
384      }
385      else {
386	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
387	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
388	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
389	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
390
391	 /* This is an oddity of the fact we're using the same
392	  * descriptions for registers in align_16 as align_1:
393	  */
394	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
395	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
396	 else
397	    insn->bits3.da16.src1_vert_stride = reg.vstride;
398      }
399   }
400}
401
402/**
403 * Set the Message Descriptor and Extended Message Descriptor fields
404 * for SEND messages.
405 *
406 * \note This zeroes out the Function Control bits, so it must be called
407 *       \b before filling out any message-specific data.  Callers can
408 *       choose not to fill in irrelevant bits; they will be zero.
409 */
static void
brw_set_message_descriptor(struct brw_compile *p,
			   struct brw_instruction *inst,
			   enum brw_message_target sfid,
			   unsigned msg_length,
			   unsigned response_length,
			   bool header_present,
			   bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   /* Clear the whole descriptor (src1 slot) first; the per-message
    * setters below then fill in their specific fields.
    */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
	 /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
	 inst->header.destreg__conditionalmod = sfid;
      } else {
	 /* Set Extended Message Descriptor (ex_desc) */
	 inst->bits2.send_gen5.sfid = sfid;
	 inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Gen4: everything lives in the message descriptor itself. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
444
445static void brw_set_math_message( struct brw_compile *p,
446				  struct brw_instruction *insn,
447				  GLuint function,
448				  GLuint integer_type,
449				  bool low_precision,
450				  GLuint dataType )
451{
452   struct brw_context *brw = p->brw;
453   struct intel_context *intel = &brw->intel;
454   unsigned msg_length;
455   unsigned response_length;
456
457   /* Infer message length from the function */
458   switch (function) {
459   case BRW_MATH_FUNCTION_POW:
460   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
461   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
462   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
463      msg_length = 2;
464      break;
465   default:
466      msg_length = 1;
467      break;
468   }
469
470   /* Infer response length from the function */
471   switch (function) {
472   case BRW_MATH_FUNCTION_SINCOS:
473   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
474      response_length = 2;
475      break;
476   default:
477      response_length = 1;
478      break;
479   }
480
481
482   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
483			      msg_length, response_length, false, false);
484   if (intel->gen == 5) {
485      insn->bits3.math_gen5.function = function;
486      insn->bits3.math_gen5.int_type = integer_type;
487      insn->bits3.math_gen5.precision = low_precision;
488      insn->bits3.math_gen5.saturate = insn->header.saturate;
489      insn->bits3.math_gen5.data_type = dataType;
490      insn->bits3.math_gen5.snapshot = 0;
491   } else {
492      insn->bits3.math.function = function;
493      insn->bits3.math.int_type = integer_type;
494      insn->bits3.math.precision = low_precision;
495      insn->bits3.math.saturate = insn->header.saturate;
496      insn->bits3.math.data_type = dataType;
497   }
498   insn->header.saturate = 0;
499}
500
501
/* Fill out the descriptor for an FF_SYNC URB message (message length is
 * always 1: just the header).  Most urb_gen5 fields are unused by FF_SYNC
 * and are cleared explicitly.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    bool allocate,
				    GLuint response_length,
				    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
517
/* Fill out the message descriptor for a URB write.  The field layout
 * differs per generation (gen7 / gen5-6 / gen4).
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 bool allocate,
				 bool used,
				 GLuint msg_length,
				 GLuint response_length,
				 bool end_of_thread,
				 bool complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
			      msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* Transpose swizzle is not supported by this gen7 encoding. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
   }
}
558
/* Fill out the message descriptor for a data-port write.  Picks the
 * shared function (SFID) by generation, then writes the per-generation
 * descriptor fields.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
			 struct brw_instruction *insn,
			 GLuint binding_table_index,
			 GLuint msg_control,
			 GLuint msg_type,
			 GLuint msg_length,
			 bool header_present,
			 GLuint last_render_target,
			 GLuint response_length,
			 GLuint end_of_thread,
			 GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = last_render_target;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = last_render_target;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
617
/* Fill out the message descriptor for a data-port read.  Picks the
 * shared function (SFID) by generation and target cache, then writes
 * the per-generation descriptor fields.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
                        bool header_present,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
	 sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
	 sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
			      header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.last_render_target = 0;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.last_render_target = 0;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
675
/* Fill out the message descriptor for a sampler send.  Field layout
 * differs across gen7 / gen5-6 / G4x / gen4; return_format only exists
 * in the original gen4 encoding.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
			      response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
715
716
717#define next_insn brw_next_insn
718struct brw_instruction *
719brw_next_insn(struct brw_compile *p, GLuint opcode)
720{
721   struct brw_instruction *insn;
722
723   if (p->nr_insn + 1 > p->store_size) {
724      if (0)
725         printf("incresing the store size to %d\n", p->store_size << 1);
726      p->store_size <<= 1;
727      p->store = reralloc(p->mem_ctx, p->store,
728                          struct brw_instruction, p->store_size);
729      if (!p->store)
730         assert(!"realloc eu store memeory failed");
731   }
732
733   p->next_insn_offset += 16;
734   insn = &p->store[p->nr_insn++];
735   memcpy(insn, p->current, sizeof(*insn));
736
737   /* Reset this one-shot flag:
738    */
739
740   if (p->current->header.destreg__conditionalmod) {
741      p->current->header.destreg__conditionalmod = 0;
742      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
743   }
744
745   insn->header.opcode = opcode;
746   return insn;
747}
748
749static struct brw_instruction *brw_alu1( struct brw_compile *p,
750					 GLuint opcode,
751					 struct brw_reg dest,
752					 struct brw_reg src )
753{
754   struct brw_instruction *insn = next_insn(p, opcode);
755   brw_set_dest(p, insn, dest);
756   brw_set_src0(p, insn, src);
757   return insn;
758}
759
760static struct brw_instruction *brw_alu2(struct brw_compile *p,
761					GLuint opcode,
762					struct brw_reg dest,
763					struct brw_reg src0,
764					struct brw_reg src1 )
765{
766   struct brw_instruction *insn = next_insn(p, opcode);
767   brw_set_dest(p, insn, dest);
768   brw_set_src0(p, insn, src0);
769   brw_set_src1(p, insn, src1);
770   return insn;
771}
772
773static int
774get_3src_subreg_nr(struct brw_reg reg)
775{
776   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
777      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
778      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
779   } else {
780      return reg.subnr / 4;
781   }
782}
783
/* Emit a three-source (MAD/LRP-style) instruction.  These are always
 * align16, float-typed, and restricted to GRF (or MRF-as-dest) operands.
 */
static struct brw_instruction *brw_alu3(struct brw_compile *p,
					GLuint opcode,
					struct brw_reg dest,
					struct brw_reg src0,
					struct brw_reg src1,
					struct brw_reg src2)
{
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
	  dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F);
   /* 3-src dest_reg_file is a single bit: 0 = GRF, 1 = MRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   assert(src0.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar (vstride 0) source to all channels. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   assert(src1.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   /* src1's subregister number straddles the bits2/bits3 boundary. */
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   assert(src2.type == BRW_REGISTER_TYPE_F);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   return insn;
}
844
845
846/***********************************************************************
847 * Convenience routines.
848 */
/* ALU1: define a wrapper brw_<OP>() that emits a one-source instruction
 * "OP dest, src0" via brw_alu1().
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* ALU2: define a wrapper brw_<OP>() that emits a two-source instruction
 * "OP dest, src0, src1" via brw_alu2().
 */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* ALU3: define a wrapper brw_<OP>() that emits a three-source instruction
 * "OP dest, src0, src1, src2" via brw_alu3().
 */
#define ALU3(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{							\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}
875
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 * (On gen6+ only the round instruction itself is emitted; the `add`
 * local below is then never assigned or used.)
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
									      \
   if (p->brw->intel.gen < 6) {						      \
      /* turn on round-increments */					      \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		      \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
   }									      \
}
900
901
/* Instantiate the convenience wrappers (brw_MOV(), brw_SEL(), ...) for the
 * simple ALU opcodes.  ADD, AVG and MUL are written out by hand below
 * because they carry extra operand-type assertions.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(F32TO16)
ALU1(F16TO32)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)
ALU3(LRP)

ROUND(RNDZ)
ROUND(RNDE)
931
932
933struct brw_instruction *brw_ADD(struct brw_compile *p,
934				struct brw_reg dest,
935				struct brw_reg src0,
936				struct brw_reg src1)
937{
938   /* 6.2.2: add */
939   if (src0.type == BRW_REGISTER_TYPE_F ||
940       (src0.file == BRW_IMMEDIATE_VALUE &&
941	src0.type == BRW_REGISTER_TYPE_VF)) {
942      assert(src1.type != BRW_REGISTER_TYPE_UD);
943      assert(src1.type != BRW_REGISTER_TYPE_D);
944   }
945
946   if (src1.type == BRW_REGISTER_TYPE_F ||
947       (src1.file == BRW_IMMEDIATE_VALUE &&
948	src1.type == BRW_REGISTER_TYPE_VF)) {
949      assert(src0.type != BRW_REGISTER_TYPE_UD);
950      assert(src0.type != BRW_REGISTER_TYPE_D);
951   }
952
953   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
954}
955
956struct brw_instruction *brw_AVG(struct brw_compile *p,
957                                struct brw_reg dest,
958                                struct brw_reg src0,
959                                struct brw_reg src1)
960{
961   assert(dest.type == src0.type);
962   assert(src0.type == src1.type);
963   switch (src0.type) {
964   case BRW_REGISTER_TYPE_B:
965   case BRW_REGISTER_TYPE_UB:
966   case BRW_REGISTER_TYPE_W:
967   case BRW_REGISTER_TYPE_UW:
968   case BRW_REGISTER_TYPE_D:
969   case BRW_REGISTER_TYPE_UD:
970      break;
971   default:
972      assert(!"Bad type for brw_AVG");
973   }
974
975   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
976}
977
978struct brw_instruction *brw_MUL(struct brw_compile *p,
979				struct brw_reg dest,
980				struct brw_reg src0,
981				struct brw_reg src1)
982{
983   /* 6.32.38: mul */
984   if (src0.type == BRW_REGISTER_TYPE_D ||
985       src0.type == BRW_REGISTER_TYPE_UD ||
986       src1.type == BRW_REGISTER_TYPE_D ||
987       src1.type == BRW_REGISTER_TYPE_UD) {
988      assert(dest.type != BRW_REGISTER_TYPE_F);
989   }
990
991   if (src0.type == BRW_REGISTER_TYPE_F ||
992       (src0.file == BRW_IMMEDIATE_VALUE &&
993	src0.type == BRW_REGISTER_TYPE_VF)) {
994      assert(src1.type != BRW_REGISTER_TYPE_UD);
995      assert(src1.type != BRW_REGISTER_TYPE_D);
996   }
997
998   if (src1.type == BRW_REGISTER_TYPE_F ||
999       (src1.file == BRW_IMMEDIATE_VALUE &&
1000	src1.type == BRW_REGISTER_TYPE_VF)) {
1001      assert(src0.type != BRW_REGISTER_TYPE_UD);
1002      assert(src0.type != BRW_REGISTER_TYPE_D);
1003   }
1004
1005   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1006	  src0.nr != BRW_ARF_ACCUMULATOR);
1007   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1008	  src1.nr != BRW_ARF_ACCUMULATOR);
1009
1010   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1011}
1012
1013
1014void brw_NOP(struct brw_compile *p)
1015{
1016   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1017   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1018   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1019   brw_set_src1(p, insn, brw_imm_ud(0x0));
1020}
1021
1022
1023
1024
1025
1026/***********************************************************************
1027 * Comparisons, if/else/endif
1028 */
1029
/* Emit a JMPI (jump indexed) instruction.  JMPI is inherently scalar, so
 * execution size is forced to 1 channel with compression and masking
 * disabled.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* Clear predication in the default instruction state so that
    * subsequently emitted instructions aren't accidentally predicated.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1045
/* Push an IF/ELSE instruction onto the if-stack.  The instruction is
 * stored as an index into p->store rather than a pointer, because
 * next_insn() may reallocate the store and invalidate raw pointers
 * (see the comment in brw_ENDIF).
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   /* Grow after pushing so there is always a free slot for the next push. */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
			     p->if_stack_array_size);
   }
}
1058
1059static struct brw_instruction *
1060pop_if_stack(struct brw_compile *p)
1061{
1062   p->if_stack_depth--;
1063   return &p->store[p->if_stack[p->if_stack_depth]];
1064}
1065
1066static void
1067push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1068{
1069   if (p->loop_stack_array_size < p->loop_stack_depth) {
1070      p->loop_stack_array_size *= 2;
1071      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1072			       p->loop_stack_array_size);
1073      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1074				     p->loop_stack_array_size);
1075   }
1076
1077   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1078   p->loop_stack_depth++;
1079   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1080}
1081
/* Return the instruction at the top of the loop stack: the DO instruction
 * (or, on gen6+/SPF, the first instruction) of the innermost active loop.
 */
static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}
1087
1088/* EU takes the value from the flag register and pushes it onto some
1089 * sort of a stack (presumably merging with any flag value already on
1090 * the stack).  Within an if block, the flags at the top of the stack
1091 * control execution on each channel of the unit, eg. on each of the
1092 * 16 pixel values in our wm programs.
1093 *
1094 * When the matching 'else' instruction is reached (presumably by
1095 * countdown of the instruction count patched in by our ELSE/ENDIF
1096 * functions), the relevent flags are inverted.
1097 *
1098 * When the matching 'endif' instruction is reached, the flags are
1099 * popped off.  If the stack is now empty, normal execution resumes.
1100 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      /* Pre-gen6: IF operates on IP; the jump target is filled in later
       * by patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      /* Gen6: the branch offset lives in the gen6 jump_count field,
       * patched later by patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      /* Gen7+: branch offsets are the JIP/UIP pair, patched later. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let following instructions inherit the IF's predication. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1141
1142/* This function is only used for gen6-style IF instructions with an
1143 * embedded comparison (conditional modifier).  It is not used on gen7.
1144 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   /* The branch offset is filled in later by patch_IF_ELSE(). */
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* On gen6-style IF, the conditional-modifier field holds the embedded
    * comparison applied to src0/src1.
    */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
1173
1174/**
1175 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1176 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* The ADD immediates are IP deltas in bytes; each native
       * instruction is 16 bytes (128 bits), hence the * 16.
       */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1213
1214/**
1215 * Patch IF and ELSE instructions with appropriate jump targets.
1216 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   /* br is the branch-offset unit: jump counts are expressed per 64-bit
    * chunk from gen5 onward, so one 128-bit instruction counts as 2.
    */
   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1303
/* Emit an ELSE instruction.  Operand encoding varies by generation, like
 * brw_IF; the jump targets are left zero here and patched later by
 * patch_IF_ELSE() from brw_ENDIF().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* The ELSE sits on top of its IF on the if-stack; brw_ENDIF pops both. */
   push_if_stack(p, insn);
}
1336
/* Close an IF/ELSE construct: pop the IF (and optional ELSE) from the
 * if-stack, emit an ENDIF where appropriate, and patch all jump targets.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Operand encoding for ENDIF varies by generation, like brw_IF. */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1415
/* Emit a BREAK instruction.  Pre-gen6, pop_count records how many if-stack
 * entries inside the current loop must be popped when the break is taken;
 * the jump target itself is patched later (see brw_patch_break_cont).
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1438
1439struct brw_instruction *gen6_CONT(struct brw_compile *p)
1440{
1441   struct brw_instruction *insn;
1442
1443   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1444   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1445   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1446   brw_set_dest(p, insn, brw_ip_reg());
1447   brw_set_src0(p, insn, brw_ip_reg());
1448   brw_set_src1(p, insn, brw_imm_d(0x0));
1449
1450   insn->header.compression_control = BRW_COMPRESSION_NONE;
1451   insn->header.execution_size = BRW_EXECUTE_8;
1452   return insn;
1453}
1454
/* Emit a pre-gen6 CONTINUE instruction.  Like brw_BREAK, pop_count holds
 * the number of enclosing if-stack entries inside the current loop; the
 * jump target is patched later by brw_patch_break_cont().
 */
struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}
1469
/* Emit a gen6+ HALT instruction.  src1 carries the UIP and JIP branch
 * offsets, both left zero here and updated later by the caller.
 */
struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}
1487
1488/* DO/WHILE loop:
1489 *
1490 * The DO/WHILE is just an unterminated loop -- break or continue are
1491 * used for control within the loop.  We have a few ways they can be
1492 * done.
1493 *
1494 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1495 * jip and no DO instruction.
1496 *
1497 * For non-uniform control flow pre-gen6, there's a DO instruction to
1498 * push the mask, and a WHILE to jump back, and BREAK to get out and
1499 * pop the mask.
1500 *
1501 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1502 * just points back to the first instruction of the loop.
1503 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; record the position of the next
       * instruction (the loop's first) on the loop stack instead.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1531
1532/**
1533 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1534 * instruction here.
1535 *
1536 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1537 * nesting, since it can always just point to the end of the block/current loop.
1538 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* br: jump-count units per instruction (gen5 counts 64-bit chunks). */
   int br = (intel->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to its DO, fixing up every BREAK and
    * CONTINUE in between.  BREAK jumps just past the WHILE; CONTINUE
    * jumps to the WHILE itself.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1561
/* Emit the WHILE that closes the innermost loop opened by brw_DO(), and
 * pop the loop stack.  The backward branch offset to the loop start is
 * encoded differently on each generation.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   /* Jump counts are in 64-bit chunks from gen5 on, so 2 per instruction. */
   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      /* Gen7+: backward offset goes in the JIP field. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      /* Gen6: backward offset goes in the gen6 jump_count field. */
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF: the loop branch is just a scalar ADD to IP (16 bytes per
	  * instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Fix up any BREAK/CONT in the loop body now that the WHILE's
	  * position is known.
	  */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1625
1626
1627/* FORWARD JUMPS:
1628 */
/* Patch a previously emitted forward JMPI (at index jmp_insn_idx) so that
 * it lands on the next instruction to be emitted.  The offset is measured
 * in 64-bit chunks on gen5+ (2 per instruction), 1 per instruction before.
 */
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   GLuint jmpi = 1;

   if (intel->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
}
1643
1644
1645
1646/* To integrate with the above, it makes sense that the comparison
1647 * instruction should populate the flag register.  It might be simpler
1648 * just to use the flag reg for most WM tasks?
1649 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    *
    * This only triggers when the CMP writes the null register (ARF nr 0),
    * i.e. when the caller wants just the flag result.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}
1676
1677/* Issue 'wait' instruction for n1, host could program MMIO
1678   to wake up thread. */
void brw_WAIT (struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   /* WAIT uses the notification register n1 as both dest and src0. */
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}
1691
1692
1693/***********************************************************************
1694 * Helpers for the various SEND message types:
1695 */
1696
1697/** Extended math function, float[8].
1698 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* Gen6+: math is a native MATH opcode rather than a SEND to the
       * extended math shared function, so msg_reg_nr/data_type/precision
       * are unused on this path.
       */
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* Integer-divide variants take integer sources; everything else
       * operates on floats.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      /* Pre-gen6: math goes through a SEND message to the math unit. */
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1761
1762/** Extended math function, float[8].
1763 */
1764void brw_math2(struct brw_compile *p,
1765	       struct brw_reg dest,
1766	       GLuint function,
1767	       struct brw_reg src0,
1768	       struct brw_reg src1)
1769{
1770   struct intel_context *intel = &p->brw->intel;
1771   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1772
1773   assert(intel->gen >= 6);
1774   (void) intel;
1775
1776
1777   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1778          (intel->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1779   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1780   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1781
1782   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1783   if (intel->gen == 6) {
1784      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1785      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1786   }
1787
1788   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1789       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1790       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1791      assert(src0.type != BRW_REGISTER_TYPE_F);
1792      assert(src1.type != BRW_REGISTER_TYPE_F);
1793   } else {
1794      assert(src0.type == BRW_REGISTER_TYPE_F);
1795      assert(src1.type == BRW_REGISTER_TYPE_F);
1796   }
1797
1798   /* Source modifiers are ignored for extended math instructions on Gen6. */
1799   if (intel->gen == 6) {
1800      assert(!src0.negate);
1801      assert(!src0.abs);
1802      assert(!src1.negate);
1803      assert(!src1.abs);
1804   }
1805
1806   /* Math is the same ISA format as other opcodes, except that CondModifier
1807    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1808    */
1809   insn->header.destreg__conditionalmod = function;
1810
1811   brw_set_dest(p, insn, dest);
1812   brw_set_src0(p, insn, src0);
1813   brw_set_src1(p, insn, src1);
1814}
1815
1816
1817/**
1818 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1819 * using a constant offset per channel.
1820 *
1821 * The offset must be aligned to oword size (16 bytes).  Used for
1822 * register spilling.
1823 */
1824void brw_oword_block_write_scratch(struct brw_compile *p,
1825				   struct brw_reg mrf,
1826				   int num_regs,
1827				   GLuint offset)
1828{
1829   struct intel_context *intel = &p->brw->intel;
1830   uint32_t msg_control, msg_type;
1831   int mlen;
1832
1833   if (intel->gen >= 6)
1834      offset /= 16;
1835
1836   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1837
1838   if (num_regs == 1) {
1839      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1840      mlen = 2;
1841   } else {
1842      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1843      mlen = 3;
1844   }
1845
1846   /* Set up the message header.  This is g0, with g0.2 filled with
1847    * the offset.  We don't want to leave our offset around in g0 or
1848    * it'll screw up texture samples, so set it up inside the message
1849    * reg.
1850    */
1851   {
1852      brw_push_insn_state(p);
1853      brw_set_mask_control(p, BRW_MASK_DISABLE);
1854      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1855
1856      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1857
1858      /* set message header global offset field (reg 0, element 2) */
1859      brw_MOV(p,
1860	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1861				  mrf.nr,
1862				  2), BRW_REGISTER_TYPE_UD),
1863	      brw_imm_ud(offset));
1864
1865      brw_pop_insn_state(p);
1866   }
1867
1868   {
1869      struct brw_reg dest;
1870      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1871      int send_commit_msg;
1872      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1873					 BRW_REGISTER_TYPE_UW);
1874
1875      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1876	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1877	 src_header = vec16(src_header);
1878      }
1879      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1880      insn->header.destreg__conditionalmod = mrf.nr;
1881
1882      /* Until gen6, writes followed by reads from the same location
1883       * are not guaranteed to be ordered unless write_commit is set.
1884       * If set, then a no-op write is issued to the destination
1885       * register to set a dependency, and a read from the destination
1886       * can be used to ensure the ordering.
1887       *
1888       * For gen6, only writes between different threads need ordering
1889       * protection.  Our use of DP writes is all about register
1890       * spilling within a thread.
1891       */
1892      if (intel->gen >= 6) {
1893	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1894	 send_commit_msg = 0;
1895      } else {
1896	 dest = src_header;
1897	 send_commit_msg = 1;
1898      }
1899
1900      brw_set_dest(p, insn, dest);
1901      if (intel->gen >= 6) {
1902	 brw_set_src0(p, insn, mrf);
1903      } else {
1904	 brw_set_src0(p, insn, brw_null_reg());
1905      }
1906
1907      if (intel->gen >= 6)
1908	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1909      else
1910	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1911
1912      brw_set_dp_write_message(p,
1913			       insn,
1914			       255, /* binding table index (255=stateless) */
1915			       msg_control,
1916			       msg_type,
1917			       mlen,
1918			       true, /* header_present */
1919			       0, /* not a render target */
1920			       send_commit_msg, /* response_length */
1921			       0, /* eot */
1922			       send_commit_msg);
1923   }
1924}
1925
1926
1927/**
1928 * Read a block of owords (half a GRF each) from the scratch buffer
1929 * using a constant index per channel.
1930 *
1931 * Offset must be aligned to oword size (16 bytes).  Used for register
1932 * spilling.
1933 */
1934void
1935brw_oword_block_read_scratch(struct brw_compile *p,
1936			     struct brw_reg dest,
1937			     struct brw_reg mrf,
1938			     int num_regs,
1939			     GLuint offset)
1940{
1941   struct intel_context *intel = &p->brw->intel;
1942   uint32_t msg_control;
1943   int rlen;
1944
1945   if (intel->gen >= 6)
1946      offset /= 16;
1947
1948   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1949   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1950
1951   if (num_regs == 1) {
1952      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1953      rlen = 1;
1954   } else {
1955      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1956      rlen = 2;
1957   }
1958
1959   {
1960      brw_push_insn_state(p);
1961      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1962      brw_set_mask_control(p, BRW_MASK_DISABLE);
1963
1964      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1965
1966      /* set message header global offset field (reg 0, element 2) */
1967      brw_MOV(p,
1968	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1969				  mrf.nr,
1970				  2), BRW_REGISTER_TYPE_UD),
1971	      brw_imm_ud(offset));
1972
1973      brw_pop_insn_state(p);
1974   }
1975
1976   {
1977      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1978
1979      assert(insn->header.predicate_control == 0);
1980      insn->header.compression_control = BRW_COMPRESSION_NONE;
1981      insn->header.destreg__conditionalmod = mrf.nr;
1982
1983      brw_set_dest(p, insn, dest);	/* UW? */
1984      if (intel->gen >= 6) {
1985	 brw_set_src0(p, insn, mrf);
1986      } else {
1987	 brw_set_src0(p, insn, brw_null_reg());
1988      }
1989
1990      brw_set_dp_read_message(p,
1991			      insn,
1992			      255, /* binding table index (255=stateless) */
1993			      msg_control,
1994			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1995			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1996			      1, /* msg_length */
1997                              true, /* header_present */
1998			      rlen);
1999   }
2000}
2001
2002/**
2003 * Read a float[4] vector from the data port Data Cache (const buffer).
2004 * Location (in buffer) should be a multiple of 16.
2005 * Used for fetching shader constants.
2006 */
2007void brw_oword_block_read(struct brw_compile *p,
2008			  struct brw_reg dest,
2009			  struct brw_reg mrf,
2010			  uint32_t offset,
2011			  uint32_t bind_table_index)
2012{
2013   struct intel_context *intel = &p->brw->intel;
2014
2015   /* On newer hardware, offset is in units of owords. */
2016   if (intel->gen >= 6)
2017      offset /= 16;
2018
2019   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2020
2021   brw_push_insn_state(p);
2022   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2023   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2024   brw_set_mask_control(p, BRW_MASK_DISABLE);
2025
2026   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2027
2028   /* set message header global offset field (reg 0, element 2) */
2029   brw_MOV(p,
2030	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2031			       mrf.nr,
2032			       2), BRW_REGISTER_TYPE_UD),
2033	   brw_imm_ud(offset));
2034
2035   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2036   insn->header.destreg__conditionalmod = mrf.nr;
2037
2038   /* cast dest to a uword[8] vector */
2039   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2040
2041   brw_set_dest(p, insn, dest);
2042   if (intel->gen >= 6) {
2043      brw_set_src0(p, insn, mrf);
2044   } else {
2045      brw_set_src0(p, insn, brw_null_reg());
2046   }
2047
2048   brw_set_dp_read_message(p,
2049			   insn,
2050			   bind_table_index,
2051			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2052			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2053			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2054			   1, /* msg_length */
2055                           true, /* header_present */
2056			   1); /* response_length (1 reg, 2 owords!) */
2057
2058   brw_pop_insn_state(p);
2059}
2060
2061
/**
 * Emit a render target write message (framebuffer write).
 *
 * On Gen6+ this uses SENDC (so the write waits on scoreboard dependencies)
 * with a headerless payload already placed in msg_reg_nr; earlier gens use
 * a plain SEND with the message register encoded in the instruction.
 *
 * \param dispatch_width 8 or 16 (selects the null dest execution width)
 * \param eot            true for the final write that ends the thread
 */
void brw_fb_WRITE(struct brw_compile *p,
		  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint msg_control,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  bool eot,
                  bool header_present)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_type;
   struct brw_reg dest;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (intel->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    msg_length,
			    header_present,
			    eot, /* last render target write */
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
2117
2118
2119/**
2120 * Texture sample instruction.
2121 * Note: the msg_type plus msg_length values determine exactly what kind
2122 * of sampling operation is performed.  See volume 4, page 161 of docs.
2123 */
2124void brw_SAMPLE(struct brw_compile *p,
2125		struct brw_reg dest,
2126		GLuint msg_reg_nr,
2127		struct brw_reg src0,
2128		GLuint binding_table_index,
2129		GLuint sampler,
2130		GLuint msg_type,
2131		GLuint response_length,
2132		GLuint msg_length,
2133		GLuint header_present,
2134		GLuint simd_mode,
2135		GLuint return_format)
2136{
2137   struct intel_context *intel = &p->brw->intel;
2138   struct brw_instruction *insn;
2139
2140   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2141
2142   insn = next_insn(p, BRW_OPCODE_SEND);
2143   insn->header.predicate_control = 0; /* XXX */
2144   insn->header.compression_control = BRW_COMPRESSION_NONE;
2145   if (intel->gen < 6)
2146      insn->header.destreg__conditionalmod = msg_reg_nr;
2147
2148   brw_set_dest(p, insn, dest);
2149   brw_set_src0(p, insn, src0);
2150   brw_set_sampler_message(p, insn,
2151                           binding_table_index,
2152                           sampler,
2153                           msg_type,
2154                           response_length,
2155                           msg_length,
2156                           header_present,
2157                           simd_mode,
2158                           return_format);
2159}
2160
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
/**
 * Emit a URB write message (vertex/geometry output).
 *
 * \param allocate        request a new URB handle
 * \param used            the handle's data is complete / consumable
 * \param eot             end the thread after this write
 * \param writes_complete mark the write as complete
 * \param offset          URB offset for the write
 * \param swizzle         URB swizzle control
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   bool used,
		   GLuint msg_length,
		   GLuint response_length,
		   bool eot,
		   bool writes_complete,
		   GLuint offset,
		   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   /* On Gen6+, copy src0 into the MRF if needed (no implied moves there). */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       allocate,
		       used,
		       msg_length,
		       response_length,
		       eot,
		       writes_complete,
		       offset,
		       swizzle);
}
2217
2218static int
2219next_ip(struct brw_compile *p, int ip)
2220{
2221   struct brw_instruction *insn = (void *)p->store + ip;
2222
2223   if (insn->header.cmpt_control)
2224      return ip + 8;
2225   else
2226      return ip + 16;
2227}
2228
2229static int
2230brw_find_next_block_end(struct brw_compile *p, int start)
2231{
2232   int ip;
2233   void *store = p->store;
2234
2235   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2236      struct brw_instruction *insn = store + ip;
2237
2238      switch (insn->header.opcode) {
2239      case BRW_OPCODE_ENDIF:
2240      case BRW_OPCODE_ELSE:
2241      case BRW_OPCODE_WHILE:
2242      case BRW_OPCODE_HALT:
2243	 return ip;
2244      }
2245   }
2246
2247   return 0;
2248}
2249
/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 *
 * Returns the byte offset of the WHILE that closes the loop containing
 * @start.  Asserts if no such WHILE exists.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;	/* jump counts are in units of 8 bytes */
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
	 /* Gen6 encodes the backward jump in jump_count; Gen7+ in JIP. */
	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
				   : insn->bits3.break_cont.jip;
	 /* A WHILE jumping back to or before @start closes our loop. */
	 if (ip + jip * scale <= start)
	    return ip;
      }
   }
   assert(!"not reached");
   return start;
}
2278
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * Gen6+ only; earlier hardware encodes flow control differently.  JIP/UIP
 * are stored as instruction counts (byte distance / 8).
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;	/* JIP/UIP are in units of 8 bytes */
   void *store = p->store;

   if (intel->gen < 6)
      return;

   /* Walk every instruction in the program, patching flow-control offsets. */
   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_ip != 0);
	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
	 insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, ip) - ip +
             (intel->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_ip != 0);
	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip) / scale;

	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
         /* An ENDIF with no following block end jumps to the next
          * instruction (a distance of 2 half-instructions).
          */
         if (block_end_ip == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 break;

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_ip == 0) {
	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2354
/**
 * Emit an FF_SYNC message (fixed-function synchronization for the
 * geometry pipeline).
 *
 * \param allocate        request a URB handle in the sync
 * \param response_length number of response registers expected
 * \param eot             end the thread after this message
 */
void brw_ff_sync(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   GLuint response_length,
		   bool eot)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   /* On Gen6+, copy src0 into the MRF if needed (no implied moves there). */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2382
2383/**
2384 * Emit the SEND instruction necessary to generate stream output data on Gen6
2385 * (for transform feedback).
2386 *
2387 * If send_commit_msg is true, this is the last piece of stream output data
2388 * from this thread, so send the data as a committed write.  According to the
2389 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2390 *
2391 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2392 *   writes are complete by sending the final write as a committed write."
2393 */
2394void
2395brw_svb_write(struct brw_compile *p,
2396              struct brw_reg dest,
2397              GLuint msg_reg_nr,
2398              struct brw_reg src0,
2399              GLuint binding_table_index,
2400              bool   send_commit_msg)
2401{
2402   struct brw_instruction *insn;
2403
2404   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2405
2406   insn = next_insn(p, BRW_OPCODE_SEND);
2407   brw_set_dest(p, insn, dest);
2408   brw_set_src0(p, insn, src0);
2409   brw_set_src1(p, insn, brw_imm_d(0));
2410   brw_set_dp_write_message(p, insn,
2411                            binding_table_index,
2412                            0, /* msg_control: ignored */
2413                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2414                            1, /* msg_length */
2415                            true, /* header_present */
2416                            0, /* last_render_target: ignored */
2417                            send_commit_msg, /* response_length */
2418                            0, /* end_of_thread */
2419                            send_commit_msg); /* send_commit_msg */
2420}
2421
2422/**
2423 * This instruction is generated as a single-channel align1 instruction by
2424 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2425 *
2426 * We can't use the typed atomic op in the FS because that has the execution
2427 * mask ANDed with the pixel mask, but we just want to write the one dword for
2428 * all the pixels.
2429 *
2430 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2431 * one u32.  So we use the same untyped atomic write message as the pixel
2432 * shader.
2433 *
2434 * The untyped atomic operation requires a BUFFER surface type with RAW
2435 * format, and is only accessible through the legacy DATA_CACHE dataport
2436 * messages.
2437 */
2438void brw_shader_time_add(struct brw_compile *p,
2439                         struct brw_reg payload,
2440                         uint32_t surf_index)
2441{
2442   struct intel_context *intel = &p->brw->intel;
2443   assert(intel->gen >= 7);
2444
2445   brw_push_insn_state(p);
2446   brw_set_access_mode(p, BRW_ALIGN_1);
2447   brw_set_mask_control(p, BRW_MASK_DISABLE);
2448   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
2449   brw_pop_insn_state(p);
2450
2451   /* We use brw_vec1_reg and unmasked because we want to increment the given
2452    * offset only once.
2453    */
2454   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
2455                                      BRW_ARF_NULL, 0));
2456   brw_set_src0(p, send, brw_vec1_reg(payload.file,
2457                                      payload.nr, 0));
2458
2459   uint32_t sfid, msg_type;
2460   if (intel->is_haswell) {
2461      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2462      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
2463   } else {
2464      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
2465      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
2466   }
2467
2468   bool header_present = false;
2469   bool eot = false;
2470   uint32_t mlen = 2; /* offset, value */
2471   uint32_t rlen = 0;
2472   brw_set_message_descriptor(p, send, sfid, mlen, rlen, header_present, eot);
2473
2474   send->bits3.ud |= msg_type << 14;
2475   send->bits3.ud |= 0 << 13; /* no return data */
2476   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
2477   send->bits3.ud |= BRW_AOP_ADD << 8;
2478   send->bits3.ud |= surf_index << 0;
2479}
2480