brw_eu_emit.c revision 9a21bc640188e4078075b9f8e6701853a4f0bbe4
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37
38
39
40/***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
44static void guess_execution_size(struct brw_compile *p,
45				 struct brw_instruction *insn,
46				 struct brw_reg reg)
47{
48   if (reg.width == BRW_WIDTH_8 && p->compressed)
49      insn->header.execution_size = BRW_EXECUTE_16;
50   else
51      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
52}
53
54
55/**
56 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
57 * registers, implicitly moving the operand to a message register.
58 *
59 * On Sandybridge, this is no longer the case.  This function performs the
60 * explicit move; it should be called before emitting a SEND instruction.
61 */
62static void
63gen6_resolve_implied_move(struct brw_compile *p,
64			  struct brw_reg *src,
65			  GLuint msg_reg_nr)
66{
67   struct intel_context *intel = &p->brw->intel;
68   if (intel->gen != 6)
69      return;
70
71   if (src->file == BRW_ARCHITECTURE_REGISTER_FILE && src->nr == BRW_ARF_NULL)
72      return;
73
74   brw_push_insn_state(p);
75   brw_set_mask_control(p, BRW_MASK_DISABLE);
76   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
77   brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
78	   retype(*src, BRW_REGISTER_TYPE_UD));
79   brw_pop_insn_state(p);
80   *src = brw_message_reg(msg_reg_nr);
81}
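/* Illustrative sketch (hypothetical caller, not part of this file): a SEND
 * emitter resolves the implied move first, so that on gen6 the payload is
 * really in an MRF before the message descriptor is filled in.  `payload`
 * and the register numbers below are made up for the example.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &payload, 1);   // payload is now m1 on gen6
 *    // ... set up and emit the SEND using payload as src0 ...
 */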
82
83
84static void brw_set_dest(struct brw_compile *p,
85			 struct brw_instruction *insn,
86			 struct brw_reg dest)
87{
88   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
89       dest.file != BRW_MESSAGE_REGISTER_FILE)
90      assert(dest.nr < 128);
91
92   insn->bits1.da1.dest_reg_file = dest.file;
93   insn->bits1.da1.dest_reg_type = dest.type;
94   insn->bits1.da1.dest_address_mode = dest.address_mode;
95
96   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
97      insn->bits1.da1.dest_reg_nr = dest.nr;
98
99      if (insn->header.access_mode == BRW_ALIGN_1) {
100	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
101	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
102	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
103	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
104      }
105      else {
106	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
107	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
108	 /* even ignored in da16, still need to set as '01' */
109	 insn->bits1.da16.dest_horiz_stride = 1;
110      }
111   }
112   else {
113      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
114
115      /* These are different sizes in align1 vs align16:
116       */
117      if (insn->header.access_mode == BRW_ALIGN_1) {
118	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
119	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
120	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
121	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
122      }
123      else {
124	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
125	 /* even ignored in ia16, still need to set as '01' */
126	 insn->bits1.ia16.dest_horiz_stride = 1;
127      }
128   }
129
130   /* NEW: Set the execution size based on dest.width and
131    * whether the compile is compressed:
132    */
133   guess_execution_size(p, insn, dest);
134}
135
136extern int reg_type_size[];
137
138static void
139validate_reg(struct brw_instruction *insn, struct brw_reg reg)
140{
141   int hstride_for_reg[] = {0, 1, 2, 4};
142   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
143   int width_for_reg[] = {1, 2, 4, 8, 16};
144   int execsize_for_reg[] = {1, 2, 4, 8, 16};
145   int width, hstride, vstride, execsize;
146
147   if (reg.file == BRW_IMMEDIATE_VALUE) {
148      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
149       * mean the destination has to be 128-bit aligned and the
150       * destination horiz stride has to be a word.
151       */
152      if (reg.type == BRW_REGISTER_TYPE_V) {
153	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
154		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
155      }
156
157      return;
158   }
159
160   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
161       reg.nr == BRW_ARF_NULL)
162      return;
163
164   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
165   hstride = hstride_for_reg[reg.hstride];
166
167   if (reg.vstride == 0xf) {
168      vstride = -1;
169   } else {
170      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
171      vstride = vstride_for_reg[reg.vstride];
172   }
173
174   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
175   width = width_for_reg[reg.width];
176
177   assert(insn->header.execution_size >= 0 &&
178	  insn->header.execution_size < Elements(execsize_for_reg));
179   execsize = execsize_for_reg[insn->header.execution_size];
180
181   /* Restrictions from 3.3.10: Register Region Restrictions. */
182   /* 3. */
183   assert(execsize >= width);
184
185   /* 4. */
186   if (execsize == width && hstride != 0) {
187      assert(vstride == -1 || vstride == width * hstride);
188   }
189
190   /* 5. */
191   if (execsize == width && hstride == 0) {
192      /* no restriction on vstride. */
193   }
194
195   /* 6. */
196   if (width == 1) {
197      assert(hstride == 0);
198   }
199
200   /* 7. */
201   if (execsize == 1 && width == 1) {
202      assert(hstride == 0);
203      assert(vstride == 0);
204   }
205
206   /* 8. */
207   if (vstride == 0 && hstride == 0) {
208      assert(width == 1);
209   }
210
211   /* 10. Check destination issues. */
212}
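/* Worked example of the restrictions checked above (illustration only): a
 * typical SIMD8 region <8;8,1> decodes to vstride = 8, width = 8,
 * hstride = 1; with execsize = 8 it satisfies restriction 3
 * (execsize >= width) and restriction 4 (vstride == width * hstride).
 * A scalar region <0;1,0> (vstride = 0, width = 1, hstride = 0) is the
 * only shape restrictions 6-8 allow for a single-element read.
 */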
213
214static void brw_set_src0( struct brw_instruction *insn,
215                          struct brw_reg reg )
216{
217   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
218      assert(reg.nr < 128);
219
220   validate_reg(insn, reg);
221
222   insn->bits1.da1.src0_reg_file = reg.file;
223   insn->bits1.da1.src0_reg_type = reg.type;
224   insn->bits2.da1.src0_abs = reg.abs;
225   insn->bits2.da1.src0_negate = reg.negate;
226   insn->bits2.da1.src0_address_mode = reg.address_mode;
227
228   if (reg.file == BRW_IMMEDIATE_VALUE) {
229      insn->bits3.ud = reg.dw1.ud;
230
231      /* Required to set some fields in src1 as well:
232       */
233      insn->bits1.da1.src1_reg_file = 0; /* arf */
234      insn->bits1.da1.src1_reg_type = reg.type;
235   }
236   else
237   {
238      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
239	 if (insn->header.access_mode == BRW_ALIGN_1) {
240	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
241	    insn->bits2.da1.src0_reg_nr = reg.nr;
242	 }
243	 else {
244	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
245	    insn->bits2.da16.src0_reg_nr = reg.nr;
246	 }
247      }
248      else {
249	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
250
251	 if (insn->header.access_mode == BRW_ALIGN_1) {
252	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
253	 }
254	 else {
255	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
256	 }
257      }
258
259      if (insn->header.access_mode == BRW_ALIGN_1) {
260	 if (reg.width == BRW_WIDTH_1 &&
261	     insn->header.execution_size == BRW_EXECUTE_1) {
262	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
263	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
264	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
265	 }
266	 else {
267	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
268	    insn->bits2.da1.src0_width = reg.width;
269	    insn->bits2.da1.src0_vert_stride = reg.vstride;
270	 }
271      }
272      else {
273	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
274	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
275	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
276	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
277
278	 /* This is an oddity of using the same register descriptions
279	  * for align_16 as for align_1:
280	  */
281	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
282	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
283	 else
284	    insn->bits2.da16.src0_vert_stride = reg.vstride;
285      }
286   }
287}
288
289
290void brw_set_src1( struct brw_instruction *insn,
291                   struct brw_reg reg )
292{
293   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
294
295   assert(reg.nr < 128);
296
297   validate_reg(insn, reg);
298
299   insn->bits1.da1.src1_reg_file = reg.file;
300   insn->bits1.da1.src1_reg_type = reg.type;
301   insn->bits3.da1.src1_abs = reg.abs;
302   insn->bits3.da1.src1_negate = reg.negate;
303
304   /* Only src1 can be immediate in two-argument instructions.
305    */
306   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
307
308   if (reg.file == BRW_IMMEDIATE_VALUE) {
309      insn->bits3.ud = reg.dw1.ud;
310   }
311   else {
312      /* This is a hardware restriction, which may or may not be lifted
313       * in the future:
314       */
315      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
316      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
317
318      if (insn->header.access_mode == BRW_ALIGN_1) {
319	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
320	 insn->bits3.da1.src1_reg_nr = reg.nr;
321      }
322      else {
323	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
324	 insn->bits3.da16.src1_reg_nr = reg.nr;
325      }
326
327      if (insn->header.access_mode == BRW_ALIGN_1) {
328	 if (reg.width == BRW_WIDTH_1 &&
329	     insn->header.execution_size == BRW_EXECUTE_1) {
330	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
331	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
332	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
333	 }
334	 else {
335	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
336	    insn->bits3.da1.src1_width = reg.width;
337	    insn->bits3.da1.src1_vert_stride = reg.vstride;
338	 }
339      }
340      else {
341	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
342	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
343	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
344	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
345
346	 /* This is an oddity of using the same register descriptions
347	  * for align_16 as for align_1:
348	  */
349	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
350	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
351	 else
352	    insn->bits3.da16.src1_vert_stride = reg.vstride;
353      }
354   }
355}
356
357
358
359static void brw_set_math_message( struct brw_context *brw,
360				  struct brw_instruction *insn,
361				  GLuint msg_length,
362				  GLuint response_length,
363				  GLuint function,
364				  GLuint integer_type,
365				  GLboolean low_precision,
366				  GLboolean saturate,
367				  GLuint dataType )
368{
369   struct intel_context *intel = &brw->intel;
370   brw_set_src1(insn, brw_imm_d(0));
371
372   if (intel->gen == 5) {
373       insn->bits3.math_gen5.function = function;
374       insn->bits3.math_gen5.int_type = integer_type;
375       insn->bits3.math_gen5.precision = low_precision;
376       insn->bits3.math_gen5.saturate = saturate;
377       insn->bits3.math_gen5.data_type = dataType;
378       insn->bits3.math_gen5.snapshot = 0;
379       insn->bits3.math_gen5.header_present = 0;
380       insn->bits3.math_gen5.response_length = response_length;
381       insn->bits3.math_gen5.msg_length = msg_length;
382       insn->bits3.math_gen5.end_of_thread = 0;
383       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
384       insn->bits2.send_gen5.end_of_thread = 0;
385   } else {
386       insn->bits3.math.function = function;
387       insn->bits3.math.int_type = integer_type;
388       insn->bits3.math.precision = low_precision;
389       insn->bits3.math.saturate = saturate;
390       insn->bits3.math.data_type = dataType;
391       insn->bits3.math.response_length = response_length;
392       insn->bits3.math.msg_length = msg_length;
393       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
394       insn->bits3.math.end_of_thread = 0;
395   }
396}
397
398
399static void brw_set_ff_sync_message(struct brw_context *brw,
400				    struct brw_instruction *insn,
401				    GLboolean allocate,
402				    GLuint response_length,
403				    GLboolean end_of_thread)
404{
405	struct intel_context *intel = &brw->intel;
406	brw_set_src1(insn, brw_imm_d(0));
407
408	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
409	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
410	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
411	insn->bits3.urb_gen5.allocate = allocate;
412	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
413	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
414	insn->bits3.urb_gen5.header_present = 1;
415	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
416	insn->bits3.urb_gen5.msg_length = 1;
417	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
418	if (intel->gen >= 6) {
419	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
420	} else {
421	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
422	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
423	}
424}
425
426static void brw_set_urb_message( struct brw_context *brw,
427				 struct brw_instruction *insn,
428				 GLboolean allocate,
429				 GLboolean used,
430				 GLuint msg_length,
431				 GLuint response_length,
432				 GLboolean end_of_thread,
433				 GLboolean complete,
434				 GLuint offset,
435				 GLuint swizzle_control )
436{
437    struct intel_context *intel = &brw->intel;
438    brw_set_src1(insn, brw_imm_d(0));
439
440    if (intel->gen >= 5) {
441        insn->bits3.urb_gen5.opcode = 0;	/* ? */
442        insn->bits3.urb_gen5.offset = offset;
443        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
444        insn->bits3.urb_gen5.allocate = allocate;
445        insn->bits3.urb_gen5.used = used;	/* ? */
446        insn->bits3.urb_gen5.complete = complete;
447        insn->bits3.urb_gen5.header_present = 1;
448        insn->bits3.urb_gen5.response_length = response_length;
449        insn->bits3.urb_gen5.msg_length = msg_length;
450        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
451	if (intel->gen >= 6) {
452	   /* For SNB, the SFID bits moved to the condmod bits, and
453	    * EOT stayed in bits3 above.  Does the EOT bit setting
454	    * below on Ironlake even do anything?
455	    */
456	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
457	} else {
458	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
459	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
460	}
461    } else {
462        insn->bits3.urb.opcode = 0;	/* ? */
463        insn->bits3.urb.offset = offset;
464        insn->bits3.urb.swizzle_control = swizzle_control;
465        insn->bits3.urb.allocate = allocate;
466        insn->bits3.urb.used = used;	/* ? */
467        insn->bits3.urb.complete = complete;
468        insn->bits3.urb.response_length = response_length;
469        insn->bits3.urb.msg_length = msg_length;
470        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
471        insn->bits3.urb.end_of_thread = end_of_thread;
472    }
473}
474
475static void brw_set_dp_write_message( struct brw_context *brw,
476				      struct brw_instruction *insn,
477				      GLuint binding_table_index,
478				      GLuint msg_control,
479				      GLuint msg_type,
480				      GLuint msg_length,
481				      GLboolean header_present,
482				      GLuint pixel_scoreboard_clear,
483				      GLuint response_length,
484				      GLuint end_of_thread,
485				      GLuint send_commit_msg)
486{
487   struct intel_context *intel = &brw->intel;
488   brw_set_src1(insn, brw_imm_ud(0));
489
490   if (intel->gen >= 6) {
491       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
492       insn->bits3.dp_render_cache.msg_control = msg_control;
493       insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
494       insn->bits3.dp_render_cache.msg_type = msg_type;
495       insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
496       insn->bits3.dp_render_cache.header_present = header_present;
497       insn->bits3.dp_render_cache.response_length = response_length;
498       insn->bits3.dp_render_cache.msg_length = msg_length;
499       insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
500       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
501       /* XXX is the send_gen5 setup below really needed on gen6? */
502       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
503       insn->bits2.send_gen5.end_of_thread = end_of_thread;
504   } else if (intel->gen == 5) {
505       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
506       insn->bits3.dp_write_gen5.msg_control = msg_control;
507       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
508       insn->bits3.dp_write_gen5.msg_type = msg_type;
509       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
510       insn->bits3.dp_write_gen5.header_present = header_present;
511       insn->bits3.dp_write_gen5.response_length = response_length;
512       insn->bits3.dp_write_gen5.msg_length = msg_length;
513       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
514       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
515       insn->bits2.send_gen5.end_of_thread = end_of_thread;
516   } else {
517       insn->bits3.dp_write.binding_table_index = binding_table_index;
518       insn->bits3.dp_write.msg_control = msg_control;
519       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
520       insn->bits3.dp_write.msg_type = msg_type;
521       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
522       insn->bits3.dp_write.response_length = response_length;
523       insn->bits3.dp_write.msg_length = msg_length;
524       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
525       insn->bits3.dp_write.end_of_thread = end_of_thread;
526   }
527}
528
529static void
530brw_set_dp_read_message(struct brw_context *brw,
531			struct brw_instruction *insn,
532			GLuint binding_table_index,
533			GLuint msg_control,
534			GLuint msg_type,
535			GLuint target_cache,
536			GLuint msg_length,
537			GLuint response_length)
538{
539   struct intel_context *intel = &brw->intel;
540   brw_set_src1(insn, brw_imm_d(0));
541
542   if (intel->gen >= 6) {
543       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
544       insn->bits3.dp_render_cache.msg_control = msg_control;
545       insn->bits3.dp_render_cache.pixel_scoreboard_clear = 0;
546       insn->bits3.dp_render_cache.msg_type = msg_type;
547       insn->bits3.dp_render_cache.send_commit_msg = 0;
548       insn->bits3.dp_render_cache.header_present = 1;
549       insn->bits3.dp_render_cache.response_length = response_length;
550       insn->bits3.dp_render_cache.msg_length = msg_length;
551       insn->bits3.dp_render_cache.end_of_thread = 0;
552       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_READ;
553       /* XXX is the send_gen5 setup below really needed on gen6? */
554       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
555       insn->bits2.send_gen5.end_of_thread = 0;
556   } else if (intel->gen == 5) {
557       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
558       insn->bits3.dp_read_gen5.msg_control = msg_control;
559       insn->bits3.dp_read_gen5.msg_type = msg_type;
560       insn->bits3.dp_read_gen5.target_cache = target_cache;
561       insn->bits3.dp_read_gen5.header_present = 1;
562       insn->bits3.dp_read_gen5.response_length = response_length;
563       insn->bits3.dp_read_gen5.msg_length = msg_length;
564       insn->bits3.dp_read_gen5.pad1 = 0;
565       insn->bits3.dp_read_gen5.end_of_thread = 0;
566       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
567       insn->bits2.send_gen5.end_of_thread = 0;
568   } else if (intel->is_g4x) {
569       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
570       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
571       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
572       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
573       insn->bits3.dp_read_g4x.response_length = response_length;  /*16:19*/
574       insn->bits3.dp_read_g4x.msg_length = msg_length;  /*20:23*/
575       insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
576       insn->bits3.dp_read_g4x.pad1 = 0;
577       insn->bits3.dp_read_g4x.end_of_thread = 0;
578   } else {
579       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
580       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
581       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
582       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
583       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
584       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
585       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
586       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
587       insn->bits3.dp_read.end_of_thread = 0;  /*31*/
588   }
589}
590
591static void brw_set_sampler_message(struct brw_context *brw,
592                                    struct brw_instruction *insn,
593                                    GLuint binding_table_index,
594                                    GLuint sampler,
595                                    GLuint msg_type,
596                                    GLuint response_length,
597                                    GLuint msg_length,
598                                    GLboolean eot,
599                                    GLuint header_present,
600                                    GLuint simd_mode)
601{
602   struct intel_context *intel = &brw->intel;
603   assert(eot == 0);
604   brw_set_src1(insn, brw_imm_d(0));
605
606   if (intel->gen >= 5) {
607      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
608      insn->bits3.sampler_gen5.sampler = sampler;
609      insn->bits3.sampler_gen5.msg_type = msg_type;
610      insn->bits3.sampler_gen5.simd_mode = simd_mode;
611      insn->bits3.sampler_gen5.header_present = header_present;
612      insn->bits3.sampler_gen5.response_length = response_length;
613      insn->bits3.sampler_gen5.msg_length = msg_length;
614      insn->bits3.sampler_gen5.end_of_thread = eot;
615      if (intel->gen >= 6)
616	  insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
617      else {
618	  insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
619	  insn->bits2.send_gen5.end_of_thread = eot;
620      }
621   } else if (intel->is_g4x) {
622      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
623      insn->bits3.sampler_g4x.sampler = sampler;
624      insn->bits3.sampler_g4x.msg_type = msg_type;
625      insn->bits3.sampler_g4x.response_length = response_length;
626      insn->bits3.sampler_g4x.msg_length = msg_length;
627      insn->bits3.sampler_g4x.end_of_thread = eot;
628      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
629   } else {
630      insn->bits3.sampler.binding_table_index = binding_table_index;
631      insn->bits3.sampler.sampler = sampler;
632      insn->bits3.sampler.msg_type = msg_type;
633      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
634      insn->bits3.sampler.response_length = response_length;
635      insn->bits3.sampler.msg_length = msg_length;
636      insn->bits3.sampler.end_of_thread = eot;
637      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
638   }
639}
640
641
642
643static struct brw_instruction *next_insn( struct brw_compile *p,
644					  GLuint opcode )
645{
646   struct brw_instruction *insn;
647
648   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
649
650   insn = &p->store[p->nr_insn++];
651   memcpy(insn, p->current, sizeof(*insn));
652
653   /* Reset this one-shot flag:
654    */
655
656   if (p->current->header.destreg__conditionalmod) {
657      p->current->header.destreg__conditionalmod = 0;
658      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
659   }
660
661   insn->header.opcode = opcode;
662   return insn;
663}
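/* Illustrative sketch (hypothetical usage, not part of this file): every
 * instruction emitted through next_insn() starts as a copy of p->current,
 * so state set via the brw_set_* helpers applies to subsequent emits until
 * it is changed or popped.  `dst` and `src` are made-up registers.
 *
 *    brw_push_insn_state(p);
 *    brw_set_mask_control(p, BRW_MASK_DISABLE);
 *    brw_MOV(p, dst, src);        // emitted with writemasking disabled
 *    brw_pop_insn_state(p);       // restore the previous default state
 */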
664
665
666static struct brw_instruction *brw_alu1( struct brw_compile *p,
667					 GLuint opcode,
668					 struct brw_reg dest,
669					 struct brw_reg src )
670{
671   struct brw_instruction *insn = next_insn(p, opcode);
672   brw_set_dest(p, insn, dest);
673   brw_set_src0(insn, src);
674   return insn;
675}
676
677static struct brw_instruction *brw_alu2(struct brw_compile *p,
678					GLuint opcode,
679					struct brw_reg dest,
680					struct brw_reg src0,
681					struct brw_reg src1 )
682{
683   struct brw_instruction *insn = next_insn(p, opcode);
684   brw_set_dest(p, insn, dest);
685   brw_set_src0(insn, src0);
686   brw_set_src1(insn, src1);
687   return insn;
688}
689
690
691/***********************************************************************
692 * Convenience routines.
693 */
694#define ALU1(OP)					\
695struct brw_instruction *brw_##OP(struct brw_compile *p,	\
696	      struct brw_reg dest,			\
697	      struct brw_reg src0)   			\
698{							\
699   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
700}
701
702#define ALU2(OP)					\
703struct brw_instruction *brw_##OP(struct brw_compile *p,	\
704	      struct brw_reg dest,			\
705	      struct brw_reg src0,			\
706	      struct brw_reg src1)   			\
707{							\
708   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
709}
710
711/* Rounding operations (other than RNDD) require two instructions - the first
712 * stores a rounded value (possibly the wrong way) in the dest register, but
713 * also sets a per-channel "increment bit" in the flag register.  A predicated
714 * add of 1.0 fixes dest to contain the desired result.
715 */
716#define ROUND(OP)							      \
717void brw_##OP(struct brw_compile *p,					      \
718	      struct brw_reg dest,					      \
719	      struct brw_reg src)					      \
720{									      \
721   struct brw_instruction *rnd, *add;					      \
722   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
723   brw_set_dest(p, rnd, dest);						      \
724   brw_set_src0(rnd, src);						      \
725   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
726									      \
727   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
728   add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
729}
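/* Illustrative expansion (approximate assembly, for reference only):
 * ROUND(RNDE) defines brw_RNDE(), which emits a pair equivalent to
 *
 *    rnde (8) dst src            // also sets the per-channel increment flag
 *    (+f0) add (8) dst dst 1.0f  // predicated fix-up of the affected channels
 */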
730
731
732ALU1(MOV)
733ALU2(SEL)
734ALU1(NOT)
735ALU2(AND)
736ALU2(OR)
737ALU2(XOR)
738ALU2(SHR)
739ALU2(SHL)
740ALU2(RSR)
741ALU2(RSL)
742ALU2(ASR)
743ALU1(FRC)
744ALU1(RNDD)
745ALU2(MAC)
746ALU2(MACH)
747ALU1(LZD)
748ALU2(DP4)
749ALU2(DPH)
750ALU2(DP3)
751ALU2(DP2)
752ALU2(LINE)
753ALU2(PLN)
754
755
756ROUND(RNDZ)
757ROUND(RNDE)
758
759
760struct brw_instruction *brw_ADD(struct brw_compile *p,
761				struct brw_reg dest,
762				struct brw_reg src0,
763				struct brw_reg src1)
764{
765   /* 6.2.2: add */
766   if (src0.type == BRW_REGISTER_TYPE_F ||
767       (src0.file == BRW_IMMEDIATE_VALUE &&
768	src0.type == BRW_REGISTER_TYPE_VF)) {
769      assert(src1.type != BRW_REGISTER_TYPE_UD);
770      assert(src1.type != BRW_REGISTER_TYPE_D);
771   }
772
773   if (src1.type == BRW_REGISTER_TYPE_F ||
774       (src1.file == BRW_IMMEDIATE_VALUE &&
775	src1.type == BRW_REGISTER_TYPE_VF)) {
776      assert(src0.type != BRW_REGISTER_TYPE_UD);
777      assert(src0.type != BRW_REGISTER_TYPE_D);
778   }
779
780   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
781}
782
783struct brw_instruction *brw_MUL(struct brw_compile *p,
784				struct brw_reg dest,
785				struct brw_reg src0,
786				struct brw_reg src1)
787{
788   /* 6.32.38: mul */
789   if (src0.type == BRW_REGISTER_TYPE_D ||
790       src0.type == BRW_REGISTER_TYPE_UD ||
791       src1.type == BRW_REGISTER_TYPE_D ||
792       src1.type == BRW_REGISTER_TYPE_UD) {
793      assert(dest.type != BRW_REGISTER_TYPE_F);
794   }
795
796   if (src0.type == BRW_REGISTER_TYPE_F ||
797       (src0.file == BRW_IMMEDIATE_VALUE &&
798	src0.type == BRW_REGISTER_TYPE_VF)) {
799      assert(src1.type != BRW_REGISTER_TYPE_UD);
800      assert(src1.type != BRW_REGISTER_TYPE_D);
801   }
802
803   if (src1.type == BRW_REGISTER_TYPE_F ||
804       (src1.file == BRW_IMMEDIATE_VALUE &&
805	src1.type == BRW_REGISTER_TYPE_VF)) {
806      assert(src0.type != BRW_REGISTER_TYPE_UD);
807      assert(src0.type != BRW_REGISTER_TYPE_D);
808   }
809
810   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
811	  src0.nr != BRW_ARF_ACCUMULATOR);
812   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
813	  src1.nr != BRW_ARF_ACCUMULATOR);
814
815   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
816}
817
818
819void brw_NOP(struct brw_compile *p)
820{
821   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
822   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
823   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
824   brw_set_src1(insn, brw_imm_ud(0x0));
825}
826
827
828
829
830
831/***********************************************************************
832 * Comparisons, if/else/endif
833 */
834
835struct brw_instruction *brw_JMPI(struct brw_compile *p,
836                                 struct brw_reg dest,
837                                 struct brw_reg src0,
838                                 struct brw_reg src1)
839{
840   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
841
842   insn->header.execution_size = 1;
843   insn->header.compression_control = BRW_COMPRESSION_NONE;
844   insn->header.mask_control = BRW_MASK_DISABLE;
845
846   p->current->header.predicate_control = BRW_PREDICATE_NONE;
847
848   return insn;
849}
850
851/* EU takes the value from the flag register and pushes it onto some
852 * sort of a stack (presumably merging with any flag value already on
853 * the stack).  Within an if block, the flags at the top of the stack
854 * control execution on each channel of the unit, e.g. on each of the
855 * 16 pixel values in our wm programs.
856 *
857 * When the matching 'else' instruction is reached (presumably by
858 * countdown of the instruction count patched in by our ELSE/ENDIF
859 * functions), the relevant flags are inverted.
860 *
861 * When the matching 'endif' instruction is reached, the flags are
862 * popped off.  If the stack is now empty, normal execution resumes.
863 *
864 * No attempt is made to deal with stack overflow (14 elements?).
865 */
866struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
867{
868   struct intel_context *intel = &p->brw->intel;
869   struct brw_instruction *insn;
870
871   if (p->single_program_flow) {
872      assert(execute_size == BRW_EXECUTE_1);
873
874      insn = next_insn(p, BRW_OPCODE_ADD);
875      insn->header.predicate_inverse = 1;
876   } else {
877      insn = next_insn(p, BRW_OPCODE_IF);
878   }
879
880   /* Override the defaults for this instruction:
881    */
882   if (intel->gen < 6) {
883      brw_set_dest(p, insn, brw_ip_reg());
884      brw_set_src0(insn, brw_ip_reg());
885      brw_set_src1(insn, brw_imm_d(0x0));
886   } else {
887      brw_set_dest(p, insn, brw_imm_w(0));
888      insn->bits1.branch_gen6.jump_count = 0;
889      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
890      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
891   }
892
893   insn->header.execution_size = execute_size;
894   insn->header.compression_control = BRW_COMPRESSION_NONE;
895   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
896   insn->header.mask_control = BRW_MASK_ENABLE;
897   if (!p->single_program_flow)
898       insn->header.thread_control = BRW_THREAD_SWITCH;
899
900   p->current->header.predicate_control = BRW_PREDICATE_NONE;
901
902   return insn;
903}
904
905struct brw_instruction *
906gen6_IF(struct brw_compile *p, uint32_t conditional,
907	struct brw_reg src0, struct brw_reg src1)
908{
909   struct brw_instruction *insn;
910
911   insn = next_insn(p, BRW_OPCODE_IF);
912
913   brw_set_dest(p, insn, brw_imm_w(0));
914   insn->header.execution_size = BRW_EXECUTE_8;
915   insn->bits1.branch_gen6.jump_count = 0;
916   brw_set_src0(insn, src0);
917   brw_set_src1(insn, src1);
918
919   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
920   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
921   insn->header.destreg__conditionalmod = conditional;
922
923   if (!p->single_program_flow)
924       insn->header.thread_control = BRW_THREAD_SWITCH;
925
926   return insn;
927}
928
929struct brw_instruction *brw_ELSE(struct brw_compile *p,
930				 struct brw_instruction *if_insn)
931{
932   struct intel_context *intel = &p->brw->intel;
933   struct brw_instruction *insn;
934   GLuint br = 1;
935
936   /* The jump count is in units of 64-bit chunks, so one 128-bit
937      instruction requires 2 chunks. */
938   if (intel->gen >= 5)
939      br = 2;
940
941   if (p->single_program_flow) {
942      insn = next_insn(p, BRW_OPCODE_ADD);
943   } else {
944      insn = next_insn(p, BRW_OPCODE_ELSE);
945   }
946
947   if (intel->gen < 6) {
948      brw_set_dest(p, insn, brw_ip_reg());
949      brw_set_src0(insn, brw_ip_reg());
950      brw_set_src1(insn, brw_imm_d(0x0));
951   } else {
952      brw_set_dest(p, insn, brw_imm_w(0));
953      insn->bits1.branch_gen6.jump_count = 0;
954      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
955      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
956   }
957
958   insn->header.compression_control = BRW_COMPRESSION_NONE;
959   insn->header.execution_size = if_insn->header.execution_size;
960   insn->header.mask_control = BRW_MASK_ENABLE;
961   if (!p->single_program_flow)
962       insn->header.thread_control = BRW_THREAD_SWITCH;
963
964   /* Patch the if instruction to point at this instruction.
965    */
966   if (p->single_program_flow) {
967      assert(if_insn->header.opcode == BRW_OPCODE_ADD);
968
969      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
970   } else {
971      assert(if_insn->header.opcode == BRW_OPCODE_IF);
972
973      if (intel->gen < 6) {
974	 if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
975	 if_insn->bits3.if_else.pop_count = 0;
976	 if_insn->bits3.if_else.pad0 = 0;
977      } else {
978	 if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1);
979      }
980   }
981
982   return insn;
983}
984
985void brw_ENDIF(struct brw_compile *p,
986	       struct brw_instruction *patch_insn)
987{
988   struct intel_context *intel = &p->brw->intel;
989   GLuint br = 1;
990
991   if (intel->gen >= 5)
992      br = 2;
993
994   if (p->single_program_flow) {
995      /* In single program flow mode, there's no need to execute an ENDIF,
996       * since we don't need to do any stack operations, and if we're executing
997       * currently, we want to just continue executing.
998       */
999      struct brw_instruction *next = &p->store[p->nr_insn];
1000
1001      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
1002
1003      patch_insn->bits3.ud = (next - patch_insn) * 16;
1004   } else {
1005      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
1006
1007      if (intel->gen < 6) {
1008	 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1009	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1010	 brw_set_src1(insn, brw_imm_d(0x0));
1011      } else {
1012	 brw_set_dest(p, insn, brw_imm_w(0));
1013	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1014	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1015      }
1016
1017      insn->header.compression_control = BRW_COMPRESSION_NONE;
1018      insn->header.execution_size = patch_insn->header.execution_size;
1019      insn->header.mask_control = BRW_MASK_ENABLE;
1020      insn->header.thread_control = BRW_THREAD_SWITCH;
1021
1022      if (intel->gen < 6)
1023	 assert(patch_insn->bits3.if_else.jump_count == 0);
1024      else
1025	 assert(patch_insn->bits1.branch_gen6.jump_count == 0);
1026
1027      /* Patch the if or else instructions to point at this or the next
1028       * instruction respectively.
1029       */
1030      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
1031	 if (intel->gen < 6) {
1032	    /* Turn it into an IFF, which means no mask stack operations for
1033	     * all-false and jumping past the ENDIF.
1034	     */
1035	    patch_insn->header.opcode = BRW_OPCODE_IFF;
1036	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
1037	    patch_insn->bits3.if_else.pop_count = 0;
1038	    patch_insn->bits3.if_else.pad0 = 0;
1039	 } else {
1040	    /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1041	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1042	 }
1043      } else {
1044	 assert(patch_insn->header.opcode == BRW_OPCODE_ELSE);
1045	 if (intel->gen < 6) {
1046	    /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1047	     * matching ENDIF.
1048	     */
1049	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
1050	    patch_insn->bits3.if_else.pop_count = 1;
1051	    patch_insn->bits3.if_else.pad0 = 0;
1052	 } else {
1053	    /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1054	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1055	 }
1056      }
1057
1058      /* Also pop item off the stack in the endif instruction:
1059       */
1060      if (intel->gen < 6) {
1061	 insn->bits3.if_else.jump_count = 0;
1062	 insn->bits3.if_else.pop_count = 1;
1063	 insn->bits3.if_else.pad0 = 0;
1064      } else {
1065	 insn->bits1.branch_gen6.jump_count = 2;
1066      }
1067   }
1068}
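/* Illustrative sketch (hypothetical usage, not part of this file): a typical
 * non-uniform if/else block is emitted as a CMP to set the flag, then the
 * IF/ELSE/ENDIF trio, with ELSE/ENDIF patching the earlier jump counts.
 * `x`, `then_val`, `else_val` and `dst` are made up for the example.
 *
 *    struct brw_instruction *if_insn, *else_insn;
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, brw_imm_f(0.0f));
 *    if_insn = brw_IF(p, BRW_EXECUTE_8);
 *       brw_MOV(p, dst, then_val);
 *    else_insn = brw_ELSE(p, if_insn);
 *       brw_MOV(p, dst, else_val);
 *    brw_ENDIF(p, else_insn);
 */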
1069
1070struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1071{
1072   struct intel_context *intel = &p->brw->intel;
1073   struct brw_instruction *insn;
1074
1075   insn = next_insn(p, BRW_OPCODE_BREAK);
1076   if (intel->gen >= 6) {
1077      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1078      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1079      brw_set_src1(insn, brw_imm_d(0x0));
1080   } else {
1081      brw_set_dest(p, insn, brw_ip_reg());
1082      brw_set_src0(insn, brw_ip_reg());
1083      brw_set_src1(insn, brw_imm_d(0x0));
1084      insn->bits3.if_else.pad0 = 0;
1085      insn->bits3.if_else.pop_count = pop_count;
1086   }
1087   insn->header.compression_control = BRW_COMPRESSION_NONE;
1088   insn->header.execution_size = BRW_EXECUTE_8;
1089
1090   return insn;
1091}
1092
1093struct brw_instruction *gen6_CONT(struct brw_compile *p,
1094				  struct brw_instruction *do_insn)
1095{
1096   struct brw_instruction *insn;
1097   int br = 2;
1098
1099   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1100   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1101   brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1102   brw_set_dest(p, insn, brw_ip_reg());
1103   brw_set_src0(insn, brw_ip_reg());
1104   brw_set_src1(insn, brw_imm_d(0x0));
1105
1106   insn->bits3.break_cont.uip = br * (do_insn - insn);
1107
1108   insn->header.compression_control = BRW_COMPRESSION_NONE;
1109   insn->header.execution_size = BRW_EXECUTE_8;
1110   return insn;
1111}
1112
1113struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1114{
1115   struct brw_instruction *insn;
1116   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1117   brw_set_dest(p, insn, brw_ip_reg());
1118   brw_set_src0(insn, brw_ip_reg());
1119   brw_set_src1(insn, brw_imm_d(0x0));
1120   insn->header.compression_control = BRW_COMPRESSION_NONE;
1121   insn->header.execution_size = BRW_EXECUTE_8;
1122   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1123   insn->bits3.if_else.pad0 = 0;
1124   insn->bits3.if_else.pop_count = pop_count;
1125   return insn;
1126}
1127
1128/* DO/WHILE loop:
1129 *
1130 * The DO/WHILE is just an unterminated loop -- break or continue are
1131 * used for control within the loop.  We have a few ways they can be
1132 * done.
1133 *
1134 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1135 * jip and no DO instruction.
1136 *
1137 * For non-uniform control flow pre-gen6, there's a DO instruction to
1138 * push the mask, and a WHILE to jump back, and BREAK to get out and
1139 * pop the mask.
1140 *
1141 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1142 * just points back to the first instruction of the loop.
1143 */
1144struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1145{
1146   struct intel_context *intel = &p->brw->intel;
1147
1148   if (intel->gen >= 6 || p->single_program_flow) {
1149      return &p->store[p->nr_insn];
1150   } else {
1151      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1152
1153      /* Override the defaults for this instruction:
1154       */
1155      brw_set_dest(p, insn, brw_null_reg());
1156      brw_set_src0(insn, brw_null_reg());
1157      brw_set_src1(insn, brw_null_reg());
1158
1159      insn->header.compression_control = BRW_COMPRESSION_NONE;
1160      insn->header.execution_size = execute_size;
1161      insn->header.predicate_control = BRW_PREDICATE_NONE;
1162      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1163      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1164
1165      return insn;
1166   }
1167}
1168
1169
1170
1171struct brw_instruction *brw_WHILE(struct brw_compile *p,
1172                                  struct brw_instruction *do_insn)
1173{
1174   struct intel_context *intel = &p->brw->intel;
1175   struct brw_instruction *insn;
1176   GLuint br = 1;
1177
1178   if (intel->gen >= 5)
1179      br = 2;
1180
1181   if (intel->gen >= 6) {
1182      insn = next_insn(p, BRW_OPCODE_WHILE);
1183
1184      brw_set_dest(p, insn, brw_imm_w(0));
1185      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1186      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1187      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1188
1189      insn->header.execution_size = do_insn->header.execution_size;
1190      assert(insn->header.execution_size == BRW_EXECUTE_8);
1191   } else {
1192      if (p->single_program_flow) {
1193	 insn = next_insn(p, BRW_OPCODE_ADD);
1194
1195	 brw_set_dest(p, insn, brw_ip_reg());
1196	 brw_set_src0(insn, brw_ip_reg());
1197	 brw_set_src1(insn, brw_imm_d((do_insn - insn) * 16));
1198	 insn->header.execution_size = BRW_EXECUTE_1;
1199      } else {
1200	 insn = next_insn(p, BRW_OPCODE_WHILE);
1201
1202	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1203
1204	 brw_set_dest(p, insn, brw_ip_reg());
1205	 brw_set_src0(insn, brw_ip_reg());
1206	 brw_set_src1(insn, brw_imm_d(0));
1207
1208	 insn->header.execution_size = do_insn->header.execution_size;
1209	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1210	 insn->bits3.if_else.pop_count = 0;
1211	 insn->bits3.if_else.pad0 = 0;
1212      }
1213   }
1214   insn->header.compression_control = BRW_COMPRESSION_NONE;
1215   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1216
1217   return insn;
1218}
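/* Illustrative sketch (hypothetical usage, not part of this file): a
 * non-uniform loop pairs brw_DO() with brw_WHILE(), with brw_BREAK()/
 * brw_CONT() inside the body as needed.  The loop body here is made up.
 *
 *    struct brw_instruction *do_insn = brw_DO(p, BRW_EXECUTE_8);
 *       // ... loop body; e.g. brw_BREAK(p, 0) to leave the loop ...
 *    brw_WHILE(p, do_insn);
 */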
1219
1220
1221/* FORWARD JUMPS:
1222 */
1223void brw_land_fwd_jump(struct brw_compile *p,
1224		       struct brw_instruction *jmp_insn)
1225{
1226   struct intel_context *intel = &p->brw->intel;
1227   struct brw_instruction *landing = &p->store[p->nr_insn];
1228   GLuint jmpi = 1;
1229
1230   if (intel->gen >= 5)
1231       jmpi = 2;
1232
1233   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1234   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1235
1236   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1237}
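/* Illustrative sketch (hypothetical usage, not part of this file): emit a
 * forward JMPI with a placeholder offset, then patch it once the landing
 * point is known.
 *
 *    struct brw_instruction *jmp =
 *       brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    // ... instructions to be skipped over ...
 *    brw_land_fwd_jump(p, jmp);
 */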
1238
1239
1240
1241/* To integrate with the above, it makes sense that the comparison
1242 * instruction should populate the flag register.  It might be simpler
1243 * just to use the flag reg for most WM tasks?
1244 */
1245void brw_CMP(struct brw_compile *p,
1246	     struct brw_reg dest,
1247	     GLuint conditional,
1248	     struct brw_reg src0,
1249	     struct brw_reg src1)
1250{
1251   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1252
1253   insn->header.destreg__conditionalmod = conditional;
1254   brw_set_dest(p, insn, dest);
1255   brw_set_src0(insn, src0);
1256   brw_set_src1(insn, src1);
1257
1258/*    guess_execution_size(insn, src0); */
1259
1260
1261   /* Make it so that future instructions will use the computed flag
1262    * value until brw_set_predicate_control_flag_value() is called
1263    * again.
1264    */
1265   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1266       dest.nr == 0) {
1267      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1268      p->flag_value = 0xff;
1269   }
1270}
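/* Illustrative sketch (hypothetical usage, not part of this file): comparing
 * into the null register writes only the flag, and brw_CMP() above then arms
 * BRW_PREDICATE_NORMAL on p->current, so subsequently emitted instructions
 * are predicated.  `dst`, `a` and `b` are made up.
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, a, brw_imm_f(0.0f));
 *    brw_MOV(p, dst, b);    // executes only on channels where a >= 0
 */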
1271
1272/* Issue a 'wait' instruction on notification register n1; the host can
1273   program MMIO to wake the thread back up. */
1274void brw_WAIT (struct brw_compile *p)
1275{
1276   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1277   struct brw_reg src = brw_notification_1_reg();
1278
1279   brw_set_dest(p, insn, src);
1280   brw_set_src0(insn, src);
1281   brw_set_src1(insn, brw_null_reg());
1282   insn->header.execution_size = 0; /* must be BRW_EXECUTE_1 */
1283   insn->header.predicate_control = 0;
1284   insn->header.compression_control = 0;
1285}
1286
1287
1288/***********************************************************************
1289 * Helpers for the various SEND message types:
1290 */
1291
1292/** Extended math function, float[8].
1293 */
1294void brw_math( struct brw_compile *p,
1295	       struct brw_reg dest,
1296	       GLuint function,
1297	       GLuint saturate,
1298	       GLuint msg_reg_nr,
1299	       struct brw_reg src,
1300	       GLuint data_type,
1301	       GLuint precision )
1302{
1303   struct intel_context *intel = &p->brw->intel;
1304
1305   if (intel->gen >= 6) {
1306      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1307
1308      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1309      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1310
1311      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1312      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1313
1314      /* Source modifiers are ignored for extended math instructions. */
1315      assert(!src.negate);
1316      assert(!src.abs);
1317
1318      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1319	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1320	 assert(src.type == BRW_REGISTER_TYPE_F);
1321      }
1322
1323      /* Math is the same ISA format as other opcodes, except that CondModifier
1324       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1325       */
1326      insn->header.destreg__conditionalmod = function;
1327      insn->header.saturate = saturate;
1328
1329      brw_set_dest(p, insn, dest);
1330      brw_set_src0(insn, src);
1331      brw_set_src1(insn, brw_null_reg());
1332   } else {
1333      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1334      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1335      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1336      /* Example code doesn't set predicate_control for send
1337       * instructions.
1338       */
1339      insn->header.predicate_control = 0;
1340      insn->header.destreg__conditionalmod = msg_reg_nr;
1341
1342      brw_set_dest(p, insn, dest);
1343      brw_set_src0(insn, src);
1344      brw_set_math_message(p->brw,
1345			   insn,
1346			   msg_length, response_length,
1347			   function,
1348			   BRW_MATH_INTEGER_UNSIGNED,
1349			   precision,
1350			   saturate,
1351			   data_type);
1352   }
1353}
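/* Illustrative sketch (hypothetical usage, not part of this file): a SIMD8
 * reciprocal of g2 into g4.  On pre-gen6 this becomes a SEND to the math box
 * through message register m2; on gen6 it becomes a MATH instruction and the
 * msg_reg_nr argument is unused.  Register numbers are made up.
 *
 *    brw_math(p, brw_vec8_grf(4, 0),
 *             BRW_MATH_FUNCTION_INV,
 *             BRW_MATH_SATURATE_NONE,
 *             2,
 *             brw_vec8_grf(2, 0),
 *             BRW_MATH_DATA_VECTOR,
 *             BRW_MATH_PRECISION_FULL);
 */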
1354
1355/** Extended math function, float[8].
1356 */
1357void brw_math2(struct brw_compile *p,
1358	       struct brw_reg dest,
1359	       GLuint function,
1360	       struct brw_reg src0,
1361	       struct brw_reg src1)
1362{
1363   struct intel_context *intel = &p->brw->intel;
1364   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1365
1366   assert(intel->gen >= 6);
1367   (void) intel;
1368
1369
1370   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1371   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1372   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1373
1374   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1375   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1376   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1377
1378   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1379       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1380      assert(src0.type == BRW_REGISTER_TYPE_F);
1381      assert(src1.type == BRW_REGISTER_TYPE_F);
1382   }
1383
1384   /* Source modifiers are ignored for extended math instructions. */
1385   assert(!src0.negate);
1386   assert(!src0.abs);
1387   assert(!src1.negate);
1388   assert(!src1.abs);
1389
1390   /* Math is the same ISA format as other opcodes, except that CondModifier
1391    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1392    */
1393   insn->header.destreg__conditionalmod = function;
1394
1395   brw_set_dest(p, insn, dest);
1396   brw_set_src0(insn, src0);
1397   brw_set_src1(insn, src1);
1398}
1399
1400/**
1401 * Extended math function, float[16].
1402 * Uses two SEND instructions pre-gen6; on gen6 this is a single MATH instruction.
1403 */
1404void brw_math_16( struct brw_compile *p,
1405		  struct brw_reg dest,
1406		  GLuint function,
1407		  GLuint saturate,
1408		  GLuint msg_reg_nr,
1409		  struct brw_reg src,
1410		  GLuint precision )
1411{
1412   struct intel_context *intel = &p->brw->intel;
1413   struct brw_instruction *insn;
1414   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1415   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1416
1417   if (intel->gen >= 6) {
1418      insn = next_insn(p, BRW_OPCODE_MATH);
1419
1420      /* Math is the same ISA format as other opcodes, except that CondModifier
1421       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1422       */
1423      insn->header.destreg__conditionalmod = function;
1424      insn->header.saturate = saturate;
1425
1426      /* Source modifiers are ignored for extended math instructions. */
1427      assert(!src.negate);
1428      assert(!src.abs);
1429
1430      brw_set_dest(p, insn, dest);
1431      brw_set_src0(insn, src);
1432      brw_set_src1(insn, brw_null_reg());
1433      return;
1434   }
1435
1436   /* First instruction:
1437    */
1438   brw_push_insn_state(p);
1439   brw_set_predicate_control_flag_value(p, 0xff);
1440   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1441
1442   insn = next_insn(p, BRW_OPCODE_SEND);
1443   insn->header.destreg__conditionalmod = msg_reg_nr;
1444
1445   brw_set_dest(p, insn, dest);
1446   brw_set_src0(insn, src);
1447   brw_set_math_message(p->brw,
1448			insn,
1449			msg_length, response_length,
1450			function,
1451			BRW_MATH_INTEGER_UNSIGNED,
1452			precision,
1453			saturate,
1454			BRW_MATH_DATA_VECTOR);
1455
1456   /* Second instruction:
1457    */
1458   insn = next_insn(p, BRW_OPCODE_SEND);
1459   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1460   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1461
1462   brw_set_dest(p, insn, offset(dest,1));
1463   brw_set_src0(insn, src);
1464   brw_set_math_message(p->brw,
1465			insn,
1466			msg_length, response_length,
1467			function,
1468			BRW_MATH_INTEGER_UNSIGNED,
1469			precision,
1470			saturate,
1471			BRW_MATH_DATA_VECTOR);
1472
1473   brw_pop_insn_state(p);
1474}
1475
1476
1477/**
1478 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1479 * using a constant offset per channel.
1480 *
1481 * The offset must be aligned to oword size (16 bytes).  Used for
1482 * register spilling.
1483 */
1484void brw_oword_block_write_scratch(struct brw_compile *p,
1485				   struct brw_reg mrf,
1486				   int num_regs,
1487				   GLuint offset)
1488{
1489   struct intel_context *intel = &p->brw->intel;
1490   uint32_t msg_control;
1491   int mlen;
1492
1493   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1494
1495   if (num_regs == 1) {
1496      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1497      mlen = 2;
1498   } else {
1499      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1500      mlen = 3;
1501   }
1502
1503   /* Set up the message header.  This is g0, with g0.2 filled with
1504    * the offset.  We don't want to leave our offset around in g0 or
1505    * it'll screw up texture samples, so set it up inside the message
1506    * reg.
1507    */
1508   {
1509      brw_push_insn_state(p);
1510      brw_set_mask_control(p, BRW_MASK_DISABLE);
1511      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1512
1513      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1514
1515      /* set message header global offset field (reg 0, element 2) */
1516      brw_MOV(p,
1517	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1518				  mrf.nr,
1519				  2), BRW_REGISTER_TYPE_UD),
1520	      brw_imm_ud(offset));
1521
1522      brw_pop_insn_state(p);
1523   }
1524
1525   {
1526      struct brw_reg dest;
1527      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1528      int send_commit_msg;
1529      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1530					 BRW_REGISTER_TYPE_UW);
1531
1532      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1533	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1534	 src_header = vec16(src_header);
1535      }
1536      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1537      insn->header.destreg__conditionalmod = mrf.nr;
1538
1539      /* Until gen6, writes followed by reads from the same location
1540       * are not guaranteed to be ordered unless write_commit is set.
1541       * If set, then a no-op write is issued to the destination
1542       * register to set a dependency, and a read from the destination
1543       * can be used to ensure the ordering.
1544       *
1545       * For gen6, only writes between different threads need ordering
1546       * protection.  Our use of DP writes is all about register
1547       * spilling within a thread.
1548       */
1549      if (intel->gen >= 6) {
1550	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1551	 send_commit_msg = 0;
1552      } else {
1553	 dest = src_header;
1554	 send_commit_msg = 1;
1555      }
1556
1557      brw_set_dest(p, insn, dest);
1558      brw_set_src0(insn, brw_null_reg());
1559
1560      brw_set_dp_write_message(p->brw,
1561			       insn,
1562			       255, /* binding table index (255=stateless) */
1563			       msg_control,
1564			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
1565			       mlen,
1566			       GL_TRUE, /* header_present */
1567			       0, /* pixel scoreboard */
1568			       send_commit_msg, /* response_length */
1569			       0, /* eot */
1570			       send_commit_msg); /* send_commit_msg */
1571   }
1572}
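/* Usage sketch (illustrative only, not emitted by this file): a typical
 * register spill first copies the value into the MRF following the header
 * MRF, then calls the helper.  The register choices and the 64-byte offset
 * below are assumptions of this example, not requirements:
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD),
 *            retype(value_to_spill, BRW_REGISTER_TYPE_UD));
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, 64);
 *
 * Here m1 receives the header and m2 carries the payload (num_regs == 1,
 * so mlen == 2), and 'value_to_spill' stands for whatever GRF the caller
 * wants written out.
 */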
1573
1574
1575/**
1576 * Read a block of owords (half a GRF each) from the scratch buffer
1577 * using a constant index per channel.
1578 *
1579 * Offset must be aligned to oword size (16 bytes).  Used for register
1580 * spilling.
1581 */
1582void
1583brw_oword_block_read_scratch(struct brw_compile *p,
1584			     struct brw_reg dest,
1585			     struct brw_reg mrf,
1586			     int num_regs,
1587			     GLuint offset)
1588{
1589   uint32_t msg_control;
1590   int rlen;
1591
1592   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1593   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1594
1595   if (num_regs == 1) {
1596      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1597      rlen = 1;
1598   } else {
1599      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1600      rlen = 2;
1601   }
1602
1603   {
1604      brw_push_insn_state(p);
1605      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1606      brw_set_mask_control(p, BRW_MASK_DISABLE);
1607
1608      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1609
1610      /* set message header global offset field (reg 0, element 2) */
1611      brw_MOV(p,
1612	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1613				  mrf.nr,
1614				  2), BRW_REGISTER_TYPE_UD),
1615	      brw_imm_ud(offset));
1616
1617      brw_pop_insn_state(p);
1618   }
1619
1620   {
1621      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1622
1623      assert(insn->header.predicate_control == 0);
1624      insn->header.compression_control = BRW_COMPRESSION_NONE;
1625      insn->header.destreg__conditionalmod = mrf.nr;
1626
1627      brw_set_dest(p, insn, dest);	/* dest was retyped to UW above */
1628      brw_set_src0(insn, brw_null_reg());
1629
1630      brw_set_dp_read_message(p->brw,
1631			      insn,
1632			      255, /* binding table index (255=stateless) */
1633			      msg_control,
1634			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1635			      1, /* target cache (render/scratch) */
1636			      1, /* msg_length */
1637			      rlen);
1638   }
1639}
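/* Usage sketch (illustrative only): the matching unspill for the example
 * above reads the value back into a caller-chosen GRF, again with m1
 * reserved for the message header.  'unspill_dest' is a placeholder name:
 *
 *    brw_oword_block_read_scratch(p, unspill_dest, brw_message_reg(1),
 *                                 1, 64);
 */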
1640
1641/**
1642 * Read a float[4] vector from the data port Data Cache (const buffer).
1643 * Location (in buffer) should be a multiple of 16.
1644 * Used for fetching shader constants.
1645 */
1646void brw_oword_block_read(struct brw_compile *p,
1647			  struct brw_reg dest,
1648			  struct brw_reg mrf,
1649			  uint32_t offset,
1650			  uint32_t bind_table_index)
1651{
1652   struct intel_context *intel = &p->brw->intel;
1653
1654   /* The offset is passed in bytes; gen6+ messages want it in owords. */
1655   if (intel->gen >= 6)
1656      offset /= 16;
1657
1658   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1659
1660   brw_push_insn_state(p);
1661   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1662   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1663   brw_set_mask_control(p, BRW_MASK_DISABLE);
1664
1665   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1666
1667   /* set message header global offset field (reg 0, element 2) */
1668   brw_MOV(p,
1669	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1670			       mrf.nr,
1671			       2), BRW_REGISTER_TYPE_UD),
1672	   brw_imm_ud(offset));
1673
1674   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1675   insn->header.destreg__conditionalmod = mrf.nr;
1676
1677   /* cast dest to a uword[8] vector */
1678   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1679
1680   brw_set_dest(p, insn, dest);
1681   if (intel->gen >= 6) {
1682      brw_set_src0(insn, mrf);
1683   } else {
1684      brw_set_src0(insn, brw_null_reg());
1685   }
1686
1687   brw_set_dp_read_message(p->brw,
1688			   insn,
1689			   bind_table_index,
1690			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1691			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1692			   0, /* source cache = data cache */
1693			   1, /* msg_length */
1694			   1); /* response_length (1 reg, 2 owords!) */
1695
1696   brw_pop_insn_state(p);
1697}
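/* Usage sketch (illustrative only): fetching the third float[4] constant
 * of a buffer, i.e. byte offset 32, into a caller-chosen GRF.  The binding
 * table slot 'const_surf_index' is a placeholder; the offset is always
 * given in bytes and converted to owords for gen6+ above:
 *
 *    brw_oword_block_read(p, const_dest, brw_message_reg(1),
 *                         32, const_surf_index);
 */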
1698
1699/**
1700 * Read a set of dwords from the data port Data Cache (const buffer).
1701 *
1702 * The per-channel locations (in the buffer) are passed as UD offsets
1703 * in the register following the provided mrf header reg.
1704 */
1705void brw_dword_scattered_read(struct brw_compile *p,
1706			      struct brw_reg dest,
1707			      struct brw_reg mrf,
1708			      uint32_t bind_table_index)
1709{
1710   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1711
1712   brw_push_insn_state(p);
1713   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1714   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1715   brw_set_mask_control(p, BRW_MASK_DISABLE);
1716   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1717   brw_pop_insn_state(p);
1718
1719   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1720   insn->header.destreg__conditionalmod = mrf.nr;
1721
1722   /* cast dest to a uword[8] vector */
1723   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1724
1725   brw_set_dest(p, insn, dest);
1726   brw_set_src0(insn, brw_null_reg());
1727
1728   brw_set_dp_read_message(p->brw,
1729			   insn,
1730			   bind_table_index,
1731			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1732			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1733			   0, /* source cache = data cache */
1734			   2, /* msg_length */
1735			   1); /* response_length */
1736}
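/* Usage sketch (illustrative only): the caller builds the per-channel
 * dword offsets in the MRF that follows the header before issuing the
 * read.  'dword_offsets' is a placeholder for a GRF of UD offsets and
 * 'surf_index' for the binding table slot:
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD),
 *            retype(dword_offsets, BRW_REGISTER_TYPE_UD));
 *    brw_dword_scattered_read(p, scattered_dest, brw_message_reg(1),
 *                             surf_index);
 */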
1737
1738
1739
1740/**
1741 * Read a float[4] constant from the VS constant buffer at 'location'
1742 * into the lower half of 'dest'.  Relative addressing is handled
1743 * separately by brw_dp_READ_4_vs_relative().
1744 */
1745void brw_dp_READ_4_vs(struct brw_compile *p,
1746                      struct brw_reg dest,
1747                      GLuint location,
1748                      GLuint bind_table_index)
1749{
1750   struct intel_context *intel = &p->brw->intel;
1751   struct brw_instruction *insn;
1752   GLuint msg_reg_nr = 1;
1753
1754   if (intel->gen >= 6)
1755      location /= 16;
1756
1757   /* Set up MRF[1] with the location/offset into the const buffer */
1758   brw_push_insn_state(p);
1759   brw_set_access_mode(p, BRW_ALIGN_1);
1760   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1761   brw_set_mask_control(p, BRW_MASK_DISABLE);
1762   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1763   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1764		     BRW_REGISTER_TYPE_UD),
1765	   brw_imm_ud(location));
1766   brw_pop_insn_state(p);
1767
1768   insn = next_insn(p, BRW_OPCODE_SEND);
1769
1770   insn->header.predicate_control = BRW_PREDICATE_NONE;
1771   insn->header.compression_control = BRW_COMPRESSION_NONE;
1772   insn->header.destreg__conditionalmod = msg_reg_nr;
1773   insn->header.mask_control = BRW_MASK_DISABLE;
1774
1775   brw_set_dest(p, insn, dest);
1776   if (intel->gen >= 6) {
1777      brw_set_src0(insn, brw_message_reg(msg_reg_nr));
1778   } else {
1779      brw_set_src0(insn, brw_null_reg());
1780   }
1781
1782   brw_set_dp_read_message(p->brw,
1783			   insn,
1784			   bind_table_index,
1785			   0, /* msg_control */
1786			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1787			   0, /* source cache = data cache */
1788			   1, /* msg_length */
1789			   1); /* response_length (1 Oword) */
1790}
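/* Usage sketch (illustrative only): reading VS constant 3, i.e. byte
 * location 3 * 16, from binding table slot 'vs_const_surf' (a placeholder)
 * into the lower half of a destination register:
 *
 *    brw_dp_READ_4_vs(p, vs_const_dest, 3 * 16, vs_const_surf);
 */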
1791
1792/**
1793 * Read a float[4] constant per vertex from VS constant buffer, with
1794 * relative addressing.
1795 */
1796void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1797			       struct brw_reg dest,
1798			       struct brw_reg addr_reg,
1799			       GLuint offset,
1800			       GLuint bind_table_index)
1801{
1802   struct intel_context *intel = &p->brw->intel;
1803   int msg_type;
1804
1805   /* Set up MRF[1] with the offset into the const buffer */
1806   brw_push_insn_state(p);
1807   brw_set_access_mode(p, BRW_ALIGN_1);
1808   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1809   brw_set_mask_control(p, BRW_MASK_DISABLE);
1810   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1811
1812   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1813    * fields ignored.
1814    */
1815   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
1816	   addr_reg, brw_imm_d(offset));
1817   brw_pop_insn_state(p);
1818
1819   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1820
1821   insn->header.predicate_control = BRW_PREDICATE_NONE;
1822   insn->header.compression_control = BRW_COMPRESSION_NONE;
1823   insn->header.destreg__conditionalmod = 0;
1824   insn->header.mask_control = BRW_MASK_DISABLE;
1825
1826   brw_set_dest(p, insn, dest);
1827   brw_set_src0(insn, brw_vec8_grf(0, 0));
1828
1829   if (intel->gen == 6)
1830      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1831   else if (intel->gen == 5 || intel->is_g4x)
1832      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1833   else
1834      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1835
1836   brw_set_dp_read_message(p->brw,
1837			   insn,
1838			   bind_table_index,
1839			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1840			   msg_type,
1841			   0, /* source cache = data cache */
1842			   2, /* msg_length */
1843			   1); /* response_length */
1844}
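/* Usage sketch (illustrative only): an indirectly addressed constant
 * fetch adds a compile-time offset to the per-vertex address already
 * held in the address register.  The names here are placeholders, and
 * the offset units follow whatever the caller stored in 'vs_addr_reg':
 *
 *    brw_dp_READ_4_vs_relative(p, rel_const_dest, vs_addr_reg,
 *                              rel_offset, vs_const_surf);
 */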
1845
1846
1847
1848void brw_fb_WRITE(struct brw_compile *p,
1849		  int dispatch_width,
1850                  struct brw_reg dest,
1851                  GLuint msg_reg_nr,
1852                  struct brw_reg src0,
1853                  GLuint binding_table_index,
1854                  GLuint msg_length,
1855                  GLuint response_length,
1856                  GLboolean eot,
1857                  GLboolean header_present)
1858{
1859   struct intel_context *intel = &p->brw->intel;
1860   struct brw_instruction *insn;
1861   GLuint msg_control, msg_type;
1862
1863   if (intel->gen >= 6 && binding_table_index == 0) {
1864      insn = next_insn(p, BRW_OPCODE_SENDC);
1865   } else {
1866      insn = next_insn(p, BRW_OPCODE_SEND);
1867   }
1868   /* The execution mask is ignored for render target writes. */
1869   insn->header.predicate_control = 0;
1870   insn->header.compression_control = BRW_COMPRESSION_NONE;
1871
1872   if (intel->gen >= 6) {
1873       /* headerless version, just submit color payload */
1874       src0 = brw_message_reg(msg_reg_nr);
1875
1876       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1877   } else {
1878      insn->header.destreg__conditionalmod = msg_reg_nr;
1879
1880      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1881   }
1882
1883   if (dispatch_width == 16)
1884      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1885   else
1886      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1887
1888   brw_set_dest(p, insn, dest);
1889   brw_set_src0(insn, src0);
1890   brw_set_dp_write_message(p->brw,
1891			    insn,
1892			    binding_table_index,
1893			    msg_control,
1894			    msg_type,
1895			    msg_length,
1896			    header_present,
1897			    1,	/* pixel scoreboard */
1898			    response_length,
1899			    eot,
1900			    0 /* send_commit_msg */);
1901}
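/* Usage sketch (illustrative only): a SIMD16 render target write whose
 * color payload has already been packed into MRFs starting at m2.
 * 'rt_surf', 'mlen' and 'fb_src0' are placeholders; with a
 * response_length of 0 the null register serves as the destination, and
 * on gen6+ the src0 argument is replaced internally with the message
 * register anyway:
 *
 *    brw_fb_WRITE(p, 16,
 *                 retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 *                 2, fb_src0, rt_surf, mlen,
 *                 0, GL_TRUE, GL_TRUE);
 */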
1902
1903
1904/**
1905 * Texture sample instruction.
1906 * Note: the msg_type plus msg_length values determine exactly what kind
1907 * of sampling operation is performed.  See volume 4, page 161 of docs.
1908 */
1909void brw_SAMPLE(struct brw_compile *p,
1910		struct brw_reg dest,
1911		GLuint msg_reg_nr,
1912		struct brw_reg src0,
1913		GLuint binding_table_index,
1914		GLuint sampler,
1915		GLuint writemask,
1916		GLuint msg_type,
1917		GLuint response_length,
1918		GLuint msg_length,
1919		GLboolean eot,
1920		GLuint header_present,
1921		GLuint simd_mode)
1922{
1923   struct intel_context *intel = &p->brw->intel;
1924   GLboolean need_stall = 0;
1925
1926   if (writemask == 0) {
1927      /*printf("%s: zero writemask??\n", __FUNCTION__); */
1928      return;
1929   }
1930
1931   /* Hardware doesn't do destination dependency checking on send
1932    * instructions properly.  Add a workaround which generates the
1933    * dependency by other means.  In practice it seems like this bug
1934    * only crops up for texture samples, and only where registers are
1935    * written by the send and then written again later without being
1936    * read in between.  Luckily for us, we already track that
1937    * information and use it to modify the writemask for the
1938    * instruction, so that is a guide for whether a workaround is
1939    * needed.
1940    */
1941   if (writemask != WRITEMASK_XYZW) {
1942      GLuint dst_offset = 0;
1943      GLuint i, newmask = 0, len = 0;
1944
1945      for (i = 0; i < 4; i++) {
1946	 if (writemask & (1<<i))
1947	    break;
1948	 dst_offset += 2;
1949      }
1950      for (; i < 4; i++) {
1951	 if (!(writemask & (1<<i)))
1952	    break;
1953	 newmask |= 1<<i;
1954	 len++;
1955      }
1956
1957      if (newmask != writemask) {
1958	 need_stall = 1;
1959         /* printf("need stall %x %x\n", newmask , writemask); */
1960      }
1961      else {
1962	 GLboolean dispatch_16 = GL_FALSE;
1963
1964	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
1965
1966	 guess_execution_size(p, p->current, dest);
1967	 if (p->current->header.execution_size == BRW_EXECUTE_16)
1968	    dispatch_16 = GL_TRUE;
1969
1970	 newmask = ~newmask & WRITEMASK_XYZW;
1971
1972	 brw_push_insn_state(p);
1973
1974	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1975	 brw_set_mask_control(p, BRW_MASK_DISABLE);
1976
1977	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
1978		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
1979	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
1980
1981	 brw_pop_insn_state(p);
1982
1983	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
1984	 dest = offset(dest, dst_offset);
1985
1986	 /* For 16-wide dispatch, masked channels are skipped in the
1987	  * response.  For 8-wide, masked channels still take up slots,
1988	  * and are just not written to.
1989	  */
1990	 if (dispatch_16)
1991	    response_length = len * 2;
1992      }
1993   }
1994
1995   {
1996      struct brw_instruction *insn;
1997
1998      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
1999
2000      insn = next_insn(p, BRW_OPCODE_SEND);
2001      insn->header.predicate_control = 0; /* XXX */
2002      insn->header.compression_control = BRW_COMPRESSION_NONE;
2003      if (intel->gen < 6)
2004	  insn->header.destreg__conditionalmod = msg_reg_nr;
2005
2006      brw_set_dest(p, insn, dest);
2007      brw_set_src0(insn, src0);
2008      brw_set_sampler_message(p->brw, insn,
2009			      binding_table_index,
2010			      sampler,
2011			      msg_type,
2012			      response_length,
2013			      msg_length,
2014			      eot,
2015			      header_present,
2016			      simd_mode);
2017   }
2018
2019   if (need_stall) {
2020      struct brw_reg reg = vec8(offset(dest, response_length-1));
2021
2022      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2023       */
2024      brw_push_insn_state(p);
2025      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2026      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2027	      retype(reg, BRW_REGISTER_TYPE_UD));
2028      brw_pop_insn_state(p);
2029   }
2030
2031}
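/* Worked example of the writemask handling above: a sample writing only
 * .yz (writemask 0x6) skips one unwritten leading channel, so dst_offset
 * becomes 2, newmask ends up 0x6 and len is 2.  Since newmask equals the
 * writemask, no stall is needed: the complement mask (.xw) is written
 * into m1.2, dest is advanced by dst_offset, and for 16-wide dispatch
 * response_length is reduced to len * 2 = 4 registers.  A non-contiguous
 * mask such as .xz (0x5) instead falls back to the need_stall path.
 */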
2032
2033/* All these variables are pretty confusing - we might be better off
2034 * using bitmasks and macros for this, in the old style.  Or perhaps
2035 * just having the caller instantiate the fields in dword3 itself.
2036 */
2037void brw_urb_WRITE(struct brw_compile *p,
2038		   struct brw_reg dest,
2039		   GLuint msg_reg_nr,
2040		   struct brw_reg src0,
2041		   GLboolean allocate,
2042		   GLboolean used,
2043		   GLuint msg_length,
2044		   GLuint response_length,
2045		   GLboolean eot,
2046		   GLboolean writes_complete,
2047		   GLuint offset,
2048		   GLuint swizzle)
2049{
2050   struct intel_context *intel = &p->brw->intel;
2051   struct brw_instruction *insn;
2052
2053   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2054
2055   insn = next_insn(p, BRW_OPCODE_SEND);
2056
2057   assert(msg_length < BRW_MAX_MRF);
2058
2059   brw_set_dest(p, insn, dest);
2060   brw_set_src0(insn, src0);
2061   brw_set_src1(insn, brw_imm_d(0));
2062
2063   if (intel->gen < 6)
2064      insn->header.destreg__conditionalmod = msg_reg_nr;
2065
2066   brw_set_urb_message(p->brw,
2067		       insn,
2068		       allocate,
2069		       used,
2070		       msg_length,
2071		       response_length,
2072		       eot,
2073		       writes_complete,
2074		       offset,
2075		       swizzle);
2076}
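/* Usage sketch (illustrative only): the final VS URB write typically
 * sends 'mlen' MRFs starting at m0 with EOT set and no response.  The
 * exact src0, swizzle and flag values depend on the unit emitting the
 * write, so treat these arguments as placeholders:
 *
 *    brw_urb_WRITE(p, brw_null_reg(), 0, brw_vec8_grf(0, 0),
 *                  0, 1, mlen, 0, 1, 1, 0, BRW_URB_SWIZZLE_INTERLEAVE);
 */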
2077
2078static int
2079brw_find_next_block_end(struct brw_compile *p, int start)
2080{
2081   int ip;
2082
2083   for (ip = start + 1; ip < p->nr_insn; ip++) {
2084      struct brw_instruction *insn = &p->store[ip];
2085
2086      switch (insn->header.opcode) {
2087      case BRW_OPCODE_ENDIF:
2088      case BRW_OPCODE_ELSE:
2089      case BRW_OPCODE_WHILE:
2090	 return ip;
2091      }
2092   }
2093   assert(!"not reached");
2094   return start + 1;
2095}
2096
2097/* There is no DO instruction on gen6, so to find the end of the loop
2098 * we scan forward for a WHILE instruction whose backward jump lands
2099 * before our start instruction.
2100 */
2101static int
2102brw_find_loop_end(struct brw_compile *p, int start)
2103{
2104   int ip;
2105   int br = 2;   /* gen6 jump counts are in units of 64 bits (half an instruction) */
2106
2107   for (ip = start + 1; ip < p->nr_insn; ip++) {
2108      struct brw_instruction *insn = &p->store[ip];
2109
2110      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2111	 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
2112	    return ip;
2113      }
2114   }
2115   assert(!"not reached");
2116   return start + 1;
2117}
2118
2119/* After program generation, go back and update the UIP and JIP of
2120 * BREAK and CONTINUE instructions to their correct locations.
2121 */
2122void
2123brw_set_uip_jip(struct brw_compile *p)
2124{
2125   struct intel_context *intel = &p->brw->intel;
2126   int ip;
2127   int br = 2;   /* gen6 jump targets are in units of 64 bits (half an instruction) */
2128
2129   if (intel->gen < 6)
2130      return;
2131
2132   for (ip = 0; ip < p->nr_insn; ip++) {
2133      struct brw_instruction *insn = &p->store[ip];
2134
2135      switch (insn->header.opcode) {
2136      case BRW_OPCODE_BREAK:
2137	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2138	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
2139	 break;
2140      case BRW_OPCODE_CONTINUE:
2141	 /* JIP is set at CONTINUE emit time, since that's when we
2142	  * know where the start of the loop is.
2143	  */
2144	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2145	 assert(insn->bits3.break_cont.uip != 0);
2146	 assert(insn->bits3.break_cont.jip != 0);
2147	 break;
2148      }
2149   }
2150}
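/* Worked example of the patching above: with br == 2, a BREAK at ip 10
 * whose enclosing block ends (ELSE/ENDIF/WHILE) at ip 14 and whose loop's
 * WHILE sits at ip 20 is patched to jip = 2 * (14 - 10) = 8 and
 * uip = 2 * (20 - 10 + 1) = 22, i.e. UIP lands just past the WHILE.
 */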
2151
2152void brw_ff_sync(struct brw_compile *p,
2153		   struct brw_reg dest,
2154		   GLuint msg_reg_nr,
2155		   struct brw_reg src0,
2156		   GLboolean allocate,
2157		   GLuint response_length,
2158		   GLboolean eot)
2159{
2160   struct intel_context *intel = &p->brw->intel;
2161   struct brw_instruction *insn;
2162
2163   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2164
2165   insn = next_insn(p, BRW_OPCODE_SEND);
2166   brw_set_dest(p, insn, dest);
2167   brw_set_src0(insn, src0);
2168   brw_set_src1(insn, brw_imm_d(0));
2169
2170   if (intel->gen < 6)
2171       insn->header.destreg__conditionalmod = msg_reg_nr;
2172
2173   brw_set_ff_sync_message(p->brw,
2174			   insn,
2175			   allocate,
2176			   response_length,
2177			   eot);
2178}
2179