brw_eu_emit.c revision 0b77d57394a3712851ec271aa7ad353d56f302a1
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37
38
39
40/***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
44static void guess_execution_size( struct brw_instruction *insn,
45				  struct brw_reg reg )
46{
47   if (reg.width == BRW_WIDTH_8 &&
48       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED)
49      insn->header.execution_size = BRW_EXECUTE_16;
50   else
51      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
52}
53
54
55static void brw_set_dest( struct brw_instruction *insn,
56			  struct brw_reg dest )
57{
58   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
59       dest.file != BRW_MESSAGE_REGISTER_FILE)
60      assert(dest.nr < 128);
61
62   insn->bits1.da1.dest_reg_file = dest.file;
63   insn->bits1.da1.dest_reg_type = dest.type;
64   insn->bits1.da1.dest_address_mode = dest.address_mode;
65
66   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
67      insn->bits1.da1.dest_reg_nr = dest.nr;
68
69      if (insn->header.access_mode == BRW_ALIGN_1) {
70	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
71	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
72	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
73	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
74      }
75      else {
76	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
77	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
78	 /* even ignored in da16, still need to set as '01' */
79	 insn->bits1.da16.dest_horiz_stride = 1;
80      }
81   }
82   else {
83      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
84
85      /* These are different sizes in align1 vs align16:
86       */
87      if (insn->header.access_mode == BRW_ALIGN_1) {
88	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
89	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
90	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
91	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
92      }
93      else {
94	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
95	 /* even ignored in ia16, still need to set as '01' */
96	 insn->bits1.ia16.dest_horiz_stride = 1;
97      }
98   }
99
100   /* NEW: Set the execution size based on dest.width and
101    * insn->compression_control:
102    */
103   guess_execution_size(insn, dest);
104}
105
106extern int reg_type_size[];
107
108static void
109validate_reg(struct brw_instruction *insn, struct brw_reg reg)
110{
111   int hstride_for_reg[] = {0, 1, 2, 4};
112   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
113   int width_for_reg[] = {1, 2, 4, 8, 16};
114   int execsize_for_reg[] = {1, 2, 4, 8, 16};
115   int width, hstride, vstride, execsize;
116
117   if (reg.file == BRW_IMMEDIATE_VALUE) {
118      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
119       * mean the destination has to be 128-bit aligned and the
120       * destination horiz stride has to be a word.
121       */
122      if (reg.type == BRW_REGISTER_TYPE_V) {
123	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
124		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
125      }
126
127      return;
128   }
129
130   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
131       reg.nr == BRW_ARF_NULL)
132      return;
133
134   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
135   hstride = hstride_for_reg[reg.hstride];
136
137   if (reg.vstride == 0xf) {
138      vstride = -1;
139   } else {
140      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
141      vstride = vstride_for_reg[reg.vstride];
142   }
143
144   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
145   width = width_for_reg[reg.width];
146
147   assert(insn->header.execution_size >= 0 &&
148	  insn->header.execution_size < Elements(execsize_for_reg));
149   execsize = execsize_for_reg[insn->header.execution_size];
150
151   /* Restrictions from 3.3.10: Register Region Restrictions. */
152   /* 3. */
153   assert(execsize >= width);
154
155   /* 4. */
156   if (execsize == width && hstride != 0) {
157      assert(vstride == -1 || vstride == width * hstride);
158   }
159
160   /* 5. */
161   if (execsize == width && hstride == 0) {
162      /* no restriction on vstride. */
163   }
164
165   /* 6. */
166   if (width == 1) {
167      assert(hstride == 0);
168   }
169
170   /* 7. */
171   if (execsize == 1 && width == 1) {
172      assert(hstride == 0);
173      assert(vstride == 0);
174   }
175
176   /* 8. */
177   if (vstride == 0 && hstride == 0) {
178      assert(width == 1);
179   }
180
181   /* 10. Check destination issues. */
182}
183
184static void brw_set_src0( struct brw_instruction *insn,
185                          struct brw_reg reg )
186{
187   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
188      assert(reg.nr < 128);
189
190   validate_reg(insn, reg);
191
192   insn->bits1.da1.src0_reg_file = reg.file;
193   insn->bits1.da1.src0_reg_type = reg.type;
194   insn->bits2.da1.src0_abs = reg.abs;
195   insn->bits2.da1.src0_negate = reg.negate;
196   insn->bits2.da1.src0_address_mode = reg.address_mode;
197
198   if (reg.file == BRW_IMMEDIATE_VALUE) {
199      insn->bits3.ud = reg.dw1.ud;
200
201      /* Required to set some fields in src1 as well:
202       */
203      insn->bits1.da1.src1_reg_file = 0; /* arf */
204      insn->bits1.da1.src1_reg_type = reg.type;
205   }
206   else
207   {
208      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
209	 if (insn->header.access_mode == BRW_ALIGN_1) {
210	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
211	    insn->bits2.da1.src0_reg_nr = reg.nr;
212	 }
213	 else {
214	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
215	    insn->bits2.da16.src0_reg_nr = reg.nr;
216	 }
217      }
218      else {
219	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
220
221	 if (insn->header.access_mode == BRW_ALIGN_1) {
222	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
223	 }
224	 else {
225	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
226	 }
227      }
228
229      if (insn->header.access_mode == BRW_ALIGN_1) {
230	 if (reg.width == BRW_WIDTH_1 &&
231	     insn->header.execution_size == BRW_EXECUTE_1) {
232	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
233	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
234	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
235	 }
236	 else {
237	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
238	    insn->bits2.da1.src0_width = reg.width;
239	    insn->bits2.da1.src0_vert_stride = reg.vstride;
240	 }
241      }
242      else {
243	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
244	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
245	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
246	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
247
248	 /* This is an oddity of the fact that we're using the same
249	  * descriptions for registers in align_16 as align_1:
250	  */
251	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
252	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
253	 else
254	    insn->bits2.da16.src0_vert_stride = reg.vstride;
255      }
256   }
257}
258
259
260void brw_set_src1( struct brw_instruction *insn,
261                   struct brw_reg reg )
262{
263   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
264
265   assert(reg.nr < 128);
266
267   validate_reg(insn, reg);
268
269   insn->bits1.da1.src1_reg_file = reg.file;
270   insn->bits1.da1.src1_reg_type = reg.type;
271   insn->bits3.da1.src1_abs = reg.abs;
272   insn->bits3.da1.src1_negate = reg.negate;
273
274   /* Only src1 can be immediate in two-argument instructions.
275    */
276   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
277
278   if (reg.file == BRW_IMMEDIATE_VALUE) {
279      insn->bits3.ud = reg.dw1.ud;
280   }
281   else {
282      /* This is a hardware restriction, which may or may not be lifted
283       * in the future:
284       */
285      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
286      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
287
288      if (insn->header.access_mode == BRW_ALIGN_1) {
289	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
290	 insn->bits3.da1.src1_reg_nr = reg.nr;
291      }
292      else {
293	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
294	 insn->bits3.da16.src1_reg_nr = reg.nr;
295      }
296
297      if (insn->header.access_mode == BRW_ALIGN_1) {
298	 if (reg.width == BRW_WIDTH_1 &&
299	     insn->header.execution_size == BRW_EXECUTE_1) {
300	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
301	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
302	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
303	 }
304	 else {
305	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
306	    insn->bits3.da1.src1_width = reg.width;
307	    insn->bits3.da1.src1_vert_stride = reg.vstride;
308	 }
309      }
310      else {
311	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
312	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
313	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
314	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
315
316	 /* This is an oddity of the fact that we're using the same
317	  * descriptions for registers in align_16 as align_1:
318	  */
319	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
320	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
321	 else
322	    insn->bits3.da16.src1_vert_stride = reg.vstride;
323      }
324   }
325}
326
327
328
329static void brw_set_math_message( struct brw_context *brw,
330				  struct brw_instruction *insn,
331				  GLuint msg_length,
332				  GLuint response_length,
333				  GLuint function,
334				  GLuint integer_type,
335				  GLboolean low_precision,
336				  GLboolean saturate,
337				  GLuint dataType )
338{
339   struct intel_context *intel = &brw->intel;
340   brw_set_src1(insn, brw_imm_d(0));
341
342   if (intel->gen == 5) {
343       insn->bits3.math_gen5.function = function;
344       insn->bits3.math_gen5.int_type = integer_type;
345       insn->bits3.math_gen5.precision = low_precision;
346       insn->bits3.math_gen5.saturate = saturate;
347       insn->bits3.math_gen5.data_type = dataType;
348       insn->bits3.math_gen5.snapshot = 0;
349       insn->bits3.math_gen5.header_present = 0;
350       insn->bits3.math_gen5.response_length = response_length;
351       insn->bits3.math_gen5.msg_length = msg_length;
352       insn->bits3.math_gen5.end_of_thread = 0;
353       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
354       insn->bits2.send_gen5.end_of_thread = 0;
355   } else {
356       insn->bits3.math.function = function;
357       insn->bits3.math.int_type = integer_type;
358       insn->bits3.math.precision = low_precision;
359       insn->bits3.math.saturate = saturate;
360       insn->bits3.math.data_type = dataType;
361       insn->bits3.math.response_length = response_length;
362       insn->bits3.math.msg_length = msg_length;
363       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
364       insn->bits3.math.end_of_thread = 0;
365   }
366}
367
368
369static void brw_set_ff_sync_message(struct brw_context *brw,
370				    struct brw_instruction *insn,
371				    GLboolean allocate,
372				    GLuint response_length,
373				    GLboolean end_of_thread)
374{
375	struct intel_context *intel = &brw->intel;
376	brw_set_src1(insn, brw_imm_d(0));
377
378	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
379	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
380	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
381	insn->bits3.urb_gen5.allocate = allocate;
382	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
383	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
384	insn->bits3.urb_gen5.header_present = 1;
385	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
386	insn->bits3.urb_gen5.msg_length = 1;
387	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
388	if (intel->gen >= 6) {
389	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
390	} else {
391	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
392	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
393	}
394}
395
396static void brw_set_urb_message( struct brw_context *brw,
397				 struct brw_instruction *insn,
398				 GLboolean allocate,
399				 GLboolean used,
400				 GLuint msg_length,
401				 GLuint response_length,
402				 GLboolean end_of_thread,
403				 GLboolean complete,
404				 GLuint offset,
405				 GLuint swizzle_control )
406{
407    struct intel_context *intel = &brw->intel;
408    brw_set_src1(insn, brw_imm_d(0));
409
410    if (intel->gen >= 5) {
411        insn->bits3.urb_gen5.opcode = 0;	/* ? */
412        insn->bits3.urb_gen5.offset = offset;
413        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
414        insn->bits3.urb_gen5.allocate = allocate;
415        insn->bits3.urb_gen5.used = used;	/* ? */
416        insn->bits3.urb_gen5.complete = complete;
417        insn->bits3.urb_gen5.header_present = 1;
418        insn->bits3.urb_gen5.response_length = response_length;
419        insn->bits3.urb_gen5.msg_length = msg_length;
420        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
421	if (intel->gen >= 6) {
422	   /* For SNB, the SFID bits moved to the condmod bits, and
423	    * EOT stayed in bits3 above.  Does the EOT bit setting
424	    * below on Ironlake even do anything?
425	    */
426	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
427	} else {
428	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
429	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
430	}
431    } else {
432        insn->bits3.urb.opcode = 0;	/* ? */
433        insn->bits3.urb.offset = offset;
434        insn->bits3.urb.swizzle_control = swizzle_control;
435        insn->bits3.urb.allocate = allocate;
436        insn->bits3.urb.used = used;	/* ? */
437        insn->bits3.urb.complete = complete;
438        insn->bits3.urb.response_length = response_length;
439        insn->bits3.urb.msg_length = msg_length;
440        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
441        insn->bits3.urb.end_of_thread = end_of_thread;
442    }
443}
444
445static void brw_set_dp_write_message( struct brw_context *brw,
446				      struct brw_instruction *insn,
447				      GLuint binding_table_index,
448				      GLuint msg_control,
449				      GLuint msg_type,
450				      GLuint msg_length,
451				      GLboolean header_present,
452				      GLuint pixel_scoreboard_clear,
453				      GLuint response_length,
454				      GLuint end_of_thread,
455				      GLuint send_commit_msg)
456{
457   struct intel_context *intel = &brw->intel;
458   brw_set_src1(insn, brw_imm_ud(0));
459
460   if (intel->gen >= 6) {
461       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
462       insn->bits3.dp_render_cache.msg_control = msg_control;
463       insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
464       insn->bits3.dp_render_cache.msg_type = msg_type;
465       insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
466       insn->bits3.dp_render_cache.header_present = header_present;
467       insn->bits3.dp_render_cache.response_length = response_length;
468       insn->bits3.dp_render_cache.msg_length = msg_length;
469       insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
470       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
471	/* XXX really need below? */
472       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
473       insn->bits2.send_gen5.end_of_thread = end_of_thread;
474   } else if (intel->gen == 5) {
475       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
476       insn->bits3.dp_write_gen5.msg_control = msg_control;
477       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
478       insn->bits3.dp_write_gen5.msg_type = msg_type;
479       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
480       insn->bits3.dp_write_gen5.header_present = header_present;
481       insn->bits3.dp_write_gen5.response_length = response_length;
482       insn->bits3.dp_write_gen5.msg_length = msg_length;
483       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
484       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
485       insn->bits2.send_gen5.end_of_thread = end_of_thread;
486   } else {
487       insn->bits3.dp_write.binding_table_index = binding_table_index;
488       insn->bits3.dp_write.msg_control = msg_control;
489       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
490       insn->bits3.dp_write.msg_type = msg_type;
491       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
492       insn->bits3.dp_write.response_length = response_length;
493       insn->bits3.dp_write.msg_length = msg_length;
494       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
495       insn->bits3.dp_write.end_of_thread = end_of_thread;
496   }
497}
498
499static void brw_set_dp_read_message( struct brw_context *brw,
500				      struct brw_instruction *insn,
501				      GLuint binding_table_index,
502				      GLuint msg_control,
503				      GLuint msg_type,
504				      GLuint target_cache,
505				      GLuint msg_length,
506				      GLuint response_length,
507				      GLuint end_of_thread )
508{
509   struct intel_context *intel = &brw->intel;
510   brw_set_src1(insn, brw_imm_d(0));
511
512   if (intel->gen == 5) {
513       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
514       insn->bits3.dp_read_gen5.msg_control = msg_control;
515       insn->bits3.dp_read_gen5.msg_type = msg_type;
516       insn->bits3.dp_read_gen5.target_cache = target_cache;
517       insn->bits3.dp_read_gen5.header_present = 1;
518       insn->bits3.dp_read_gen5.response_length = response_length;
519       insn->bits3.dp_read_gen5.msg_length = msg_length;
520       insn->bits3.dp_read_gen5.pad1 = 0;
521       insn->bits3.dp_read_gen5.end_of_thread = end_of_thread;
522       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
523       insn->bits2.send_gen5.end_of_thread = end_of_thread;
524   } else {
525       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
526       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
527       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
528       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
529       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
530       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
531       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
532       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
533       insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
534   }
535}
536
537static void brw_set_sampler_message(struct brw_context *brw,
538                                    struct brw_instruction *insn,
539                                    GLuint binding_table_index,
540                                    GLuint sampler,
541                                    GLuint msg_type,
542                                    GLuint response_length,
543                                    GLuint msg_length,
544                                    GLboolean eot,
545                                    GLuint header_present,
546                                    GLuint simd_mode)
547{
548   struct intel_context *intel = &brw->intel;
549   assert(eot == 0);
550   brw_set_src1(insn, brw_imm_d(0));
551
552   if (intel->gen >= 5) {
553      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
554      insn->bits3.sampler_gen5.sampler = sampler;
555      insn->bits3.sampler_gen5.msg_type = msg_type;
556      insn->bits3.sampler_gen5.simd_mode = simd_mode;
557      insn->bits3.sampler_gen5.header_present = header_present;
558      insn->bits3.sampler_gen5.response_length = response_length;
559      insn->bits3.sampler_gen5.msg_length = msg_length;
560      insn->bits3.sampler_gen5.end_of_thread = eot;
561      if (intel->gen >= 6)
562	  insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
563      else {
564	  insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
565	  insn->bits2.send_gen5.end_of_thread = eot;
566      }
567   } else if (intel->is_g4x) {
568      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
569      insn->bits3.sampler_g4x.sampler = sampler;
570      insn->bits3.sampler_g4x.msg_type = msg_type;
571      insn->bits3.sampler_g4x.response_length = response_length;
572      insn->bits3.sampler_g4x.msg_length = msg_length;
573      insn->bits3.sampler_g4x.end_of_thread = eot;
574      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
575   } else {
576      insn->bits3.sampler.binding_table_index = binding_table_index;
577      insn->bits3.sampler.sampler = sampler;
578      insn->bits3.sampler.msg_type = msg_type;
579      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
580      insn->bits3.sampler.response_length = response_length;
581      insn->bits3.sampler.msg_length = msg_length;
582      insn->bits3.sampler.end_of_thread = eot;
583      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
584   }
585}
586
587
588
589static struct brw_instruction *next_insn( struct brw_compile *p,
590					  GLuint opcode )
591{
592   struct brw_instruction *insn;
593
594   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
595
596   insn = &p->store[p->nr_insn++];
597   memcpy(insn, p->current, sizeof(*insn));
598
599   /* Reset this one-shot flag:
600    */
601
602   if (p->current->header.destreg__conditionalmod) {
603      p->current->header.destreg__conditionalmod = 0;
604      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
605   }
606
607   insn->header.opcode = opcode;
608   return insn;
609}
610
611
612static struct brw_instruction *brw_alu1( struct brw_compile *p,
613					 GLuint opcode,
614					 struct brw_reg dest,
615					 struct brw_reg src )
616{
617   struct brw_instruction *insn = next_insn(p, opcode);
618   brw_set_dest(insn, dest);
619   brw_set_src0(insn, src);
620   return insn;
621}
622
623static struct brw_instruction *brw_alu2(struct brw_compile *p,
624					GLuint opcode,
625					struct brw_reg dest,
626					struct brw_reg src0,
627					struct brw_reg src1 )
628{
629   struct brw_instruction *insn = next_insn(p, opcode);
630   brw_set_dest(insn, dest);
631   brw_set_src0(insn, src0);
632   brw_set_src1(insn, src1);
633   return insn;
634}
635
636
637/***********************************************************************
638 * Convenience routines.
639 */
640#define ALU1(OP)					\
641struct brw_instruction *brw_##OP(struct brw_compile *p,	\
642	      struct brw_reg dest,			\
643	      struct brw_reg src0)   			\
644{							\
645   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
646}
647
648#define ALU2(OP)					\
649struct brw_instruction *brw_##OP(struct brw_compile *p,	\
650	      struct brw_reg dest,			\
651	      struct brw_reg src0,			\
652	      struct brw_reg src1)   			\
653{							\
654   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
655}
656
657/* Rounding operations (other than RNDD) require two instructions - the first
658 * stores a rounded value (possibly the wrong way) in the dest register, but
659 * also sets a per-channel "increment bit" in the flag register.  A predicated
660 * add of 1.0 fixes dest to contain the desired result.
661 */
662#define ROUND(OP)							      \
663void brw_##OP(struct brw_compile *p,					      \
664	      struct brw_reg dest,					      \
665	      struct brw_reg src)					      \
666{									      \
667   struct brw_instruction *rnd, *add;					      \
668   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
669   brw_set_dest(rnd, dest);						      \
670   brw_set_src0(rnd, src);						      \
671   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
672									      \
673   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
674   add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
675}
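
/*
 * Illustrative sketch (not part of this file; the GRF numbers are
 * hypothetical): what a caller gets from the ROUND helpers above.
 *
 *    struct brw_reg dst = brw_vec8_grf(4, 0);
 *    struct brw_reg val = brw_vec8_grf(2, 0);
 *
 *    brw_RNDE(p, dst, val);
 *
 * This emits two instructions: an RNDE that writes a provisionally rounded
 * value to dst and sets the per-channel round-increment flag, followed by a
 * predicated ADD of 1.0 that corrects the channels rounded the wrong way.
 */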
676
677
678ALU1(MOV)
679ALU2(SEL)
680ALU1(NOT)
681ALU2(AND)
682ALU2(OR)
683ALU2(XOR)
684ALU2(SHR)
685ALU2(SHL)
686ALU2(RSR)
687ALU2(RSL)
688ALU2(ASR)
689ALU1(FRC)
690ALU1(RNDD)
691ALU2(MAC)
692ALU2(MACH)
693ALU1(LZD)
694ALU2(DP4)
695ALU2(DPH)
696ALU2(DP3)
697ALU2(DP2)
698ALU2(LINE)
699ALU2(PLN)
700
701
702ROUND(RNDZ)
703ROUND(RNDE)
704
705
706struct brw_instruction *brw_ADD(struct brw_compile *p,
707				struct brw_reg dest,
708				struct brw_reg src0,
709				struct brw_reg src1)
710{
711   /* 6.2.2: add */
712   if (src0.type == BRW_REGISTER_TYPE_F ||
713       (src0.file == BRW_IMMEDIATE_VALUE &&
714	src0.type == BRW_REGISTER_TYPE_VF)) {
715      assert(src1.type != BRW_REGISTER_TYPE_UD);
716      assert(src1.type != BRW_REGISTER_TYPE_D);
717   }
718
719   if (src1.type == BRW_REGISTER_TYPE_F ||
720       (src1.file == BRW_IMMEDIATE_VALUE &&
721	src1.type == BRW_REGISTER_TYPE_VF)) {
722      assert(src0.type != BRW_REGISTER_TYPE_UD);
723      assert(src0.type != BRW_REGISTER_TYPE_D);
724   }
725
726   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
727}
728
729struct brw_instruction *brw_MUL(struct brw_compile *p,
730				struct brw_reg dest,
731				struct brw_reg src0,
732				struct brw_reg src1)
733{
734   /* 6.32.38: mul */
735   if (src0.type == BRW_REGISTER_TYPE_D ||
736       src0.type == BRW_REGISTER_TYPE_UD ||
737       src1.type == BRW_REGISTER_TYPE_D ||
738       src1.type == BRW_REGISTER_TYPE_UD) {
739      assert(dest.type != BRW_REGISTER_TYPE_F);
740   }
741
742   if (src0.type == BRW_REGISTER_TYPE_F ||
743       (src0.file == BRW_IMMEDIATE_VALUE &&
744	src0.type == BRW_REGISTER_TYPE_VF)) {
745      assert(src1.type != BRW_REGISTER_TYPE_UD);
746      assert(src1.type != BRW_REGISTER_TYPE_D);
747   }
748
749   if (src1.type == BRW_REGISTER_TYPE_F ||
750       (src1.file == BRW_IMMEDIATE_VALUE &&
751	src1.type == BRW_REGISTER_TYPE_VF)) {
752      assert(src0.type != BRW_REGISTER_TYPE_UD);
753      assert(src0.type != BRW_REGISTER_TYPE_D);
754   }
755
756   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
757	  src0.nr != BRW_ARF_ACCUMULATOR);
758   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
759	  src1.nr != BRW_ARF_ACCUMULATOR);
760
761   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
762}
763
764
765void brw_NOP(struct brw_compile *p)
766{
767   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
768   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
769   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
770   brw_set_src1(insn, brw_imm_ud(0x0));
771}
772
773
774
775
776
777/***********************************************************************
778 * Comparisons, if/else/endif
779 */
780
781struct brw_instruction *brw_JMPI(struct brw_compile *p,
782                                 struct brw_reg dest,
783                                 struct brw_reg src0,
784                                 struct brw_reg src1)
785{
786   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
787
788   insn->header.execution_size = 1;
789   insn->header.compression_control = BRW_COMPRESSION_NONE;
790   insn->header.mask_control = BRW_MASK_DISABLE;
791
792   p->current->header.predicate_control = BRW_PREDICATE_NONE;
793
794   return insn;
795}
796
797/* EU takes the value from the flag register and pushes it onto some
798 * sort of a stack (presumably merging with any flag value already on
799 * the stack).  Within an if block, the flags at the top of the stack
800 * control execution on each channel of the unit, e.g. on each of the
801 * 16 pixel values in our wm programs.
802 *
803 * When the matching 'else' instruction is reached (presumably by
804 * countdown of the instruction count patched in by our ELSE/ENDIF
805 * functions), the relevant flags are inverted.
806 *
807 * When the matching 'endif' instruction is reached, the flags are
808 * popped off.  If the stack is now empty, normal execution resumes.
809 *
810 * No attempt is made to deal with stack overflow (14 elements?).
811 */
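
/*
 * Usage sketch (illustrative only; the GRF numbers are hypothetical): a
 * typical conditional as emitted by the code generators built on these
 * helpers.  The CMP writes the null register so only the flag result is
 * kept, and brw_CMP() leaves predication enabled for what follows.
 *
 *    struct brw_instruction *if_insn, *else_insn;
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
 *            brw_vec8_grf(2, 0), brw_imm_f(0.0f));
 *
 *    if_insn = brw_IF(p, BRW_EXECUTE_8);
 *       brw_MOV(p, brw_vec8_grf(4, 0), brw_imm_f(1.0f));
 *    else_insn = brw_ELSE(p, if_insn);
 *       brw_MOV(p, brw_vec8_grf(4, 0), brw_imm_f(0.0f));
 *    brw_ENDIF(p, else_insn);
 */
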
812struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
813{
814   struct intel_context *intel = &p->brw->intel;
815   struct brw_instruction *insn;
816
817   if (p->single_program_flow) {
818      assert(execute_size == BRW_EXECUTE_1);
819
820      insn = next_insn(p, BRW_OPCODE_ADD);
821      insn->header.predicate_inverse = 1;
822   } else {
823      insn = next_insn(p, BRW_OPCODE_IF);
824   }
825
826   /* Override the defaults for this instruction:
827    */
828   if (intel->gen < 6) {
829      brw_set_dest(insn, brw_ip_reg());
830      brw_set_src0(insn, brw_ip_reg());
831      brw_set_src1(insn, brw_imm_d(0x0));
832   } else {
833      brw_set_dest(insn, brw_imm_w(0));
834      insn->bits1.branch_gen6.jump_count = 0;
835      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
836      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
837   }
838
839   insn->header.execution_size = execute_size;
840   insn->header.compression_control = BRW_COMPRESSION_NONE;
841   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
842   insn->header.mask_control = BRW_MASK_ENABLE;
843   if (!p->single_program_flow)
844       insn->header.thread_control = BRW_THREAD_SWITCH;
845
846   p->current->header.predicate_control = BRW_PREDICATE_NONE;
847
848   return insn;
849}
850
851struct brw_instruction *
852brw_IF_gen6(struct brw_compile *p, uint32_t conditional,
853	    struct brw_reg src0, struct brw_reg src1)
854{
855   struct brw_instruction *insn;
856
857   insn = next_insn(p, BRW_OPCODE_IF);
858
859   brw_set_dest(insn, brw_imm_w(0));
860   insn->header.execution_size = BRW_EXECUTE_8;
861   insn->bits1.branch_gen6.jump_count = 0;
862   brw_set_src0(insn, src0);
863   brw_set_src1(insn, src1);
864
865   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
866   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
867   insn->header.destreg__conditionalmod = conditional;
868
869   if (!p->single_program_flow)
870       insn->header.thread_control = BRW_THREAD_SWITCH;
871
872   return insn;
873}
874
875struct brw_instruction *brw_ELSE(struct brw_compile *p,
876				 struct brw_instruction *if_insn)
877{
878   struct intel_context *intel = &p->brw->intel;
879   struct brw_instruction *insn;
880   GLuint br = 1;
881
882   /* jump count is in units of 64-bit data chunks, so one 128-bit
883      instruction requires 2 chunks. */
884   if (intel->gen >= 5)
885      br = 2;
886
887   if (p->single_program_flow) {
888      insn = next_insn(p, BRW_OPCODE_ADD);
889   } else {
890      insn = next_insn(p, BRW_OPCODE_ELSE);
891   }
892
893   if (intel->gen < 6) {
894      brw_set_dest(insn, brw_ip_reg());
895      brw_set_src0(insn, brw_ip_reg());
896      brw_set_src1(insn, brw_imm_d(0x0));
897   } else {
898      brw_set_dest(insn, brw_imm_w(0));
899      insn->bits1.branch_gen6.jump_count = 0;
900      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
901      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
902   }
903
904   insn->header.compression_control = BRW_COMPRESSION_NONE;
905   insn->header.execution_size = if_insn->header.execution_size;
906   insn->header.mask_control = BRW_MASK_ENABLE;
907   if (!p->single_program_flow)
908       insn->header.thread_control = BRW_THREAD_SWITCH;
909
910   /* Patch the if instruction to point at this instruction.
911    */
912   if (p->single_program_flow) {
913      assert(if_insn->header.opcode == BRW_OPCODE_ADD);
914
915      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
916   } else {
917      assert(if_insn->header.opcode == BRW_OPCODE_IF);
918
919      if (intel->gen < 6) {
920	 if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
921	 if_insn->bits3.if_else.pop_count = 0;
922	 if_insn->bits3.if_else.pad0 = 0;
923      } else {
924	 if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1);
925      }
926   }
927
928   return insn;
929}
930
931void brw_ENDIF(struct brw_compile *p,
932	       struct brw_instruction *patch_insn)
933{
934   struct intel_context *intel = &p->brw->intel;
935   GLuint br = 1;
936
937   if (intel->gen >= 5)
938      br = 2;
939
940   if (p->single_program_flow) {
941      /* In single program flow mode, there's no need to execute an ENDIF,
942       * since we don't need to do any stack operations, and if we're executing
943       * currently, we want to just continue executing.
944       */
945      struct brw_instruction *next = &p->store[p->nr_insn];
946
947      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
948
949      patch_insn->bits3.ud = (next - patch_insn) * 16;
950   } else {
951      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
952
953      if (intel->gen < 6) {
954	 brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
955	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
956	 brw_set_src1(insn, brw_imm_d(0x0));
957      } else {
958	 brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_W));
959	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
960	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
961      }
962
963      insn->header.compression_control = BRW_COMPRESSION_NONE;
964      insn->header.execution_size = patch_insn->header.execution_size;
965      insn->header.mask_control = BRW_MASK_ENABLE;
966      insn->header.thread_control = BRW_THREAD_SWITCH;
967
968      if (intel->gen < 6)
969	 assert(patch_insn->bits3.if_else.jump_count == 0);
970      else
971	 assert(patch_insn->bits1.branch_gen6.jump_count == 0);
972
973      /* Patch the if or else instructions to point at this or the next
974       * instruction respectively.
975       */
976      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
977	 if (intel->gen < 6) {
978	    /* Turn it into an IFF, which means no mask stack operations for
979	     * all-false and jumping past the ENDIF.
980	     */
981	    patch_insn->header.opcode = BRW_OPCODE_IFF;
982	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
983	    patch_insn->bits3.if_else.pop_count = 0;
984	    patch_insn->bits3.if_else.pad0 = 0;
985	 } else {
986	    /* As of gen6, there is no IFF and IF must point to the ENDIF. */
987	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
988	 }
989      } else {
990	 assert(patch_insn->header.opcode == BRW_OPCODE_ELSE);
991	 if (intel->gen < 6) {
992	    /* BRW_OPCODE_ELSE pre-gen6 should point just past the
993	     * matching ENDIF.
994	     */
995	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
996	    patch_insn->bits3.if_else.pop_count = 1;
997	    patch_insn->bits3.if_else.pad0 = 0;
998	 } else {
999	    /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1000	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1001	 }
1002      }
1003
1004      /* Also pop item off the stack in the endif instruction:
1005       */
1006      if (intel->gen < 6) {
1007	 insn->bits3.if_else.jump_count = 0;
1008	 insn->bits3.if_else.pop_count = 1;
1009	 insn->bits3.if_else.pad0 = 0;
1010      } else {
1011	 insn->bits1.branch_gen6.jump_count = 2;
1012      }
1013   }
1014}
1015
1016struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1017{
1018   struct brw_instruction *insn;
1019   insn = next_insn(p, BRW_OPCODE_BREAK);
1020   brw_set_dest(insn, brw_ip_reg());
1021   brw_set_src0(insn, brw_ip_reg());
1022   brw_set_src1(insn, brw_imm_d(0x0));
1023   insn->header.compression_control = BRW_COMPRESSION_NONE;
1024   insn->header.execution_size = BRW_EXECUTE_8;
1025   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1026   insn->bits3.if_else.pad0 = 0;
1027   insn->bits3.if_else.pop_count = pop_count;
1028   return insn;
1029}
1030
1031struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1032{
1033   struct brw_instruction *insn;
1034   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1035   brw_set_dest(insn, brw_ip_reg());
1036   brw_set_src0(insn, brw_ip_reg());
1037   brw_set_src1(insn, brw_imm_d(0x0));
1038   insn->header.compression_control = BRW_COMPRESSION_NONE;
1039   insn->header.execution_size = BRW_EXECUTE_8;
1040   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1041   insn->bits3.if_else.pad0 = 0;
1042   insn->bits3.if_else.pop_count = pop_count;
1043   return insn;
1044}
1045
1046/* DO/WHILE loop:
1047 */
1048struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1049{
1050   if (p->single_program_flow) {
1051      return &p->store[p->nr_insn];
1052   } else {
1053      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1054
1055      /* Override the defaults for this instruction:
1056       */
1057      brw_set_dest(insn, brw_null_reg());
1058      brw_set_src0(insn, brw_null_reg());
1059      brw_set_src1(insn, brw_null_reg());
1060
1061      insn->header.compression_control = BRW_COMPRESSION_NONE;
1062      insn->header.execution_size = execute_size;
1063      insn->header.predicate_control = BRW_PREDICATE_NONE;
1064      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1065      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1066
1067      return insn;
1068   }
1069}
1070
1071
1072
1073struct brw_instruction *brw_WHILE(struct brw_compile *p,
1074                                  struct brw_instruction *do_insn)
1075{
1076   struct intel_context *intel = &p->brw->intel;
1077   struct brw_instruction *insn;
1078   GLuint br = 1;
1079
1080   if (intel->gen >= 5)
1081      br = 2;
1082
1083   if (p->single_program_flow)
1084      insn = next_insn(p, BRW_OPCODE_ADD);
1085   else
1086      insn = next_insn(p, BRW_OPCODE_WHILE);
1087
1088   brw_set_dest(insn, brw_ip_reg());
1089   brw_set_src0(insn, brw_ip_reg());
1090   brw_set_src1(insn, brw_imm_d(0x0));
1091
1092   insn->header.compression_control = BRW_COMPRESSION_NONE;
1093
1094   if (p->single_program_flow) {
1095      insn->header.execution_size = BRW_EXECUTE_1;
1096
1097      insn->bits3.d = (do_insn - insn) * 16;
1098   } else {
1099      insn->header.execution_size = do_insn->header.execution_size;
1100
1101      assert(do_insn->header.opcode == BRW_OPCODE_DO);
1102      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1103      insn->bits3.if_else.pop_count = 0;
1104      insn->bits3.if_else.pad0 = 0;
1105   }
1106
1107/*    insn->header.mask_control = BRW_MASK_ENABLE; */
1108
1109   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1110   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1111   return insn;
1112}
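
/*
 * Loop sketch (illustrative; not part of this file).  brw_DO() marks the
 * loop head (in single-program-flow mode it emits nothing and just returns
 * a pointer to the next instruction slot), and brw_WHILE() branches back
 * to it:
 *
 *    struct brw_instruction *do_insn;
 *
 *    do_insn = brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body; a predicated brw_BREAK(p, 0) or brw_CONT(p, 0)
 *       emitted here leaves or restarts the loop per channel ...
 *    brw_WHILE(p, do_insn);
 */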
1113
1114
1115/* FORWARD JUMPS:
1116 */
1117void brw_land_fwd_jump(struct brw_compile *p,
1118		       struct brw_instruction *jmp_insn)
1119{
1120   struct intel_context *intel = &p->brw->intel;
1121   struct brw_instruction *landing = &p->store[p->nr_insn];
1122   GLuint jmpi = 1;
1123
1124   if (intel->gen >= 5)
1125       jmpi = 2;
1126
1127   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1128   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1129
1130   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1131}
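
/*
 * Forward-jump sketch (illustrative).  A JMPI is emitted with an immediate
 * placeholder distance of zero, the instructions to be skipped follow, and
 * brw_land_fwd_jump() then patches the distance so the jump lands at the
 * current end of the instruction stream:
 *
 *    struct brw_instruction *jmp;
 *
 *    jmp = brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    ... instructions to be skipped when the jump is taken ...
 *    brw_land_fwd_jump(p, jmp);
 */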
1132
1133
1134
1135/* To integrate with the above, it makes sense that the comparison
1136 * instruction should populate the flag register.  It might be simpler
1137 * just to use the flag reg for most WM tasks?
1138 */
1139void brw_CMP(struct brw_compile *p,
1140	     struct brw_reg dest,
1141	     GLuint conditional,
1142	     struct brw_reg src0,
1143	     struct brw_reg src1)
1144{
1145   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1146
1147   insn->header.destreg__conditionalmod = conditional;
1148   brw_set_dest(insn, dest);
1149   brw_set_src0(insn, src0);
1150   brw_set_src1(insn, src1);
1151
1152/*    guess_execution_size(insn, src0); */
1153
1154
1155   /* Make it so that future instructions will use the computed flag
1156    * value until brw_set_predicate_control_flag_value() is called
1157    * again.
1158    */
1159   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1160       dest.nr == 0) {
1161      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1162      p->flag_value = 0xff;
1163   }
1164}
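
/*
 * Sketch (illustrative; GRF numbers hypothetical): comparing into the flag
 * register and predicating a following SEL on the result.  Because the CMP
 * destination is the null register, brw_CMP() above also enables normal
 * predication for subsequent instructions, so the SEL picks src0 on
 * channels where the comparison passed and src1 elsewhere:
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L,
 *            brw_vec8_grf(2, 0), brw_vec8_grf(3, 0));
 *    brw_SEL(p, brw_vec8_grf(4, 0), brw_vec8_grf(2, 0), brw_vec8_grf(3, 0));
 */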
1165
1166/* Issue a 'wait' instruction on notification register n1; the host can
1167   program MMIO to wake the thread up. */
1168void brw_WAIT (struct brw_compile *p)
1169{
1170   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1171   struct brw_reg src = brw_notification_1_reg();
1172
1173   brw_set_dest(insn, src);
1174   brw_set_src0(insn, src);
1175   brw_set_src1(insn, brw_null_reg());
1176   insn->header.execution_size = 0; /* must */
1177   insn->header.predicate_control = 0;
1178   insn->header.compression_control = 0;
1179}
1180
1181
1182/***********************************************************************
1183 * Helpers for the various SEND message types:
1184 */
1185
1186/** Extended math function, float[8].
1187 */
1188void brw_math( struct brw_compile *p,
1189	       struct brw_reg dest,
1190	       GLuint function,
1191	       GLuint saturate,
1192	       GLuint msg_reg_nr,
1193	       struct brw_reg src,
1194	       GLuint data_type,
1195	       GLuint precision )
1196{
1197   struct intel_context *intel = &p->brw->intel;
1198
1199   if (intel->gen >= 6) {
1200      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1201
1202      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1203      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1204
1205      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1206      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1207
1208      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1209	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1210	 assert(src.type == BRW_REGISTER_TYPE_F);
1211      }
1212
1213      /* Math is the same ISA format as other opcodes, except that CondModifier
1214       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1215       */
1216      insn->header.destreg__conditionalmod = function;
1217
1218      brw_set_dest(insn, dest);
1219      brw_set_src0(insn, src);
1220      brw_set_src1(insn, brw_null_reg());
1221   } else {
1222      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1223      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1224      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1225      /* Example code doesn't set predicate_control for send
1226       * instructions.
1227       */
1228      insn->header.predicate_control = 0;
1229      insn->header.destreg__conditionalmod = msg_reg_nr;
1230
1231      brw_set_dest(insn, dest);
1232      brw_set_src0(insn, src);
1233      brw_set_math_message(p->brw,
1234			   insn,
1235			   msg_length, response_length,
1236			   function,
1237			   BRW_MATH_INTEGER_UNSIGNED,
1238			   precision,
1239			   saturate,
1240			   data_type);
1241   }
1242}
1243
1244/** Extended math function, float[8].
1245 */
1246void brw_math2(struct brw_compile *p,
1247	       struct brw_reg dest,
1248	       GLuint function,
1249	       struct brw_reg src0,
1250	       struct brw_reg src1)
1251{
1252   struct intel_context *intel = &p->brw->intel;
1253   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1254
1255   assert(intel->gen >= 6);
1256   (void) intel;
1257
1258
1259   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1260   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1261   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1262
1263   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1264   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1265   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1266
1267   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1268       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1269      assert(src0.type == BRW_REGISTER_TYPE_F);
1270      assert(src1.type == BRW_REGISTER_TYPE_F);
1271   }
1272
1273   /* Math is the same ISA format as other opcodes, except that CondModifier
1274    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1275    */
1276   insn->header.destreg__conditionalmod = function;
1277
1278   brw_set_dest(insn, dest);
1279   brw_set_src0(insn, src0);
1280   brw_set_src1(insn, src1);
1281}
1282
1283/**
1284 * Extended math function, float[16].
1285 * Use 2 send instructions.
1286 */
1287void brw_math_16( struct brw_compile *p,
1288		  struct brw_reg dest,
1289		  GLuint function,
1290		  GLuint saturate,
1291		  GLuint msg_reg_nr,
1292		  struct brw_reg src,
1293		  GLuint precision )
1294{
1295   struct intel_context *intel = &p->brw->intel;
1296   struct brw_instruction *insn;
1297   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1298   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1299
1300   if (intel->gen >= 6) {
1301      insn = next_insn(p, BRW_OPCODE_MATH);
1302
1303      /* Math is the same ISA format as other opcodes, except that CondModifier
1304       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1305       */
1306      insn->header.destreg__conditionalmod = function;
1307
1308      brw_set_dest(insn, dest);
1309      brw_set_src0(insn, src);
1310      brw_set_src1(insn, brw_null_reg());
1311      return;
1312   }
1313
1314   /* First instruction:
1315    */
1316   brw_push_insn_state(p);
1317   brw_set_predicate_control_flag_value(p, 0xff);
1318   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1319
1320   insn = next_insn(p, BRW_OPCODE_SEND);
1321   insn->header.destreg__conditionalmod = msg_reg_nr;
1322
1323   brw_set_dest(insn, dest);
1324   brw_set_src0(insn, src);
1325   brw_set_math_message(p->brw,
1326			insn,
1327			msg_length, response_length,
1328			function,
1329			BRW_MATH_INTEGER_UNSIGNED,
1330			precision,
1331			saturate,
1332			BRW_MATH_DATA_VECTOR);
1333
1334   /* Second instruction:
1335    */
1336   insn = next_insn(p, BRW_OPCODE_SEND);
1337   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1338   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1339
1340   brw_set_dest(insn, offset(dest,1));
1341   brw_set_src0(insn, src);
1342   brw_set_math_message(p->brw,
1343			insn,
1344			msg_length, response_length,
1345			function,
1346			BRW_MATH_INTEGER_UNSIGNED,
1347			precision,
1348			saturate,
1349			BRW_MATH_DATA_VECTOR);
1350
1351   brw_pop_insn_state(p);
1352}
1353
1354
1355/**
1356 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1357 * using a constant offset per channel.
1358 *
1359 * The offset must be aligned to oword size (16 bytes).  Used for
1360 * register spilling.
1361 */
1362void brw_oword_block_write(struct brw_compile *p,
1363			   struct brw_reg mrf,
1364			   int num_regs,
1365			   GLuint offset)
1366{
1367   struct intel_context *intel = &p->brw->intel;
1368   uint32_t msg_control;
1369   int mlen;
1370
1371   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1372
1373   if (num_regs == 1) {
1374      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1375      mlen = 2;
1376   } else {
1377      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1378      mlen = 3;
1379   }
1380
1381   /* Set up the message header.  This is g0, with g0.2 filled with
1382    * the offset.  We don't want to leave our offset around in g0 or
1383    * it'll screw up texture samples, so set it up inside the message
1384    * reg.
1385    */
1386   {
1387      brw_push_insn_state(p);
1388      brw_set_mask_control(p, BRW_MASK_DISABLE);
1389      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1390
1391      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1392
1393      /* set message header global offset field (reg 0, element 2) */
1394      brw_MOV(p,
1395	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1396				  mrf.nr,
1397				  2), BRW_REGISTER_TYPE_UD),
1398	      brw_imm_ud(offset));
1399
1400      brw_pop_insn_state(p);
1401   }
1402
1403   {
1404      struct brw_reg dest;
1405      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1406      int send_commit_msg;
1407      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1408					 BRW_REGISTER_TYPE_UW);
1409
1410      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1411	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1412	 src_header = vec16(src_header);
1413      }
1414      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1415      insn->header.destreg__conditionalmod = mrf.nr;
1416
1417      /* Until gen6, writes followed by reads from the same location
1418       * are not guaranteed to be ordered unless write_commit is set.
1419       * If set, then a no-op write is issued to the destination
1420       * register to set a dependency, and a read from the destination
1421       * can be used to ensure the ordering.
1422       *
1423       * For gen6, only writes between different threads need ordering
1424       * protection.  Our use of DP writes is all about register
1425       * spilling within a thread.
1426       */
1427      if (intel->gen >= 6) {
1428	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1429	 send_commit_msg = 0;
1430      } else {
1431	 dest = src_header;
1432	 send_commit_msg = 1;
1433      }
1434
1435      brw_set_dest(insn, dest);
1436      brw_set_src0(insn, brw_null_reg());
1437
1438      brw_set_dp_write_message(p->brw,
1439			       insn,
1440			       255, /* binding table index (255=stateless) */
1441			       msg_control,
1442			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
1443			       mlen,
1444			       GL_TRUE, /* header_present */
1445			       0, /* pixel scoreboard */
1446			       send_commit_msg, /* response_length */
1447			       0, /* eot */
1448			       send_commit_msg);
1449   }
1450}
1451
1452
1453/**
1454 * Read a block of owords (half a GRF each) from the scratch buffer
1455 * using a constant index per channel.
1456 *
1457 * Offset must be aligned to oword size (16 bytes).  Used for register
1458 * spilling.
1459 */
1460void
1461brw_oword_block_read(struct brw_compile *p,
1462		     struct brw_reg dest,
1463		     struct brw_reg mrf,
1464		     int num_regs,
1465		     GLuint offset)
1466{
1467   uint32_t msg_control;
1468   int rlen;
1469
1470   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1471   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1472
1473   if (num_regs == 1) {
1474      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1475      rlen = 1;
1476   } else {
1477      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1478      rlen = 2;
1479   }
1480
1481   {
1482      brw_push_insn_state(p);
1483      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1484      brw_set_mask_control(p, BRW_MASK_DISABLE);
1485
1486      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1487
1488      /* set message header global offset field (reg 0, element 2) */
1489      brw_MOV(p,
1490	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1491				  mrf.nr,
1492				  2), BRW_REGISTER_TYPE_UD),
1493	      brw_imm_ud(offset));
1494
1495      brw_pop_insn_state(p);
1496   }
1497
1498   {
1499      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1500
1501      assert(insn->header.predicate_control == 0);
1502      insn->header.compression_control = BRW_COMPRESSION_NONE;
1503      insn->header.destreg__conditionalmod = mrf.nr;
1504
1505      brw_set_dest(insn, dest);	/* UW? */
1506      brw_set_src0(insn, brw_null_reg());
1507
1508      brw_set_dp_read_message(p->brw,
1509			      insn,
1510			      255, /* binding table index (255=stateless) */
1511			      msg_control,
1512			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1513			      1, /* target cache (render/scratch) */
1514			      1, /* msg_length */
1515			      rlen,
1516			      0); /* eot */
1517   }
1518}
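
/*
 * Spill/unspill sketch (illustrative; the MRF number, GRF number, and
 * scratch offset are hypothetical).  One GRF is written to the stateless
 * scratch space at a 16-byte-aligned offset and later read back into a
 * different GRF:
 *
 *    brw_oword_block_write(p, brw_message_reg(1), 1, 32);
 *    ...
 *    brw_oword_block_read(p, brw_vec8_grf(10, 0), brw_message_reg(1), 1, 32);
 */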
1519
1520
1521/**
1522 * Read a float[4] vector from the data port Data Cache (const buffer).
1523 * Location (in buffer) should be a multiple of 16.
1524 * Used for fetching shader constants.
1525 * If relAddr is true, we'll do an indirect fetch using the address register.
1526 */
1527void brw_dp_READ_4( struct brw_compile *p,
1528                    struct brw_reg dest,
1529                    GLboolean relAddr,
1530                    GLuint location,
1531                    GLuint bind_table_index )
1532{
1533   /* XXX: relAddr not implemented */
1534   GLuint msg_reg_nr = 1;
1535   {
1536      struct brw_reg b;
1537      brw_push_insn_state(p);
1538      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1539      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1540      brw_set_mask_control(p, BRW_MASK_DISABLE);
1541
1542   /* Setup MRF[1] with location/offset into const buffer */
1543      b = brw_message_reg(msg_reg_nr);
1544      b = retype(b, BRW_REGISTER_TYPE_UD);
1545      /* XXX I think we're setting all the dwords of MRF[1] to 'location',
1546       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
1547       */
1548      brw_MOV(p, b, brw_imm_ud(location));
1549      brw_pop_insn_state(p);
1550   }
1551
1552   {
1553      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1554
1555      insn->header.predicate_control = BRW_PREDICATE_NONE;
1556      insn->header.compression_control = BRW_COMPRESSION_NONE;
1557      insn->header.destreg__conditionalmod = msg_reg_nr;
1558      insn->header.mask_control = BRW_MASK_DISABLE;
1559
1560      /* cast dest to a uword[8] vector */
1561      dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1562
1563      brw_set_dest(insn, dest);
1564      brw_set_src0(insn, brw_null_reg());
1565
1566      brw_set_dp_read_message(p->brw,
1567			      insn,
1568			      bind_table_index,
1569			      0,  /* msg_control (0 means 1 Oword) */
1570			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1571			      0, /* source cache = data cache */
1572			      1, /* msg_length */
1573			      1, /* response_length (1 Oword) */
1574			      0); /* eot */
1575   }
1576}
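
/*
 * Sketch (illustrative; the binding-table index and buffer location are
 * hypothetical): fetch one float[4] constant into a GRF from the constant
 * buffer bound at binding-table index 1, at a location that must be a
 * multiple of 16:
 *
 *    brw_dp_READ_4(p, brw_vec8_grf(6, 0), GL_FALSE, 64, 1);
 */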
1577
1578
1579/**
1580 * Read float[4] constant(s) from VS constant buffer.
1581 * For relative addressing, two float[4] constants will be read into 'dest'.
1582 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1583 */
1584void brw_dp_READ_4_vs(struct brw_compile *p,
1585                      struct brw_reg dest,
1586                      GLuint location,
1587                      GLuint bind_table_index)
1588{
1589   struct brw_instruction *insn;
1590   GLuint msg_reg_nr = 1;
1591   struct brw_reg b;
1592
1593   /*
1594   printf("vs const read msg, location %u, msg_reg_nr %d\n",
1595          location, msg_reg_nr);
1596   */
1597
1598   /* Setup MRF[1] with location/offset into const buffer */
1599   brw_push_insn_state(p);
1600   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1601   brw_set_mask_control(p, BRW_MASK_DISABLE);
1602   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1603
1604   /* XXX We appear to be setting all the dwords of MRF[1] to 'location',
1605    * even though the docs say only dword[2] should be set.  But it works.
1606    */
1607   b = brw_message_reg(msg_reg_nr);
1608   b = retype(b, BRW_REGISTER_TYPE_UD);
1609   /*b = get_element_ud(b, 2);*/
1610   brw_MOV(p, b, brw_imm_ud(location));
1611
1612   brw_pop_insn_state(p);
1613
1614   insn = next_insn(p, BRW_OPCODE_SEND);
1615
1616   insn->header.predicate_control = BRW_PREDICATE_NONE;
1617   insn->header.compression_control = BRW_COMPRESSION_NONE;
1618   insn->header.destreg__conditionalmod = msg_reg_nr;
1619   insn->header.mask_control = BRW_MASK_DISABLE;
1620
1621   brw_set_dest(insn, dest);
1622   brw_set_src0(insn, brw_null_reg());
1623
1624   brw_set_dp_read_message(p->brw,
1625			   insn,
1626			   bind_table_index,
1627			   0,
1628			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1629			   0, /* source cache = data cache */
1630			   1, /* msg_length */
1631			   1, /* response_length (1 Oword) */
1632			   0); /* eot */
1633}
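
/* Usage sketch (illustrative only; kept out of the build): read the float[4]
 * VS constant at byte offset 'loc' into the lower half of g2.  The surface
 * slot VS_CONST_SURF is a placeholder.
 */
#if 0
static void
example_fetch_vs_constant(struct brw_compile *p, GLuint loc)
{
   brw_dp_READ_4_vs(p,
                    brw_vec8_grf(2, 0),   /* dest: g2 */
                    loc,                  /* location within the const buffer */
                    VS_CONST_SURF);       /* bind_table_index: placeholder slot */
}
#endif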
1634
1635/**
1636 * Read a float[4] constant per vertex from VS constant buffer, with
1637 * relative addressing.
1638 */
1639void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1640			       struct brw_reg dest,
1641			       struct brw_reg addr_reg,
1642			       GLuint offset,
1643			       GLuint bind_table_index)
1644{
1645   struct intel_context *intel = &p->brw->intel;
1646   int msg_type;
1647
1648   /* Set up MRF[1] with the offset into the const buffer */
1649   brw_push_insn_state(p);
1650   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1651   brw_set_mask_control(p, BRW_MASK_DISABLE);
1652   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1653
1654   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1655    * fields ignored.
1656    */
1657   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
1658	   addr_reg, brw_imm_d(offset));
1659   brw_pop_insn_state(p);
1660
1661   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1662
1663   insn->header.predicate_control = BRW_PREDICATE_NONE;
1664   insn->header.compression_control = BRW_COMPRESSION_NONE;
1665   insn->header.destreg__conditionalmod = 0;
1666   insn->header.mask_control = BRW_MASK_DISABLE;
1667
1668   brw_set_dest(insn, dest);
1669   brw_set_src0(insn, brw_vec8_grf(0, 0));
1670
1671   if (intel->gen == 6)
1672      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1673   else if (intel->gen == 5 || intel->is_g4x)
1674      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1675   else
1676      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1677
1678   brw_set_dp_read_message(p->brw,
1679			   insn,
1680			   bind_table_index,
1681			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1682			   msg_type,
1683			   0, /* source cache = data cache */
1684			   2, /* msg_length */
1685			   1, /* response_length */
1686			   0); /* eot */
1687}
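
/* Usage sketch (illustrative only; kept out of the build): per-vertex
 * relative fetch.  'addr' supplies the per-vertex block offsets copied into
 * M1.0/M1.4 as described above; VS_CONST_SURF and the offset value are
 * placeholders.
 */
#if 0
static void
example_fetch_vs_constant_relative(struct brw_compile *p, struct brw_reg addr)
{
   brw_dp_READ_4_vs_relative(p,
                             brw_vec8_grf(3, 0),  /* dest: g3 */
                             addr,                /* addr_reg: block offsets */
                             0,                   /* offset added to addr_reg */
                             VS_CONST_SURF);      /* bind_table_index: placeholder */
}
#endif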
1688
1689
1690
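/**
 * Emit a render target write: the data port message a fragment shader uses
 * to output its colors.  On gen6+ the payload is taken from src0 (pointed
 * here at MRF msg_reg_nr) and the message header is omitted when
 * msg_length == 4; on earlier gens the MRF number goes in the instruction's
 * destreg field and the SEND's implied move supplies the payload.
 */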
1691void brw_fb_WRITE(struct brw_compile *p,
1692		  int dispatch_width,
1693                  struct brw_reg dest,
1694                  GLuint msg_reg_nr,
1695                  struct brw_reg src0,
1696                  GLuint binding_table_index,
1697                  GLuint msg_length,
1698                  GLuint response_length,
1699                  GLboolean eot)
1700{
1701   struct intel_context *intel = &p->brw->intel;
1702   struct brw_instruction *insn;
1703   GLuint msg_control, msg_type;
1704   GLboolean header_present = GL_TRUE;
1705
1706   insn = next_insn(p, BRW_OPCODE_SEND);
1707   insn->header.predicate_control = 0; /* XXX */
1708   insn->header.compression_control = BRW_COMPRESSION_NONE;
1709
1710   if (intel->gen >= 6) {
1711      if (msg_length == 4)
1712	 header_present = GL_FALSE;
1713
1714      /* headerless version, just submit color payload */
1715      src0 = brw_message_reg(msg_reg_nr);
1716
1717      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6;
1718   } else {
1719      insn->header.destreg__conditionalmod = msg_reg_nr;
1720
1721      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1722   }
1723
1724   if (dispatch_width == 16)
1725      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1726   else
1727      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1728
1729   brw_set_dest(insn, dest);
1730   brw_set_src0(insn, src0);
1731   brw_set_dp_write_message(p->brw,
1732			    insn,
1733			    binding_table_index,
1734			    msg_control,
1735			    msg_type,
1736			    msg_length,
1737			    header_present,
1738			    1,	/* pixel scoreboard */
1739			    response_length,
1740			    eot,
1741			    0 /* send_commit_msg */);
1742}
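
/* Usage sketch (illustrative only; kept out of the build): a SIMD8 render
 * target write with a 2-register header plus 4 color registers starting at
 * MRF 2, terminating the thread.  RT_SURF and the payload layout are
 * placeholders.
 */
#if 0
static void
example_fb_write_simd8(struct brw_compile *p)
{
   brw_fb_WRITE(p,
                8,                     /* dispatch_width */
                retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), /* dest: no response */
                2,                     /* msg_reg_nr */
                brw_message_reg(2),    /* src0 */
                RT_SURF,               /* binding_table_index: placeholder slot */
                6,                     /* msg_length: header (2) + color (4) */
                0,                     /* response_length */
                GL_TRUE);              /* eot */
}
#endif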
1743
1744
1745/**
1746 * Texture sample instruction.
1747 * Note: the msg_type plus msg_length values determine exactly what kind
1748 * of sampling operation is performed.  See volume 4, page 161 of docs.
1749 */
1750void brw_SAMPLE(struct brw_compile *p,
1751		struct brw_reg dest,
1752		GLuint msg_reg_nr,
1753		struct brw_reg src0,
1754		GLuint binding_table_index,
1755		GLuint sampler,
1756		GLuint writemask,
1757		GLuint msg_type,
1758		GLuint response_length,
1759		GLuint msg_length,
1760		GLboolean eot,
1761		GLuint header_present,
1762		GLuint simd_mode)
1763{
1764   struct intel_context *intel = &p->brw->intel;
1765   GLboolean need_stall = 0;
1766
1767   if (writemask == 0) {
1768      /*printf("%s: zero writemask??\n", __FUNCTION__); */
1769      return;
1770   }
1771
1772   /* The hardware doesn't properly check destination dependencies on
1773    * send instructions, so add a workaround that creates the
1774    * dependency by other means.  In practice this bug only seems to
1775    * crop up for texture samples, and only where registers are
1776    * written by the send and then written again later without being
1777    * read in between.  Luckily, we already track that information
1778    * and use it to modify the writemask for the instruction, so the
1779    * writemask itself serves as a guide to whether the workaround is
1780    * needed.
1781    */
1782   if (writemask != WRITEMASK_XYZW) {
1783      GLuint dst_offset = 0;
1784      GLuint i, newmask = 0, len = 0;
1785
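      /* Skip over any leading disabled channels (each one pushes the
       * destination down by two registers), then collect the contiguous
       * run of enabled channels into newmask/len.
       */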
1786      for (i = 0; i < 4; i++) {
1787	 if (writemask & (1<<i))
1788	    break;
1789	 dst_offset += 2;
1790      }
1791      for (; i < 4; i++) {
1792	 if (!(writemask & (1<<i)))
1793	    break;
1794	 newmask |= 1<<i;
1795	 len++;
1796      }
1797
1798      if (newmask != writemask) {
1799	 need_stall = 1;
1800         /* printf("need stall %x %x\n", newmask , writemask); */
1801      }
1802      else {
1803	 GLboolean dispatch_16 = GL_FALSE;
1804
1805	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
1806
1807	 guess_execution_size(p->current, dest);
1808	 if (p->current->header.execution_size == BRW_EXECUTE_16)
1809	    dispatch_16 = GL_TRUE;
1810
1811	 newmask = ~newmask & WRITEMASK_XYZW;
1812
1813	 brw_push_insn_state(p);
1814
1815	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1816	 brw_set_mask_control(p, BRW_MASK_DISABLE);
1817
1818	 brw_MOV(p, m1, brw_vec8_grf(0,0));
1819  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
1820
1821	 brw_pop_insn_state(p);
1822
1823  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
1824	 dest = offset(dest, dst_offset);
1825
1826	 /* For 16-wide dispatch, masked channels are skipped in the
1827	  * response.  For 8-wide, masked channels still take up slots,
1828	  * and are just not written to.
1829	  */
1830	 if (dispatch_16)
1831	    response_length = len * 2;
1832      }
1833   }
1834
1835   {
1836      struct brw_instruction *insn;
1837
1838      /* Sandybridge doesn't have the implied move for SENDs,
1839       * and the first message register index comes from src0.
1840       */
1841      if (intel->gen >= 6) {
1842	  brw_push_insn_state(p);
1843	  brw_set_mask_control( p, BRW_MASK_DISABLE );
1844	  /* m1 contains header? */
1845	  brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
1846	  brw_pop_insn_state(p);
1847	  src0 = brw_message_reg(msg_reg_nr);
1848      }
1849
1850      insn = next_insn(p, BRW_OPCODE_SEND);
1851      insn->header.predicate_control = 0; /* XXX */
1852      insn->header.compression_control = BRW_COMPRESSION_NONE;
1853      if (intel->gen < 6)
1854	  insn->header.destreg__conditionalmod = msg_reg_nr;
1855
1856      brw_set_dest(insn, dest);
1857      brw_set_src0(insn, src0);
1858      brw_set_sampler_message(p->brw, insn,
1859			      binding_table_index,
1860			      sampler,
1861			      msg_type,
1862			      response_length,
1863			      msg_length,
1864			      eot,
1865			      header_present,
1866			      simd_mode);
1867   }
1868
1869   if (need_stall) {
1870      struct brw_reg reg = vec8(offset(dest, response_length-1));
1871
1872      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
1873       */
1874      brw_push_insn_state(p);
1875      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1876      brw_MOV(p, reg, reg);
1877      brw_pop_insn_state(p);
1878   }
1879
1880}
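
/* Usage sketch (illustrative only; kept out of the build): a SIMD8 texture
 * sample writing all four channels to g14..g17, with the payload already
 * assembled starting at MRF 2.  surf/sampler and the msg_type/simd_mode
 * enums (from brw_defines.h, per generation) are left to the caller.
 */
#if 0
static void
example_tex_sample_simd8(struct brw_compile *p, GLuint surf, GLuint sampler,
                         GLuint msg_type, GLuint simd_mode)
{
   brw_SAMPLE(p,
              retype(brw_vec8_grf(14, 0), BRW_REGISTER_TYPE_UW), /* dest */
              2,                    /* msg_reg_nr */
              brw_message_reg(2),   /* src0: message payload */
              surf,                 /* binding_table_index */
              sampler,              /* sampler state index */
              WRITEMASK_XYZW,       /* writemask: all channels */
              msg_type,             /* e.g. the SIMD8 "sample" message */
              4,                    /* response_length: one reg per channel */
              3,                    /* msg_length: header + coordinates (example) */
              GL_FALSE,             /* eot */
              1,                    /* header_present */
              simd_mode);           /* SIMD8 mode for this example */
}
#endif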
1881
1882/* All these variables are pretty confusing - we might be better off
1883 * using bitmasks and macros for this, in the old style.  Or perhaps
1884 * just having the caller instantiate the fields in dword3 itself.
1885 */
1886void brw_urb_WRITE(struct brw_compile *p,
1887		   struct brw_reg dest,
1888		   GLuint msg_reg_nr,
1889		   struct brw_reg src0,
1890		   GLboolean allocate,
1891		   GLboolean used,
1892		   GLuint msg_length,
1893		   GLuint response_length,
1894		   GLboolean eot,
1895		   GLboolean writes_complete,
1896		   GLuint offset,
1897		   GLuint swizzle)
1898{
1899   struct intel_context *intel = &p->brw->intel;
1900   struct brw_instruction *insn;
1901
1902   /* Sandybridge doesn't have the implied move for SENDs,
1903    * and the first message register index comes from src0.
1904    */
1905   if (intel->gen >= 6) {
1906      brw_push_insn_state(p);
1907      brw_set_mask_control( p, BRW_MASK_DISABLE );
1908      brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
1909      brw_pop_insn_state(p);
1910      src0 = brw_message_reg(msg_reg_nr);
1911   }
1912
1913   insn = next_insn(p, BRW_OPCODE_SEND);
1914
1915   assert(msg_length < BRW_MAX_MRF);
1916
1917   brw_set_dest(insn, dest);
1918   brw_set_src0(insn, src0);
1919   brw_set_src1(insn, brw_imm_d(0));
1920
1921   if (intel->gen < 6)
1922      insn->header.destreg__conditionalmod = msg_reg_nr;
1923
1924   brw_set_urb_message(p->brw,
1925		       insn,
1926		       allocate,
1927		       used,
1928		       msg_length,
1929		       response_length,
1930		       eot,
1931		       writes_complete,
1932		       offset,
1933		       swizzle);
1934}
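
/* Usage sketch (illustrative only; kept out of the build): a final,
 * thread-terminating URB write of 'len' message registers starting at
 * MRF 0, with no response expected.  The swizzle enum comes from
 * brw_defines.h and is left to the caller.
 */
#if 0
static void
example_final_urb_write(struct brw_compile *p, GLuint len, GLuint swizzle)
{
   brw_urb_WRITE(p,
                 brw_null_reg(),      /* dest: no response expected */
                 0,                   /* msg_reg_nr */
                 brw_vec8_grf(0, 0),  /* src0: r0 header (implied move pre-gen6) */
                 GL_FALSE,            /* allocate */
                 GL_TRUE,             /* used */
                 len,                 /* msg_length (< BRW_MAX_MRF) */
                 0,                   /* response_length */
                 GL_TRUE,             /* eot */
                 GL_TRUE,             /* writes_complete */
                 0,                   /* offset */
                 swizzle);            /* URB swizzle mode */
}
#endif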
1935
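/**
 * Emit an FF_SYNC URB message, used by fixed-function-style threads (e.g.
 * the GS) to synchronize with earlier threads and, when 'allocate' is set,
 * to obtain an initial URB handle before writing to the URB.
 */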
1936void brw_ff_sync(struct brw_compile *p,
1937		   struct brw_reg dest,
1938		   GLuint msg_reg_nr,
1939		   struct brw_reg src0,
1940		   GLboolean allocate,
1941		   GLuint response_length,
1942		   GLboolean eot)
1943{
1944   struct intel_context *intel = &p->brw->intel;
1945   struct brw_instruction *insn;
1946
1947   /* Sandybridge doesn't have the implied move for SENDs,
1948    * and the first message register index comes from src0.
1949    */
1950   if (intel->gen >= 6) {
1951      brw_push_insn_state(p);
1952      brw_set_mask_control( p, BRW_MASK_DISABLE );
1953      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
1954	      retype(src0, BRW_REGISTER_TYPE_UD));
1955      brw_pop_insn_state(p);
1956      src0 = brw_message_reg(msg_reg_nr);
1957   }
1958
1959   insn = next_insn(p, BRW_OPCODE_SEND);
1960   brw_set_dest(insn, dest);
1961   brw_set_src0(insn, src0);
1962   brw_set_src1(insn, brw_imm_d(0));
1963
1964   if (intel->gen < 6)
1965       insn->header.destreg__conditionalmod = msg_reg_nr;
1966
1967   brw_set_ff_sync_message(p->brw,
1968			   insn,
1969			   allocate,
1970			   response_length,
1971			   eot);
1972}
1973