brw_eu_emit.c revision 72845d206e692581b6084c56b8d1f3bc689e8a03
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37
38
39
40/***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
44static void guess_execution_size(struct brw_compile *p,
45				 struct brw_instruction *insn,
46				 struct brw_reg reg)
47{
48   if (reg.width == BRW_WIDTH_8 && p->compressed)
49      insn->header.execution_size = BRW_EXECUTE_16;
50   else
51      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
52}
53
54
55static void brw_set_dest(struct brw_compile *p,
56			 struct brw_instruction *insn,
57			 struct brw_reg dest)
58{
59   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
60       dest.file != BRW_MESSAGE_REGISTER_FILE)
61      assert(dest.nr < 128);
62
63   insn->bits1.da1.dest_reg_file = dest.file;
64   insn->bits1.da1.dest_reg_type = dest.type;
65   insn->bits1.da1.dest_address_mode = dest.address_mode;
66
67   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
68      insn->bits1.da1.dest_reg_nr = dest.nr;
69
70      if (insn->header.access_mode == BRW_ALIGN_1) {
71	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
72	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
73	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
74	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
75      }
76      else {
77	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
78	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
79	 /* ignored in da16, but it still has to be set to '01' */
80	 insn->bits1.da16.dest_horiz_stride = 1;
81      }
82   }
83   else {
84      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
85
86      /* These are different sizes in align1 vs align16:
87       */
88      if (insn->header.access_mode == BRW_ALIGN_1) {
89	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
90	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
91	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
92	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
93      }
94      else {
95	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
96	 /* ignored in ia16, but it still has to be set to '01' */
97	 insn->bits1.ia16.dest_horiz_stride = 1;
98      }
99   }
100
101   /* Set the execution size based on dest.width and the compile's
102    * compression state:
103    */
104   guess_execution_size(p, insn, dest);
105}
106
107extern int reg_type_size[];
108
109static void
110validate_reg(struct brw_instruction *insn, struct brw_reg reg)
111{
112   int hstride_for_reg[] = {0, 1, 2, 4};
113   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
114   int width_for_reg[] = {1, 2, 4, 8, 16};
115   int execsize_for_reg[] = {1, 2, 4, 8, 16};
116   int width, hstride, vstride, execsize;
117
118   if (reg.file == BRW_IMMEDIATE_VALUE) {
119      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
120       * mean the destination has to be 128-bit aligned and the
121       * destination horiz stride has to be a word.
122       */
123      if (reg.type == BRW_REGISTER_TYPE_V) {
124	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
125		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
126      }
127
128      return;
129   }
130
131   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
132       reg.nr == BRW_ARF_NULL)
133      return;
134
135   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
136   hstride = hstride_for_reg[reg.hstride];
137
138   if (reg.vstride == 0xf) {
139      vstride = -1;
140   } else {
141      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
142      vstride = vstride_for_reg[reg.vstride];
143   }
144
145   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
146   width = width_for_reg[reg.width];
147
148   assert(insn->header.execution_size >= 0 &&
149	  insn->header.execution_size < Elements(execsize_for_reg));
150   execsize = execsize_for_reg[insn->header.execution_size];
151
152   /* Restrictions from 3.3.10: Register Region Restrictions. */
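   /* A quick notation reminder, since the rules below read more easily with
    * an example in mind (illustrative only): a source region is written
    * <VertStride; Width, HorzStride> in the PRM.  The common <8;8,1> region
    * with ExecSize 8 reads eight consecutive channels and satisfies rule 4
    * below, since 8 == 8 * 1.
    */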
153   /* 3. */
154   assert(execsize >= width);
155
156   /* 4. */
157   if (execsize == width && hstride != 0) {
158      assert(vstride == -1 || vstride == width * hstride);
159   }
160
161   /* 5. */
162   if (execsize == width && hstride == 0) {
163      /* no restriction on vstride. */
164   }
165
166   /* 6. */
167   if (width == 1) {
168      assert(hstride == 0);
169   }
170
171   /* 7. */
172   if (execsize == 1 && width == 1) {
173      assert(hstride == 0);
174      assert(vstride == 0);
175   }
176
177   /* 8. */
178   if (vstride == 0 && hstride == 0) {
179      assert(width == 1);
180   }
181
182   /* 10. Check destination issues. */
183}
184
185static void brw_set_src0( struct brw_instruction *insn,
186                          struct brw_reg reg )
187{
188   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
189      assert(reg.nr < 128);
190
191   validate_reg(insn, reg);
192
193   insn->bits1.da1.src0_reg_file = reg.file;
194   insn->bits1.da1.src0_reg_type = reg.type;
195   insn->bits2.da1.src0_abs = reg.abs;
196   insn->bits2.da1.src0_negate = reg.negate;
197   insn->bits2.da1.src0_address_mode = reg.address_mode;
198
199   if (reg.file == BRW_IMMEDIATE_VALUE) {
200      insn->bits3.ud = reg.dw1.ud;
201
202      /* Required to set some fields in src1 as well:
203       */
204      insn->bits1.da1.src1_reg_file = 0; /* arf */
205      insn->bits1.da1.src1_reg_type = reg.type;
206   }
207   else
208   {
209      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
210	 if (insn->header.access_mode == BRW_ALIGN_1) {
211	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
212	    insn->bits2.da1.src0_reg_nr = reg.nr;
213	 }
214	 else {
215	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
216	    insn->bits2.da16.src0_reg_nr = reg.nr;
217	 }
218      }
219      else {
220	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
221
222	 if (insn->header.access_mode == BRW_ALIGN_1) {
223	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
224	 }
225	 else {
226	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
227	 }
228      }
229
230      if (insn->header.access_mode == BRW_ALIGN_1) {
231	 if (reg.width == BRW_WIDTH_1 &&
232	     insn->header.execution_size == BRW_EXECUTE_1) {
233	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
234	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
235	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
236	 }
237	 else {
238	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
239	    insn->bits2.da1.src0_width = reg.width;
240	    insn->bits2.da1.src0_vert_stride = reg.vstride;
241	 }
242      }
243      else {
244	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
245	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
246	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
247	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
248
249	 /* This is an oddity arising from using the same register
250	  * descriptions in align_16 as in align_1:
251	  */
252	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
253	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
254	 else
255	    insn->bits2.da16.src0_vert_stride = reg.vstride;
256      }
257   }
258}
259
260
261void brw_set_src1( struct brw_instruction *insn,
262                   struct brw_reg reg )
263{
264   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
265
266   assert(reg.nr < 128);
267
268   validate_reg(insn, reg);
269
270   insn->bits1.da1.src1_reg_file = reg.file;
271   insn->bits1.da1.src1_reg_type = reg.type;
272   insn->bits3.da1.src1_abs = reg.abs;
273   insn->bits3.da1.src1_negate = reg.negate;
274
275   /* Only src1 can be immediate in two-argument instructions.
276    */
277   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
278
279   if (reg.file == BRW_IMMEDIATE_VALUE) {
280      insn->bits3.ud = reg.dw1.ud;
281   }
282   else {
283      /* This is a hardware restriction, which may or may not be lifted
284       * in the future:
285       */
286      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
287      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
288
289      if (insn->header.access_mode == BRW_ALIGN_1) {
290	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
291	 insn->bits3.da1.src1_reg_nr = reg.nr;
292      }
293      else {
294	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
295	 insn->bits3.da16.src1_reg_nr = reg.nr;
296      }
297
298      if (insn->header.access_mode == BRW_ALIGN_1) {
299	 if (reg.width == BRW_WIDTH_1 &&
300	     insn->header.execution_size == BRW_EXECUTE_1) {
301	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
302	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
303	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
304	 }
305	 else {
306	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
307	    insn->bits3.da1.src1_width = reg.width;
308	    insn->bits3.da1.src1_vert_stride = reg.vstride;
309	 }
310      }
311      else {
312	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
313	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
314	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
315	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
316
317	 /* This is an oddity arising from using the same register
318	  * descriptions in align_16 as in align_1:
319	  */
320	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
321	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
322	 else
323	    insn->bits3.da16.src1_vert_stride = reg.vstride;
324      }
325   }
326}
327
328
329
330static void brw_set_math_message( struct brw_context *brw,
331				  struct brw_instruction *insn,
332				  GLuint msg_length,
333				  GLuint response_length,
334				  GLuint function,
335				  GLuint integer_type,
336				  GLboolean low_precision,
337				  GLboolean saturate,
338				  GLuint dataType )
339{
340   struct intel_context *intel = &brw->intel;
341   brw_set_src1(insn, brw_imm_d(0));
342
343   if (intel->gen == 5) {
344       insn->bits3.math_gen5.function = function;
345       insn->bits3.math_gen5.int_type = integer_type;
346       insn->bits3.math_gen5.precision = low_precision;
347       insn->bits3.math_gen5.saturate = saturate;
348       insn->bits3.math_gen5.data_type = dataType;
349       insn->bits3.math_gen5.snapshot = 0;
350       insn->bits3.math_gen5.header_present = 0;
351       insn->bits3.math_gen5.response_length = response_length;
352       insn->bits3.math_gen5.msg_length = msg_length;
353       insn->bits3.math_gen5.end_of_thread = 0;
354       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
355       insn->bits2.send_gen5.end_of_thread = 0;
356   } else {
357       insn->bits3.math.function = function;
358       insn->bits3.math.int_type = integer_type;
359       insn->bits3.math.precision = low_precision;
360       insn->bits3.math.saturate = saturate;
361       insn->bits3.math.data_type = dataType;
362       insn->bits3.math.response_length = response_length;
363       insn->bits3.math.msg_length = msg_length;
364       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
365       insn->bits3.math.end_of_thread = 0;
366   }
367}
368
369
370static void brw_set_ff_sync_message(struct brw_context *brw,
371				    struct brw_instruction *insn,
372				    GLboolean allocate,
373				    GLuint response_length,
374				    GLboolean end_of_thread)
375{
376	struct intel_context *intel = &brw->intel;
377	brw_set_src1(insn, brw_imm_d(0));
378
379	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
380	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
381	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
382	insn->bits3.urb_gen5.allocate = allocate;
383	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
384	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
385	insn->bits3.urb_gen5.header_present = 1;
386	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
387	insn->bits3.urb_gen5.msg_length = 1;
388	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
389	if (intel->gen >= 6) {
390	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
391	} else {
392	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
393	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
394	}
395}
396
397static void brw_set_urb_message( struct brw_context *brw,
398				 struct brw_instruction *insn,
399				 GLboolean allocate,
400				 GLboolean used,
401				 GLuint msg_length,
402				 GLuint response_length,
403				 GLboolean end_of_thread,
404				 GLboolean complete,
405				 GLuint offset,
406				 GLuint swizzle_control )
407{
408    struct intel_context *intel = &brw->intel;
409    brw_set_src1(insn, brw_imm_d(0));
410
411    if (intel->gen >= 5) {
412        insn->bits3.urb_gen5.opcode = 0;	/* ? */
413        insn->bits3.urb_gen5.offset = offset;
414        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
415        insn->bits3.urb_gen5.allocate = allocate;
416        insn->bits3.urb_gen5.used = used;	/* ? */
417        insn->bits3.urb_gen5.complete = complete;
418        insn->bits3.urb_gen5.header_present = 1;
419        insn->bits3.urb_gen5.response_length = response_length;
420        insn->bits3.urb_gen5.msg_length = msg_length;
421        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
422	if (intel->gen >= 6) {
423	   /* For SNB, the SFID bits moved to the condmod bits, and
424	    * EOT stayed in bits3 above.  Does the EOT bit setting
425	    * below on Ironlake even do anything?
426	    */
427	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
428	} else {
429	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
430	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
431	}
432    } else {
433        insn->bits3.urb.opcode = 0;	/* ? */
434        insn->bits3.urb.offset = offset;
435        insn->bits3.urb.swizzle_control = swizzle_control;
436        insn->bits3.urb.allocate = allocate;
437        insn->bits3.urb.used = used;	/* ? */
438        insn->bits3.urb.complete = complete;
439        insn->bits3.urb.response_length = response_length;
440        insn->bits3.urb.msg_length = msg_length;
441        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
442        insn->bits3.urb.end_of_thread = end_of_thread;
443    }
444}
445
446static void brw_set_dp_write_message( struct brw_context *brw,
447				      struct brw_instruction *insn,
448				      GLuint binding_table_index,
449				      GLuint msg_control,
450				      GLuint msg_type,
451				      GLuint msg_length,
452				      GLboolean header_present,
453				      GLuint pixel_scoreboard_clear,
454				      GLuint response_length,
455				      GLuint end_of_thread,
456				      GLuint send_commit_msg)
457{
458   struct intel_context *intel = &brw->intel;
459   brw_set_src1(insn, brw_imm_ud(0));
460
461   if (intel->gen >= 6) {
462       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
463       insn->bits3.dp_render_cache.msg_control = msg_control;
464       insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
465       insn->bits3.dp_render_cache.msg_type = msg_type;
466       insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
467       insn->bits3.dp_render_cache.header_present = header_present;
468       insn->bits3.dp_render_cache.response_length = response_length;
469       insn->bits3.dp_render_cache.msg_length = msg_length;
470       insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
471       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
472	/* XXX really need below? */
473       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
474       insn->bits2.send_gen5.end_of_thread = end_of_thread;
475   } else if (intel->gen == 5) {
476       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
477       insn->bits3.dp_write_gen5.msg_control = msg_control;
478       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
479       insn->bits3.dp_write_gen5.msg_type = msg_type;
480       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
481       insn->bits3.dp_write_gen5.header_present = header_present;
482       insn->bits3.dp_write_gen5.response_length = response_length;
483       insn->bits3.dp_write_gen5.msg_length = msg_length;
484       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
485       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
486       insn->bits2.send_gen5.end_of_thread = end_of_thread;
487   } else {
488       insn->bits3.dp_write.binding_table_index = binding_table_index;
489       insn->bits3.dp_write.msg_control = msg_control;
490       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
491       insn->bits3.dp_write.msg_type = msg_type;
492       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
493       insn->bits3.dp_write.response_length = response_length;
494       insn->bits3.dp_write.msg_length = msg_length;
495       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
496       insn->bits3.dp_write.end_of_thread = end_of_thread;
497   }
498}
499
500static void
501brw_set_dp_read_message(struct brw_context *brw,
502			struct brw_instruction *insn,
503			GLuint binding_table_index,
504			GLuint msg_control,
505			GLuint msg_type,
506			GLuint target_cache,
507			GLuint msg_length,
508			GLuint response_length)
509{
510   struct intel_context *intel = &brw->intel;
511   brw_set_src1(insn, brw_imm_d(0));
512
513   if (intel->gen >= 6) {
514       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
515       insn->bits3.dp_render_cache.msg_control = msg_control;
516       insn->bits3.dp_render_cache.pixel_scoreboard_clear = 0;
517       insn->bits3.dp_render_cache.msg_type = msg_type;
518       insn->bits3.dp_render_cache.send_commit_msg = 0;
519       insn->bits3.dp_render_cache.header_present = 1;
520       insn->bits3.dp_render_cache.response_length = response_length;
521       insn->bits3.dp_render_cache.msg_length = msg_length;
522       insn->bits3.dp_render_cache.end_of_thread = 0;
523       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_READ;
524	/* XXX really need below? */
525       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
526       insn->bits2.send_gen5.end_of_thread = 0;
527   } else if (intel->gen == 5) {
528       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
529       insn->bits3.dp_read_gen5.msg_control = msg_control;
530       insn->bits3.dp_read_gen5.msg_type = msg_type;
531       insn->bits3.dp_read_gen5.target_cache = target_cache;
532       insn->bits3.dp_read_gen5.header_present = 1;
533       insn->bits3.dp_read_gen5.response_length = response_length;
534       insn->bits3.dp_read_gen5.msg_length = msg_length;
535       insn->bits3.dp_read_gen5.pad1 = 0;
536       insn->bits3.dp_read_gen5.end_of_thread = 0;
537       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
538       insn->bits2.send_gen5.end_of_thread = 0;
539   } else {
540       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
541       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
542       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
543       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
544       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
545       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
546       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
547       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
548       insn->bits3.dp_read.end_of_thread = 0;  /*31*/
549   }
550}
551
552static void brw_set_sampler_message(struct brw_context *brw,
553                                    struct brw_instruction *insn,
554                                    GLuint binding_table_index,
555                                    GLuint sampler,
556                                    GLuint msg_type,
557                                    GLuint response_length,
558                                    GLuint msg_length,
559                                    GLboolean eot,
560                                    GLuint header_present,
561                                    GLuint simd_mode)
562{
563   struct intel_context *intel = &brw->intel;
564   assert(eot == 0);
565   brw_set_src1(insn, brw_imm_d(0));
566
567   if (intel->gen >= 5) {
568      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
569      insn->bits3.sampler_gen5.sampler = sampler;
570      insn->bits3.sampler_gen5.msg_type = msg_type;
571      insn->bits3.sampler_gen5.simd_mode = simd_mode;
572      insn->bits3.sampler_gen5.header_present = header_present;
573      insn->bits3.sampler_gen5.response_length = response_length;
574      insn->bits3.sampler_gen5.msg_length = msg_length;
575      insn->bits3.sampler_gen5.end_of_thread = eot;
576      if (intel->gen >= 6)
577	  insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
578      else {
579	  insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
580	  insn->bits2.send_gen5.end_of_thread = eot;
581      }
582   } else if (intel->is_g4x) {
583      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
584      insn->bits3.sampler_g4x.sampler = sampler;
585      insn->bits3.sampler_g4x.msg_type = msg_type;
586      insn->bits3.sampler_g4x.response_length = response_length;
587      insn->bits3.sampler_g4x.msg_length = msg_length;
588      insn->bits3.sampler_g4x.end_of_thread = eot;
589      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
590   } else {
591      insn->bits3.sampler.binding_table_index = binding_table_index;
592      insn->bits3.sampler.sampler = sampler;
593      insn->bits3.sampler.msg_type = msg_type;
594      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
595      insn->bits3.sampler.response_length = response_length;
596      insn->bits3.sampler.msg_length = msg_length;
597      insn->bits3.sampler.end_of_thread = eot;
598      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
599   }
600}
601
602
603
604static struct brw_instruction *next_insn( struct brw_compile *p,
605					  GLuint opcode )
606{
607   struct brw_instruction *insn;
608
609   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
610
611   insn = &p->store[p->nr_insn++];
612   memcpy(insn, p->current, sizeof(*insn));
613
614   /* Reset this one-shot flag:
615    */
616
617   if (p->current->header.destreg__conditionalmod) {
618      p->current->header.destreg__conditionalmod = 0;
619      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
620   }
621
622   insn->header.opcode = opcode;
623   return insn;
624}
625
626
627static struct brw_instruction *brw_alu1( struct brw_compile *p,
628					 GLuint opcode,
629					 struct brw_reg dest,
630					 struct brw_reg src )
631{
632   struct brw_instruction *insn = next_insn(p, opcode);
633   brw_set_dest(p, insn, dest);
634   brw_set_src0(insn, src);
635   return insn;
636}
637
638static struct brw_instruction *brw_alu2(struct brw_compile *p,
639					GLuint opcode,
640					struct brw_reg dest,
641					struct brw_reg src0,
642					struct brw_reg src1 )
643{
644   struct brw_instruction *insn = next_insn(p, opcode);
645   brw_set_dest(p, insn, dest);
646   brw_set_src0(insn, src0);
647   brw_set_src1(insn, src1);
648   return insn;
649}
650
651
652/***********************************************************************
653 * Convenience routines.
654 */
655#define ALU1(OP)					\
656struct brw_instruction *brw_##OP(struct brw_compile *p,	\
657	      struct brw_reg dest,			\
658	      struct brw_reg src0)   			\
659{							\
660   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
661}
662
663#define ALU2(OP)					\
664struct brw_instruction *brw_##OP(struct brw_compile *p,	\
665	      struct brw_reg dest,			\
666	      struct brw_reg src0,			\
667	      struct brw_reg src1)   			\
668{							\
669   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
670}
671
672/* Rounding operations (other than RNDD) require two instructions - the first
673 * stores a rounded value (possibly the wrong way) in the dest register, but
674 * also sets a per-channel "increment bit" in the flag register.  A predicated
675 * add of 1.0 fixes dest to contain the desired result.
676 */
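/* For example (a sketch of the generated sequence, not literal disassembly),
 * brw_RNDE(p, dst, src) emits roughly:
 *
 *    rnde dst, src        (with the round-increment conditional-mod set)
 *    (+f0) add dst, dst, 1.0f
 */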
677#define ROUND(OP)							      \
678void brw_##OP(struct brw_compile *p,					      \
679	      struct brw_reg dest,					      \
680	      struct brw_reg src)					      \
681{									      \
682   struct brw_instruction *rnd, *add;					      \
683   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
684   brw_set_dest(p, rnd, dest);						      \
685   brw_set_src0(rnd, src);						      \
686   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
687									      \
688   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
689   add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
690}
691
692
693ALU1(MOV)
694ALU2(SEL)
695ALU1(NOT)
696ALU2(AND)
697ALU2(OR)
698ALU2(XOR)
699ALU2(SHR)
700ALU2(SHL)
701ALU2(RSR)
702ALU2(RSL)
703ALU2(ASR)
704ALU1(FRC)
705ALU1(RNDD)
706ALU2(MAC)
707ALU2(MACH)
708ALU1(LZD)
709ALU2(DP4)
710ALU2(DPH)
711ALU2(DP3)
712ALU2(DP2)
713ALU2(LINE)
714ALU2(PLN)
715
716
717ROUND(RNDZ)
718ROUND(RNDE)
719
720
721struct brw_instruction *brw_ADD(struct brw_compile *p,
722				struct brw_reg dest,
723				struct brw_reg src0,
724				struct brw_reg src1)
725{
726   /* 6.2.2: add */
727   if (src0.type == BRW_REGISTER_TYPE_F ||
728       (src0.file == BRW_IMMEDIATE_VALUE &&
729	src0.type == BRW_REGISTER_TYPE_VF)) {
730      assert(src1.type != BRW_REGISTER_TYPE_UD);
731      assert(src1.type != BRW_REGISTER_TYPE_D);
732   }
733
734   if (src1.type == BRW_REGISTER_TYPE_F ||
735       (src1.file == BRW_IMMEDIATE_VALUE &&
736	src1.type == BRW_REGISTER_TYPE_VF)) {
737      assert(src0.type != BRW_REGISTER_TYPE_UD);
738      assert(src0.type != BRW_REGISTER_TYPE_D);
739   }
740
741   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
742}
743
744struct brw_instruction *brw_MUL(struct brw_compile *p,
745				struct brw_reg dest,
746				struct brw_reg src0,
747				struct brw_reg src1)
748{
749   /* 6.32.38: mul */
750   if (src0.type == BRW_REGISTER_TYPE_D ||
751       src0.type == BRW_REGISTER_TYPE_UD ||
752       src1.type == BRW_REGISTER_TYPE_D ||
753       src1.type == BRW_REGISTER_TYPE_UD) {
754      assert(dest.type != BRW_REGISTER_TYPE_F);
755   }
756
757   if (src0.type == BRW_REGISTER_TYPE_F ||
758       (src0.file == BRW_IMMEDIATE_VALUE &&
759	src0.type == BRW_REGISTER_TYPE_VF)) {
760      assert(src1.type != BRW_REGISTER_TYPE_UD);
761      assert(src1.type != BRW_REGISTER_TYPE_D);
762   }
763
764   if (src1.type == BRW_REGISTER_TYPE_F ||
765       (src1.file == BRW_IMMEDIATE_VALUE &&
766	src1.type == BRW_REGISTER_TYPE_VF)) {
767      assert(src0.type != BRW_REGISTER_TYPE_UD);
768      assert(src0.type != BRW_REGISTER_TYPE_D);
769   }
770
771   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
772	  src0.nr != BRW_ARF_ACCUMULATOR);
773   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
774	  src1.nr != BRW_ARF_ACCUMULATOR);
775
776   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
777}
778
779
780void brw_NOP(struct brw_compile *p)
781{
782   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
783   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
784   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
785   brw_set_src1(insn, brw_imm_ud(0x0));
786}
787
788
789
790
791
792/***********************************************************************
793 * Comparisons, if/else/endif
794 */
795
796struct brw_instruction *brw_JMPI(struct brw_compile *p,
797                                 struct brw_reg dest,
798                                 struct brw_reg src0,
799                                 struct brw_reg src1)
800{
801   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
802
803   insn->header.execution_size = 1;
804   insn->header.compression_control = BRW_COMPRESSION_NONE;
805   insn->header.mask_control = BRW_MASK_DISABLE;
806
807   p->current->header.predicate_control = BRW_PREDICATE_NONE;
808
809   return insn;
810}
811
812/* EU takes the value from the flag register and pushes it onto some
813 * sort of a stack (presumably merging with any flag value already on
814 * the stack).  Within an if block, the flags at the top of the stack
815 * control execution on each channel of the unit, e.g. on each of the
816 * 16 pixel values in our wm programs.
817 *
818 * When the matching 'else' instruction is reached (presumably by
819 * countdown of the instruction count patched in by our ELSE/ENDIF
820 * functions), the relevant flags are inverted.
821 *
822 * When the matching 'endif' instruction is reached, the flags are
823 * popped off.  If the stack is now empty, normal execution resumes.
824 *
825 * No attempt is made to deal with stack overflow (14 elements?).
826 */
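/* Typical usage by a code generator (an illustrative sketch only; 'x' and
 * 'y' stand for whatever operands the caller wants to compare):
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, y);
 *    if_insn = brw_IF(p, BRW_EXECUTE_8);
 *       ... emit the 'then' block ...
 *    else_insn = brw_ELSE(p, if_insn);
 *       ... emit the 'else' block ...
 *    brw_ENDIF(p, else_insn);
 */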
827struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
828{
829   struct intel_context *intel = &p->brw->intel;
830   struct brw_instruction *insn;
831
832   if (p->single_program_flow) {
833      assert(execute_size == BRW_EXECUTE_1);
834
835      insn = next_insn(p, BRW_OPCODE_ADD);
836      insn->header.predicate_inverse = 1;
837   } else {
838      insn = next_insn(p, BRW_OPCODE_IF);
839   }
840
841   /* Override the defaults for this instruction:
842    */
843   if (intel->gen < 6) {
844      brw_set_dest(p, insn, brw_ip_reg());
845      brw_set_src0(insn, brw_ip_reg());
846      brw_set_src1(insn, brw_imm_d(0x0));
847   } else {
848      brw_set_dest(p, insn, brw_imm_w(0));
849      insn->bits1.branch_gen6.jump_count = 0;
850      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
851      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
852   }
853
854   insn->header.execution_size = execute_size;
855   insn->header.compression_control = BRW_COMPRESSION_NONE;
856   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
857   insn->header.mask_control = BRW_MASK_ENABLE;
858   if (!p->single_program_flow)
859       insn->header.thread_control = BRW_THREAD_SWITCH;
860
861   p->current->header.predicate_control = BRW_PREDICATE_NONE;
862
863   return insn;
864}
865
866struct brw_instruction *
867brw_IF_gen6(struct brw_compile *p, uint32_t conditional,
868	    struct brw_reg src0, struct brw_reg src1)
869{
870   struct brw_instruction *insn;
871
872   insn = next_insn(p, BRW_OPCODE_IF);
873
874   brw_set_dest(p, insn, brw_imm_w(0));
875   insn->header.execution_size = BRW_EXECUTE_8;
876   insn->bits1.branch_gen6.jump_count = 0;
877   brw_set_src0(insn, src0);
878   brw_set_src1(insn, src1);
879
880   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
881   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
882   insn->header.destreg__conditionalmod = conditional;
883
884   if (!p->single_program_flow)
885       insn->header.thread_control = BRW_THREAD_SWITCH;
886
887   return insn;
888}
889
890struct brw_instruction *brw_ELSE(struct brw_compile *p,
891				 struct brw_instruction *if_insn)
892{
893   struct intel_context *intel = &p->brw->intel;
894   struct brw_instruction *insn;
895   GLuint br = 1;
896
897   /* The jump count is in units of 64-bit chunks, so one 128-bit
898      instruction requires 2 chunks. */
899   if (intel->gen >= 5)
900      br = 2;
901
902   if (p->single_program_flow) {
903      insn = next_insn(p, BRW_OPCODE_ADD);
904   } else {
905      insn = next_insn(p, BRW_OPCODE_ELSE);
906   }
907
908   if (intel->gen < 6) {
909      brw_set_dest(p, insn, brw_ip_reg());
910      brw_set_src0(insn, brw_ip_reg());
911      brw_set_src1(insn, brw_imm_d(0x0));
912   } else {
913      brw_set_dest(p, insn, brw_imm_w(0));
914      insn->bits1.branch_gen6.jump_count = 0;
915      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
916      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
917   }
918
919   insn->header.compression_control = BRW_COMPRESSION_NONE;
920   insn->header.execution_size = if_insn->header.execution_size;
921   insn->header.mask_control = BRW_MASK_ENABLE;
922   if (!p->single_program_flow)
923       insn->header.thread_control = BRW_THREAD_SWITCH;
924
925   /* Patch the if instruction to point at this instruction.
926    */
927   if (p->single_program_flow) {
928      assert(if_insn->header.opcode == BRW_OPCODE_ADD);
929
930      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
931   } else {
932      assert(if_insn->header.opcode == BRW_OPCODE_IF);
933
934      if (intel->gen < 6) {
935	 if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
936	 if_insn->bits3.if_else.pop_count = 0;
937	 if_insn->bits3.if_else.pad0 = 0;
938      } else {
939	 if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1);
940      }
941   }
942
943   return insn;
944}
945
946void brw_ENDIF(struct brw_compile *p,
947	       struct brw_instruction *patch_insn)
948{
949   struct intel_context *intel = &p->brw->intel;
950   GLuint br = 1;
951
952   if (intel->gen >= 5)
953      br = 2;
954
955   if (p->single_program_flow) {
956      /* In single program flow mode, there's no need to execute an ENDIF:
957       * no mask stack operations are required, and if we're currently
958       * executing, we simply want to keep executing.
959       */
960      struct brw_instruction *next = &p->store[p->nr_insn];
961
962      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
963
964      patch_insn->bits3.ud = (next - patch_insn) * 16;
965   } else {
966      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
967
968      if (intel->gen < 6) {
969	 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
970	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
971	 brw_set_src1(insn, brw_imm_d(0x0));
972      } else {
973	 brw_set_dest(p, insn, brw_imm_w(0));
974	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
975	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
976      }
977
978      insn->header.compression_control = BRW_COMPRESSION_NONE;
979      insn->header.execution_size = patch_insn->header.execution_size;
980      insn->header.mask_control = BRW_MASK_ENABLE;
981      insn->header.thread_control = BRW_THREAD_SWITCH;
982
983      if (intel->gen < 6)
984	 assert(patch_insn->bits3.if_else.jump_count == 0);
985      else
986	 assert(patch_insn->bits1.branch_gen6.jump_count == 0);
987
988      /* Patch the if or else instructions to point at this or the next
989       * instruction respectively.
990       */
991      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
992	 if (intel->gen < 6) {
993	    /* Turn it into an IFF, which means no mask stack operations for
994	     * all-false and jumping past the ENDIF.
995	     */
996	    patch_insn->header.opcode = BRW_OPCODE_IFF;
997	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
998	    patch_insn->bits3.if_else.pop_count = 0;
999	    patch_insn->bits3.if_else.pad0 = 0;
1000	 } else {
1001	    /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1002	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1003	 }
1004      } else {
1005	 assert(patch_insn->header.opcode == BRW_OPCODE_ELSE);
1006	 if (intel->gen < 6) {
1007	    /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1008	     * matching ENDIF.
1009	     */
1010	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
1011	    patch_insn->bits3.if_else.pop_count = 1;
1012	    patch_insn->bits3.if_else.pad0 = 0;
1013	 } else {
1014	    /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1015	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1016	 }
1017      }
1018
1019      /* Also pop item off the stack in the endif instruction:
1020       */
1021      if (intel->gen < 6) {
1022	 insn->bits3.if_else.jump_count = 0;
1023	 insn->bits3.if_else.pop_count = 1;
1024	 insn->bits3.if_else.pad0 = 0;
1025      } else {
1026	 insn->bits1.branch_gen6.jump_count = 2;
1027      }
1028   }
1029}
1030
1031struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1032{
1033   struct intel_context *intel = &p->brw->intel;
1034   struct brw_instruction *insn;
1035
1036   insn = next_insn(p, BRW_OPCODE_BREAK);
1037   if (intel->gen >= 6) {
1038      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1039      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1040      brw_set_src1(insn, brw_imm_d(0x0));
1041   } else {
1042      brw_set_dest(p, insn, brw_ip_reg());
1043      brw_set_src0(insn, brw_ip_reg());
1044      brw_set_src1(insn, brw_imm_d(0x0));
1045      insn->bits3.if_else.pad0 = 0;
1046      insn->bits3.if_else.pop_count = pop_count;
1047   }
1048   insn->header.compression_control = BRW_COMPRESSION_NONE;
1049   insn->header.execution_size = BRW_EXECUTE_8;
1050
1051   return insn;
1052}
1053
1054struct brw_instruction *brw_CONT_gen6(struct brw_compile *p,
1055				      struct brw_instruction *do_insn)
1056{
1057   struct brw_instruction *insn;
1058   int br = 2;
1059
1060   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1061   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1062   brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
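   /* Note that the IP-register dest/src0 set below immediately overrides
    * the null-register pair set just above.
    */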
1063   brw_set_dest(p, insn, brw_ip_reg());
1064   brw_set_src0(insn, brw_ip_reg());
1065   brw_set_src1(insn, brw_imm_d(0x0));
1066
1067   insn->bits3.break_cont.uip = br * (do_insn - insn);
1068
1069   insn->header.compression_control = BRW_COMPRESSION_NONE;
1070   insn->header.execution_size = BRW_EXECUTE_8;
1071   return insn;
1072}
1073
1074struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1075{
1076   struct brw_instruction *insn;
1077   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1078   brw_set_dest(p, insn, brw_ip_reg());
1079   brw_set_src0(insn, brw_ip_reg());
1080   brw_set_src1(insn, brw_imm_d(0x0));
1081   insn->header.compression_control = BRW_COMPRESSION_NONE;
1082   insn->header.execution_size = BRW_EXECUTE_8;
1083   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1084   insn->bits3.if_else.pad0 = 0;
1085   insn->bits3.if_else.pop_count = pop_count;
1086   return insn;
1087}
1088
1089/* DO/WHILE loop:
1090 *
1091 * The DO/WHILE is just an unterminated loop -- break or continue are
1092 * used for control within the loop.  We have a few ways they can be
1093 * done.
1094 *
1095 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1096 * jip and no DO instruction.
1097 *
1098 * For non-uniform control flow pre-gen6, there's a DO instruction to
1099 * push the mask, and a WHILE to jump back, and BREAK to get out and
1100 * pop the mask.
1101 *
1102 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1103 * just points back to the first instruction of the loop.
1104 */
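/* Typical usage (an illustrative sketch only):
 *
 *    do_insn = brw_DO(p, BRW_EXECUTE_8);
 *       ... emit the loop body, including any brw_BREAK()/brw_CONT() ...
 *    brw_WHILE(p, do_insn);
 */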
1105struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1106{
1107   struct intel_context *intel = &p->brw->intel;
1108
1109   if (intel->gen >= 6 || p->single_program_flow) {
1110      return &p->store[p->nr_insn];
1111   } else {
1112      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1113
1114      /* Override the defaults for this instruction:
1115       */
1116      brw_set_dest(p, insn, brw_null_reg());
1117      brw_set_src0(insn, brw_null_reg());
1118      brw_set_src1(insn, brw_null_reg());
1119
1120      insn->header.compression_control = BRW_COMPRESSION_NONE;
1121      insn->header.execution_size = execute_size;
1122      insn->header.predicate_control = BRW_PREDICATE_NONE;
1123      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1124      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1125
1126      return insn;
1127   }
1128}
1129
1130
1131
1132struct brw_instruction *brw_WHILE(struct brw_compile *p,
1133                                  struct brw_instruction *do_insn)
1134{
1135   struct intel_context *intel = &p->brw->intel;
1136   struct brw_instruction *insn;
1137   GLuint br = 1;
1138
1139   if (intel->gen >= 5)
1140      br = 2;
1141
1142   if (intel->gen >= 6) {
1143      insn = next_insn(p, BRW_OPCODE_WHILE);
1144
1145      brw_set_dest(p, insn, brw_imm_w(0));
1146      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1147      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1148      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1149
1150      insn->header.execution_size = do_insn->header.execution_size;
1151      assert(insn->header.execution_size == BRW_EXECUTE_8);
1152   } else {
1153      if (p->single_program_flow) {
1154	 insn = next_insn(p, BRW_OPCODE_ADD);
1155
1156	 brw_set_dest(p, insn, brw_ip_reg());
1157	 brw_set_src0(insn, brw_ip_reg());
1158	 brw_set_src1(insn, brw_imm_d((do_insn - insn) * 16));
1159	 insn->header.execution_size = BRW_EXECUTE_1;
1160      } else {
1161	 insn = next_insn(p, BRW_OPCODE_WHILE);
1162
1163	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1164
1165	 brw_set_dest(p, insn, brw_ip_reg());
1166	 brw_set_src0(insn, brw_ip_reg());
1167	 brw_set_src1(insn, brw_imm_d(0));
1168
1169	 insn->header.execution_size = do_insn->header.execution_size;
1170	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1171	 insn->bits3.if_else.pop_count = 0;
1172	 insn->bits3.if_else.pad0 = 0;
1173      }
1174   }
1175   insn->header.compression_control = BRW_COMPRESSION_NONE;
1176   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1177
1178   return insn;
1179}
1180
1181
1182/* FORWARD JUMPS:
1183 */
1184void brw_land_fwd_jump(struct brw_compile *p,
1185		       struct brw_instruction *jmp_insn)
1186{
1187   struct intel_context *intel = &p->brw->intel;
1188   struct brw_instruction *landing = &p->store[p->nr_insn];
1189   GLuint jmpi = 1;
1190
1191   if (intel->gen >= 5)
1192       jmpi = 2;
1193
1194   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1195   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1196
1197   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1198}
1199
1200
1201
1202/* To integrate with the above, it makes sense that the comparison
1203 * instruction should populate the flag register.  It might be simpler
1204 * just to use the flag reg for most WM tasks?
1205 */
1206void brw_CMP(struct brw_compile *p,
1207	     struct brw_reg dest,
1208	     GLuint conditional,
1209	     struct brw_reg src0,
1210	     struct brw_reg src1)
1211{
1212   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1213
1214   insn->header.destreg__conditionalmod = conditional;
1215   brw_set_dest(p, insn, dest);
1216   brw_set_src0(insn, src0);
1217   brw_set_src1(insn, src1);
1218
1219/*    guess_execution_size(insn, src0); */
1220
1221
1222   /* Make it so that future instructions will use the computed flag
1223    * value until brw_set_predicate_control_flag_value() is called
1224    * again.
1225    */
1226   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1227       dest.nr == 0) {
1228      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1229      p->flag_value = 0xff;
1230   }
1231}
1232
1233/* Issue a 'wait' instruction on notification register n1; the host can
1234   program MMIO to wake the thread up. */
1235void brw_WAIT (struct brw_compile *p)
1236{
1237   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1238   struct brw_reg src = brw_notification_1_reg();
1239
1240   brw_set_dest(p, insn, src);
1241   brw_set_src0(insn, src);
1242   brw_set_src1(insn, brw_null_reg());
1243   insn->header.execution_size = 0; /* must */
1244   insn->header.predicate_control = 0;
1245   insn->header.compression_control = 0;
1246}
1247
1248
1249/***********************************************************************
1250 * Helpers for the various SEND message types:
1251 */
1252
1253/** Extended math function, float[8].
1254 */
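/* Example (an illustrative sketch only; the GRF/MRF numbers are made up):
 * a SIMD8 reciprocal of g2 written to g4, using m2 as the message register
 * on pre-gen6 hardware:
 *
 *    brw_math(p, brw_vec8_grf(4, 0),
 *             BRW_MATH_FUNCTION_INV, BRW_MATH_SATURATE_NONE,
 *             2, brw_vec8_grf(2, 0),
 *             BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
 */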
1255void brw_math( struct brw_compile *p,
1256	       struct brw_reg dest,
1257	       GLuint function,
1258	       GLuint saturate,
1259	       GLuint msg_reg_nr,
1260	       struct brw_reg src,
1261	       GLuint data_type,
1262	       GLuint precision )
1263{
1264   struct intel_context *intel = &p->brw->intel;
1265
1266   if (intel->gen >= 6) {
1267      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1268
1269      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1270      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1271
1272      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1273      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1274
1275      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1276	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1277	 assert(src.type == BRW_REGISTER_TYPE_F);
1278      }
1279
1280      /* Math is the same ISA format as other opcodes, except that CondModifier
1281       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1282       */
1283      insn->header.destreg__conditionalmod = function;
1284      insn->header.saturate = saturate;
1285
1286      brw_set_dest(p, insn, dest);
1287      brw_set_src0(insn, src);
1288      brw_set_src1(insn, brw_null_reg());
1289   } else {
1290      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1291      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1292      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1293      /* Example code doesn't set predicate_control for send
1294       * instructions.
1295       */
1296      insn->header.predicate_control = 0;
1297      insn->header.destreg__conditionalmod = msg_reg_nr;
1298
1299      brw_set_dest(p, insn, dest);
1300      brw_set_src0(insn, src);
1301      brw_set_math_message(p->brw,
1302			   insn,
1303			   msg_length, response_length,
1304			   function,
1305			   BRW_MATH_INTEGER_UNSIGNED,
1306			   precision,
1307			   saturate,
1308			   data_type);
1309   }
1310}
1311
1312/** Extended math function, float[8].
1313 */
1314void brw_math2(struct brw_compile *p,
1315	       struct brw_reg dest,
1316	       GLuint function,
1317	       struct brw_reg src0,
1318	       struct brw_reg src1)
1319{
1320   struct intel_context *intel = &p->brw->intel;
1321   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1322
1323   assert(intel->gen >= 6);
1324   (void) intel;
1325
1326
1327   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1328   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1329   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1330
1331   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1332   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1333   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1334
1335   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1336       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1337      assert(src0.type == BRW_REGISTER_TYPE_F);
1338      assert(src1.type == BRW_REGISTER_TYPE_F);
1339   }
1340
1341   /* Math is the same ISA format as other opcodes, except that CondModifier
1342    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1343    */
1344   insn->header.destreg__conditionalmod = function;
1345
1346   brw_set_dest(p, insn, dest);
1347   brw_set_src0(insn, src0);
1348   brw_set_src1(insn, src1);
1349}
1350
1351/**
1352 * Extended math function, float[16].
1353 * Use 2 send instructions.
1354 */
1355void brw_math_16( struct brw_compile *p,
1356		  struct brw_reg dest,
1357		  GLuint function,
1358		  GLuint saturate,
1359		  GLuint msg_reg_nr,
1360		  struct brw_reg src,
1361		  GLuint precision )
1362{
1363   struct intel_context *intel = &p->brw->intel;
1364   struct brw_instruction *insn;
1365   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1366   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1367
1368   if (intel->gen >= 6) {
1369      insn = next_insn(p, BRW_OPCODE_MATH);
1370
1371      /* Math is the same ISA format as other opcodes, except that CondModifier
1372       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1373       */
1374      insn->header.destreg__conditionalmod = function;
1375      insn->header.saturate = saturate;
1376
1377      brw_set_dest(p, insn, dest);
1378      brw_set_src0(insn, src);
1379      brw_set_src1(insn, brw_null_reg());
1380      return;
1381   }
1382
1383   /* First instruction:
1384    */
1385   brw_push_insn_state(p);
1386   brw_set_predicate_control_flag_value(p, 0xff);
1387   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1388
1389   insn = next_insn(p, BRW_OPCODE_SEND);
1390   insn->header.destreg__conditionalmod = msg_reg_nr;
1391
1392   brw_set_dest(p, insn, dest);
1393   brw_set_src0(insn, src);
1394   brw_set_math_message(p->brw,
1395			insn,
1396			msg_length, response_length,
1397			function,
1398			BRW_MATH_INTEGER_UNSIGNED,
1399			precision,
1400			saturate,
1401			BRW_MATH_DATA_VECTOR);
1402
1403   /* Second instruction:
1404    */
1405   insn = next_insn(p, BRW_OPCODE_SEND);
1406   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1407   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1408
1409   brw_set_dest(p, insn, offset(dest,1));
1410   brw_set_src0(insn, src);
1411   brw_set_math_message(p->brw,
1412			insn,
1413			msg_length, response_length,
1414			function,
1415			BRW_MATH_INTEGER_UNSIGNED,
1416			precision,
1417			saturate,
1418			BRW_MATH_DATA_VECTOR);
1419
1420   brw_pop_insn_state(p);
1421}
1422
1423
1424/**
1425 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1426 * using a constant offset per channel.
1427 *
1428 * The offset must be aligned to oword size (16 bytes).  Used for
1429 * register spilling.
1430 */
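/* Example (an illustrative sketch only; the register numbers are made up):
 * spill g12 to scratch offset 64 using m1 as the message header and m2 as
 * the payload, then restore it with brw_oword_block_read_scratch() below:
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD),
 *            retype(brw_vec8_grf(12, 0), BRW_REGISTER_TYPE_UD));
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, 64);
 *    ...
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(12, 0),
 *                                 brw_message_reg(1), 1, 64);
 */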
1431void brw_oword_block_write_scratch(struct brw_compile *p,
1432				   struct brw_reg mrf,
1433				   int num_regs,
1434				   GLuint offset)
1435{
1436   struct intel_context *intel = &p->brw->intel;
1437   uint32_t msg_control;
1438   int mlen;
1439
1440   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1441
1442   if (num_regs == 1) {
1443      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1444      mlen = 2;
1445   } else {
1446      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1447      mlen = 3;
1448   }
1449
1450   /* Set up the message header.  This is g0, with g0.2 filled with
1451    * the offset.  We don't want to leave our offset around in g0 or
1452    * it'll screw up texture samples, so set it up inside the message
1453    * reg.
1454    */
1455   {
1456      brw_push_insn_state(p);
1457      brw_set_mask_control(p, BRW_MASK_DISABLE);
1458      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1459
1460      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1461
1462      /* set message header global offset field (reg 0, element 2) */
1463      brw_MOV(p,
1464	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1465				  mrf.nr,
1466				  2), BRW_REGISTER_TYPE_UD),
1467	      brw_imm_ud(offset));
1468
1469      brw_pop_insn_state(p);
1470   }
1471
1472   {
1473      struct brw_reg dest;
1474      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1475      int send_commit_msg;
1476      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1477					 BRW_REGISTER_TYPE_UW);
1478
1479      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1480	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1481	 src_header = vec16(src_header);
1482      }
1483      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1484      insn->header.destreg__conditionalmod = mrf.nr;
1485
1486      /* Until gen6, writes followed by reads from the same location
1487       * are not guaranteed to be ordered unless write_commit is set.
1488       * If set, then a no-op write is issued to the destination
1489       * register to set a dependency, and a read from the destination
1490       * can be used to ensure the ordering.
1491       *
1492       * For gen6, only writes between different threads need ordering
1493       * protection.  Our use of DP writes is all about register
1494       * spilling within a thread.
1495       */
1496      if (intel->gen >= 6) {
1497	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1498	 send_commit_msg = 0;
1499      } else {
1500	 dest = src_header;
1501	 send_commit_msg = 1;
1502      }
1503
1504      brw_set_dest(p, insn, dest);
1505      brw_set_src0(insn, brw_null_reg());
1506
1507      brw_set_dp_write_message(p->brw,
1508			       insn,
1509			       255, /* binding table index (255=stateless) */
1510			       msg_control,
1511			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
1512			       mlen,
1513			       GL_TRUE, /* header_present */
1514			       0, /* pixel scoreboard */
1515			       send_commit_msg, /* response_length */
1516			       0, /* eot */
1517			       send_commit_msg);
1518   }
1519}
1520
1521
1522/**
1523 * Read a block of owords (half a GRF each) from the scratch buffer
1524 * using a constant index per channel.
1525 *
1526 * Offset must be aligned to oword size (16 bytes).  Used for register
1527 * spilling.
1528 */
1529void
1530brw_oword_block_read_scratch(struct brw_compile *p,
1531			     struct brw_reg dest,
1532			     struct brw_reg mrf,
1533			     int num_regs,
1534			     GLuint offset)
1535{
1536   uint32_t msg_control;
1537   int rlen;
1538
1539   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1540   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1541
1542   if (num_regs == 1) {
1543      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1544      rlen = 1;
1545   } else {
1546      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1547      rlen = 2;
1548   }
1549
1550   {
1551      brw_push_insn_state(p);
1552      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1553      brw_set_mask_control(p, BRW_MASK_DISABLE);
1554
1555      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1556
1557      /* set message header global offset field (reg 0, element 2) */
1558      brw_MOV(p,
1559	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1560				  mrf.nr,
1561				  2), BRW_REGISTER_TYPE_UD),
1562	      brw_imm_ud(offset));
1563
1564      brw_pop_insn_state(p);
1565   }
1566
1567   {
1568      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1569
1570      assert(insn->header.predicate_control == 0);
1571      insn->header.compression_control = BRW_COMPRESSION_NONE;
1572      insn->header.destreg__conditionalmod = mrf.nr;
1573
1574      brw_set_dest(p, insn, dest);	/* UW? */
1575      brw_set_src0(insn, brw_null_reg());
1576
1577      brw_set_dp_read_message(p->brw,
1578			      insn,
1579			      255, /* binding table index (255=stateless) */
1580			      msg_control,
1581			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1582			      1, /* target cache (render/scratch) */
1583			      1, /* msg_length */
1584			      rlen);
1585   }
1586}
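
/* Illustrative sketch (assumes the scratch-write counterpart earlier in this
 * file mirrors this reader's (p, mrf, num_regs, offset) shape): spilling and
 * refilling one GRF might look like
 *
 *    brw_MOV(p, retype(brw_message_reg(2), BRW_REGISTER_TYPE_UD), reg);
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 1, slot_offset);
 *    ...
 *    brw_oword_block_read_scratch(p, reg, brw_message_reg(1), 1, slot_offset);
 *
 * 'reg' and 'slot_offset' are hypothetical; slot_offset must be 16-byte
 * aligned per the comment above.
 */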
1587
1588/**
1589 * Read a float[4] vector from the data port Data Cache (const buffer).
1590 * Location (in buffer) should be a multiple of 16.
1591 * Used for fetching shader constants.
1592 */
1593void brw_oword_block_read(struct brw_compile *p,
1594			  struct brw_reg dest,
1595			  struct brw_reg mrf,
1596			  uint32_t offset,
1597			  uint32_t bind_table_index)
1598{
1599   struct intel_context *intel = &p->brw->intel;
1600
1601   /* On newer hardware, offset is in units of owords. */
1602   if (intel->gen >= 6)
1603      offset /= 16;
1604
1605   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1606
1607   brw_push_insn_state(p);
1608   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1609   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1610   brw_set_mask_control(p, BRW_MASK_DISABLE);
1611
1612   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1613
1614   /* set message header global offset field (reg 0, element 2) */
1615   brw_MOV(p,
1616	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1617			       mrf.nr,
1618			       2), BRW_REGISTER_TYPE_UD),
1619	   brw_imm_ud(offset));
1620
1621   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1622   insn->header.destreg__conditionalmod = mrf.nr;
1623
1624   /* cast dest to a uword[8] vector */
1625   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1626
1627   brw_set_dest(p, insn, dest);
1628   if (intel->gen >= 6) {
1629      brw_set_src0(insn, mrf);
1630   } else {
1631      brw_set_src0(insn, brw_null_reg());
1632   }
1633
1634   brw_set_dp_read_message(p->brw,
1635			   insn,
1636			   bind_table_index,
1637			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1638			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1639			   0, /* source cache = data cache */
1640			   1, /* msg_length */
1641			   1); /* response_length (1 reg, 2 owords!) */
1642
1643   brw_pop_insn_state(p);
1644}
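
/* Illustrative sketch ('surf_index' and 'byte_offset' are hypothetical caller
 * choices): fetching one float[4] constant from a constant-buffer surface
 * might look like
 *
 *    brw_oword_block_read(p, dest, brw_message_reg(1), byte_offset, surf_index);
 *
 * with byte_offset a multiple of 16; the gen6+ oword scaling is applied
 * internally, as noted above.
 */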
1645
1646/**
1647 * Read a set of dwords from the data port Data Cache (const buffer).
1648 *
1649 * The dword locations within the buffer are supplied as UD offsets in the
1650 * register following the provided mrf header register.
1651 */
1652void brw_dword_scattered_read(struct brw_compile *p,
1653			      struct brw_reg dest,
1654			      struct brw_reg mrf,
1655			      uint32_t bind_table_index)
1656{
1657   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1658
1659   brw_push_insn_state(p);
1660   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1661   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1662   brw_set_mask_control(p, BRW_MASK_DISABLE);
1663   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1664   brw_pop_insn_state(p);
1665
1666   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1667   insn->header.destreg__conditionalmod = mrf.nr;
1668
1669   /* cast dest to a uword[8] vector */
1670   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1671
1672   brw_set_dest(p, insn, dest);
1673   brw_set_src0(insn, brw_null_reg());
1674
1675   brw_set_dp_read_message(p->brw,
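   /* msg_length is 2: the header in 'mrf' (copied from g0 above) plus the
    * register at mrf.nr + 1, which the caller loads with the per-channel
    * dword offsets.
    */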
1676			   insn,
1677			   bind_table_index,
1678			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1679			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1680			   0, /* source cache = data cache */
1681			   2, /* msg_length */
1682			   1); /* response_length */
1683}
1684
1685
1686
1687/**
1688 * Read a float[4] constant from the VS constant buffer at a fixed location.
1689 * One float[4] constant is read into the lower half of 'dest'; relative
1690 * addressing is handled by brw_dp_READ_4_vs_relative() below.
1691 */
1692void brw_dp_READ_4_vs(struct brw_compile *p,
1693                      struct brw_reg dest,
1694                      GLuint location,
1695                      GLuint bind_table_index)
1696{
1697   struct brw_instruction *insn;
1698   GLuint msg_reg_nr = 1;
1699   struct brw_reg b;
1700
1701   /*
1702   printf("vs const read msg, location %u, msg_reg_nr %d\n",
1703          location, msg_reg_nr);
1704   */
1705
1706   /* Setup MRF[1] with location/offset into const buffer */
1707   brw_push_insn_state(p);
1708   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1709   brw_set_mask_control(p, BRW_MASK_DISABLE);
1710   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1711
1712   /* XXX We appear to set all the dwords of MRF[1] to 'location', while
1713    * the docs say only dword[2] needs to be set.  It works regardless.
1714    */
1715   b = brw_message_reg(msg_reg_nr);
1716   b = retype(b, BRW_REGISTER_TYPE_UD);
1717   /*b = get_element_ud(b, 2);*/
1718   brw_MOV(p, b, brw_imm_ud(location));
1719
1720   brw_pop_insn_state(p);
1721
1722   insn = next_insn(p, BRW_OPCODE_SEND);
1723
1724   insn->header.predicate_control = BRW_PREDICATE_NONE;
1725   insn->header.compression_control = BRW_COMPRESSION_NONE;
1726   insn->header.destreg__conditionalmod = msg_reg_nr;
1727   insn->header.mask_control = BRW_MASK_DISABLE;
1728
1729   brw_set_dest(p, insn, dest);
1730   brw_set_src0(insn, brw_null_reg());
1731
1732   brw_set_dp_read_message(p->brw,
1733			   insn,
1734			   bind_table_index,
1735			   0, /* msg_control */
1736			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1737			   0, /* source cache = data cache */
1738			   1, /* msg_length */
1739			   1); /* response_length: 1 reg (the constant lands in its low oword) */
1740}
1741
1742/**
1743 * Read a float[4] constant per vertex from VS constant buffer, with
1744 * relative addressing.
1745 */
1746void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1747			       struct brw_reg dest,
1748			       struct brw_reg addr_reg,
1749			       GLuint offset,
1750			       GLuint bind_table_index)
1751{
1752   struct intel_context *intel = &p->brw->intel;
1753   int msg_type;
1754
1755   /* Setup MRF[1] with offset into const buffer */
1756   brw_push_insn_state(p);
1757   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1758   brw_set_mask_control(p, BRW_MASK_DISABLE);
1759   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1760
1761   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1762    * fields ignored.
1763    */
1764   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
1765	   addr_reg, brw_imm_d(offset));
1766   brw_pop_insn_state(p);
1767
1768   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1769
1770   insn->header.predicate_control = BRW_PREDICATE_NONE;
1771   insn->header.compression_control = BRW_COMPRESSION_NONE;
1772   insn->header.destreg__conditionalmod = 0;
1773   insn->header.mask_control = BRW_MASK_DISABLE;
1774
1775   brw_set_dest(p, insn, dest);
1776   brw_set_src0(insn, brw_vec8_grf(0, 0));
1777
1778   if (intel->gen == 6)
1779      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1780   else if (intel->gen == 5 || intel->is_g4x)
1781      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1782   else
1783      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1784
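   /* Two-register payload: the g0-derived header plus M1 with the block
    * offsets written above.  The single response register carries the two
    * owords (two float[4] constants) read back.
    */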
1785   brw_set_dp_read_message(p->brw,
1786			   insn,
1787			   bind_table_index,
1788			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1789			   msg_type,
1790			   0, /* source cache = data cache */
1791			   2, /* msg_length */
1792			   1); /* response_length */
1793}
1794
1795
1796
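/**
 * Render target (framebuffer) write.
 *
 * Emits a SEND (or SENDC, on gen6+ when binding_table_index is 0) carrying a
 * 'msg_length'-register payload to 'binding_table_index'.  On gen6+ the
 * payload is taken from the message registers starting at 'msg_reg_nr' and is
 * sent headerless when msg_length == 4 (just the colors).  'eot' terminates
 * the thread when set.
 */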
1797void brw_fb_WRITE(struct brw_compile *p,
1798		  int dispatch_width,
1799                  struct brw_reg dest,
1800                  GLuint msg_reg_nr,
1801                  struct brw_reg src0,
1802                  GLuint binding_table_index,
1803                  GLuint msg_length,
1804                  GLuint response_length,
1805                  GLboolean eot)
1806{
1807   struct intel_context *intel = &p->brw->intel;
1808   struct brw_instruction *insn;
1809   GLuint msg_control, msg_type;
1810   GLboolean header_present = GL_TRUE;
1811
1812   if (intel->gen >= 6 && binding_table_index == 0) {
1813      insn = next_insn(p, BRW_OPCODE_SENDC);
1814   } else {
1815      insn = next_insn(p, BRW_OPCODE_SEND);
1816   }
1817   /* The execution mask is ignored for render target writes. */
1818   insn->header.predicate_control = 0;
1819   insn->header.compression_control = BRW_COMPRESSION_NONE;
1820
1821   if (intel->gen >= 6) {
1822      if (msg_length == 4)
1823	 header_present = GL_FALSE;
1824
1825      /* Payload comes from the message registers; headerless (just colors) when msg_length == 4. */
1826      src0 = brw_message_reg(msg_reg_nr);
1827
1828      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6;
1829   } else {
1830      insn->header.destreg__conditionalmod = msg_reg_nr;
1831
1832      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1833   }
1834
1835   if (dispatch_width == 16)
1836      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1837   else
1838      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1839
1840   brw_set_dest(p, insn, dest);
1841   brw_set_src0(insn, src0);
1842   brw_set_dp_write_message(p->brw,
1843			    insn,
1844			    binding_table_index,
1845			    msg_control,
1846			    msg_type,
1847			    msg_length,
1848			    header_present,
1849			    1,	/* pixel scoreboard */
1850			    response_length,
1851			    eot,
1852			    0 /* send_commit_msg */);
1853}
1854
1855
1856/**
1857 * Texture sample instruction.
1858 * Note: the msg_type plus msg_length values determine exactly what kind
1859 * of sampling operation is performed.  See volume 4, page 161 of docs.
1860 */
1861void brw_SAMPLE(struct brw_compile *p,
1862		struct brw_reg dest,
1863		GLuint msg_reg_nr,
1864		struct brw_reg src0,
1865		GLuint binding_table_index,
1866		GLuint sampler,
1867		GLuint writemask,
1868		GLuint msg_type,
1869		GLuint response_length,
1870		GLuint msg_length,
1871		GLboolean eot,
1872		GLuint header_present,
1873		GLuint simd_mode)
1874{
1875   struct intel_context *intel = &p->brw->intel;
1876   GLboolean need_stall = 0;
1877
1878   if (writemask == 0) {
1879      /*printf("%s: zero writemask??\n", __FUNCTION__); */
1880      return;
1881   }
1882
1883   /* Hardware doesn't do destination dependency checking on send
1884    * instructions properly.  Add a workaround which generates the
1885    * dependency by other means.  In practice it seems like this bug
1886    * only crops up for texture samples, and only where registers are
1887    * written by the send and then written again later without being
1888    * read in between.  Luckily for us, we already track that
1889    * information and use it to modify the writemask for the
1890    * instruction, so that is a guide for whether a workaround is
1891    * needed.
1892    */
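   /* (E.g. a .xw request cannot be expressed as one contiguous channel range,
    * so we fall back to the stalling MOV emitted at the end of this function;
    * a contiguous .yz request is instead handled by trimming the returned
    * channels via the message header set up below.)
    */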
1893   if (writemask != WRITEMASK_XYZW) {
1894      GLuint dst_offset = 0;
1895      GLuint i, newmask = 0, len = 0;
1896
1897      for (i = 0; i < 4; i++) {
1898	 if (writemask & (1<<i))
1899	    break;
1900	 dst_offset += 2;
1901      }
1902      for (; i < 4; i++) {
1903	 if (!(writemask & (1<<i)))
1904	    break;
1905	 newmask |= 1<<i;
1906	 len++;
1907      }
1908
1909      if (newmask != writemask) {
1910	 need_stall = 1;
1911         /* printf("need stall %x %x\n", newmask , writemask); */
1912      }
1913      else {
1914	 GLboolean dispatch_16 = GL_FALSE;
1915
1916	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
1917
1918	 guess_execution_size(p, p->current, dest);
1919	 if (p->current->header.execution_size == BRW_EXECUTE_16)
1920	    dispatch_16 = GL_TRUE;
1921
1922	 newmask = ~newmask & WRITEMASK_XYZW;
1923
1924	 brw_push_insn_state(p);
1925
1926	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1927	 brw_set_mask_control(p, BRW_MASK_DISABLE);
1928
1929	 brw_MOV(p, m1, brw_vec8_grf(0,0));
1930  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
1931
1932	 brw_pop_insn_state(p);
1933
1934  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
1935	 dest = offset(dest, dst_offset);
1936
1937	 /* For 16-wide dispatch, masked channels are skipped in the
1938	  * response.  For 8-wide, masked channels still take up slots,
1939	  * and are just not written to.
1940	  */
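	 /* (E.g. a .yz writemask gives len == 2, so a SIMD16 sample returns
	  * 4 response registers instead of 8.)
	  */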
1941	 if (dispatch_16)
1942	    response_length = len * 2;
1943      }
1944   }
1945
1946   {
1947      struct brw_instruction *insn;
1948
1949      /* Sandybridge doesn't have the implied move for SENDs,
1950       * and the first message register index comes from src0.
1951       */
1952      if (intel->gen >= 6) {
1953	 if (src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1954	     src0.nr != BRW_ARF_NULL) {
1955	    brw_push_insn_state(p);
1956	    brw_set_mask_control( p, BRW_MASK_DISABLE );
1957	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1958	    brw_MOV(p, retype(brw_message_reg(msg_reg_nr), src0.type), src0);
1959	    brw_pop_insn_state(p);
1960	 }
1961	 src0 = brw_message_reg(msg_reg_nr);
1962      }
1963
1964      insn = next_insn(p, BRW_OPCODE_SEND);
1965      insn->header.predicate_control = 0; /* XXX */
1966      insn->header.compression_control = BRW_COMPRESSION_NONE;
1967      if (intel->gen < 6)
1968	  insn->header.destreg__conditionalmod = msg_reg_nr;
1969
1970      brw_set_dest(p, insn, dest);
1971      brw_set_src0(insn, src0);
1972      brw_set_sampler_message(p->brw, insn,
1973			      binding_table_index,
1974			      sampler,
1975			      msg_type,
1976			      response_length,
1977			      msg_length,
1978			      eot,
1979			      header_present,
1980			      simd_mode);
1981   }
1982
1983   if (need_stall) {
1984      struct brw_reg reg = vec8(offset(dest, response_length-1));
1985
1986      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
1987       */
1988      brw_push_insn_state(p);
1989      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1990      brw_MOV(p, reg, reg);
1991      brw_pop_insn_state(p);
1992   }
1993
1994}
1995
1996/* All these parameters are rather confusing - we might be better off
1997 * using bitmasks and macros for this, in the old style.  Or perhaps
1998 * just having the caller fill in the dword3 fields itself.
1999 */
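/* brw_urb_WRITE emits a SEND whose payload starts at message register
 * 'msg_reg_nr' (on gen6, src0 is copied there first, since gen6 has no
 * implied move for SEND); allocate, used, writes_complete, offset and
 * swizzle are packed into the descriptor by brw_set_urb_message().
 */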
2000void brw_urb_WRITE(struct brw_compile *p,
2001		   struct brw_reg dest,
2002		   GLuint msg_reg_nr,
2003		   struct brw_reg src0,
2004		   GLboolean allocate,
2005		   GLboolean used,
2006		   GLuint msg_length,
2007		   GLuint response_length,
2008		   GLboolean eot,
2009		   GLboolean writes_complete,
2010		   GLuint offset,
2011		   GLuint swizzle)
2012{
2013   struct intel_context *intel = &p->brw->intel;
2014   struct brw_instruction *insn;
2015
2016   /* Sandybridge doesn't have the implied move for SENDs,
2017    * and the first message register index comes from src0.
2018    */
2019   if (intel->gen >= 6) {
2020      brw_push_insn_state(p);
2021      brw_set_mask_control( p, BRW_MASK_DISABLE );
2022      brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
2023      brw_pop_insn_state(p);
2024      src0 = brw_message_reg(msg_reg_nr);
2025   }
2026
2027   insn = next_insn(p, BRW_OPCODE_SEND);
2028
2029   assert(msg_length < BRW_MAX_MRF);
2030
2031   brw_set_dest(p, insn, dest);
2032   brw_set_src0(insn, src0);
2033   brw_set_src1(insn, brw_imm_d(0));
2034
2035   if (intel->gen < 6)
2036      insn->header.destreg__conditionalmod = msg_reg_nr;
2037
2038   brw_set_urb_message(p->brw,
2039		       insn,
2040		       allocate,
2041		       used,
2042		       msg_length,
2043		       response_length,
2044		       eot,
2045		       writes_complete,
2046		       offset,
2047		       swizzle);
2048}
2049
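/* Return the index of the instruction that closes the current block: the
 * first ENDIF, ELSE or WHILE found after 'start'.  Used below to compute
 * JIP for BREAK and CONTINUE.
 */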
2050static int
2051brw_find_next_block_end(struct brw_compile *p, int start)
2052{
2053   int ip;
2054
2055   for (ip = start + 1; ip < p->nr_insn; ip++) {
2056      struct brw_instruction *insn = &p->store[ip];
2057
2058      switch (insn->header.opcode) {
2059      case BRW_OPCODE_ENDIF:
2060      case BRW_OPCODE_ELSE:
2061      case BRW_OPCODE_WHILE:
2062	 return ip;
2063      }
2064   }
2065   assert(!"not reached");
2066   return start + 1;
2067}
2068
2069/* There is no DO instruction on gen6, so to find the end of the loop
2070 * we have to see if the loop is jumping back before our start
2071 * instruction.
2072 */
2073static int
2074brw_find_loop_end(struct brw_compile *p, int start)
2075{
2076   int ip;
2077   int br = 2;
2078
2079   for (ip = start + 1; ip < p->nr_insn; ip++) {
2080      struct brw_instruction *insn = &p->store[ip];
2081
2082      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2083	 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
2084	    return ip;
2085      }
2086   }
2087   assert(!"not reached");
2088   return start + 1;
2089}
2090
2091/* After program generation, go back and update the UIP and JIP of
2092 * BREAK and CONT instructions to their correct locations.
2093 */
2094void
2095brw_set_uip_jip(struct brw_compile *p)
2096{
2097   struct intel_context *intel = &p->brw->intel;
2098   int ip;
2099   int br = 2;
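   /* Gen6 JIP/UIP offsets are counted in units of 64 bits (half an
    * instruction), hence the factor of two applied to instruction counts
    * below.
    */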
2100
2101   if (intel->gen < 6)
2102      return;
2103
2104   for (ip = 0; ip < p->nr_insn; ip++) {
2105      struct brw_instruction *insn = &p->store[ip];
2106
2107      switch (insn->header.opcode) {
2108      case BRW_OPCODE_BREAK:
2109	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2110	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
2111	 break;
2112      case BRW_OPCODE_CONTINUE:
2113	 /* UIP is set at CONTINUE emit time, since that's when we
2114	  * know where the start of the loop is.
2115	  */
2116	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2117	 assert(insn->bits3.break_cont.uip != 0);
2118	 assert(insn->bits3.break_cont.jip != 0);
2119	 break;
2120      }
2121   }
2122}
2123
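/**
 * Emit an FF sync message; brw_set_ff_sync_message() builds the descriptor
 * from 'allocate', 'response_length' and 'eot'.  As with the other SENDs
 * above, gen6 has no implied move, so src0 is first copied into message
 * register 'msg_reg_nr'.
 */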
2124void brw_ff_sync(struct brw_compile *p,
2125		   struct brw_reg dest,
2126		   GLuint msg_reg_nr,
2127		   struct brw_reg src0,
2128		   GLboolean allocate,
2129		   GLuint response_length,
2130		   GLboolean eot)
2131{
2132   struct intel_context *intel = &p->brw->intel;
2133   struct brw_instruction *insn;
2134
2135   /* Sandybridge doesn't have the implied move for SENDs,
2136    * and the first message register index comes from src0.
2137    */
2138   if (intel->gen >= 6) {
2139      brw_push_insn_state(p);
2140      brw_set_mask_control( p, BRW_MASK_DISABLE );
2141      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
2142	      retype(src0, BRW_REGISTER_TYPE_UD));
2143      brw_pop_insn_state(p);
2144      src0 = brw_message_reg(msg_reg_nr);
2145   }
2146
2147   insn = next_insn(p, BRW_OPCODE_SEND);
2148   brw_set_dest(p, insn, dest);
2149   brw_set_src0(insn, src0);
2150   brw_set_src1(insn, brw_imm_d(0));
2151
2152   if (intel->gen < 6)
2153       insn->header.destreg__conditionalmod = msg_reg_nr;
2154
2155   brw_set_ff_sync_message(p->brw,
2156			   insn,
2157			   allocate,
2158			   response_length,
2159			   eot);
2160}
2161