brw_eu_emit.c revision 14a9153a32255f186a30b500d6db412388f4de28
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37
38
39
40/***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
44static void guess_execution_size(struct brw_compile *p,
45				 struct brw_instruction *insn,
46				 struct brw_reg reg)
47{
48   if (reg.width == BRW_WIDTH_8 && p->compressed)
49      insn->header.execution_size = BRW_EXECUTE_16;
50   else
51      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
52}
53
54
/* Encode the destination-operand fields of an instruction.
 *
 * Handles the four layouts of bits1: direct vs. register-indirect
 * addressing crossed with align1 vs. align16 access mode.  Also sets
 * the instruction's execution size from the destination width.
 */
static void brw_set_dest(struct brw_compile *p,
			 struct brw_instruction *insn,
			 struct brw_reg dest)
{
   /* ARF and MRF destinations use encodings outside the 0..127 GRF range. */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* Horizontal stride 0 is not meaningful for a destination;
	  * quietly substitute stride 1.
	  */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16: subregister is expressed in 16-byte units and a
	  * per-channel writemask applies.
	  */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Same stride-0 -> stride-1 substitution as the direct case. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* even ignored in da16, still need to set as '01' */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
106
107extern int reg_type_size[];
108
109static void
110validate_reg(struct brw_instruction *insn, struct brw_reg reg)
111{
112   int hstride_for_reg[] = {0, 1, 2, 4};
113   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
114   int width_for_reg[] = {1, 2, 4, 8, 16};
115   int execsize_for_reg[] = {1, 2, 4, 8, 16};
116   int width, hstride, vstride, execsize;
117
118   if (reg.file == BRW_IMMEDIATE_VALUE) {
119      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
120       * mean the destination has to be 128-bit aligned and the
121       * destination horiz stride has to be a word.
122       */
123      if (reg.type == BRW_REGISTER_TYPE_V) {
124	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
125		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
126      }
127
128      return;
129   }
130
131   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
132       reg.file == BRW_ARF_NULL)
133      return;
134
135   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
136   hstride = hstride_for_reg[reg.hstride];
137
138   if (reg.vstride == 0xf) {
139      vstride = -1;
140   } else {
141      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
142      vstride = vstride_for_reg[reg.vstride];
143   }
144
145   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
146   width = width_for_reg[reg.width];
147
148   assert(insn->header.execution_size >= 0 &&
149	  insn->header.execution_size < Elements(execsize_for_reg));
150   execsize = execsize_for_reg[insn->header.execution_size];
151
152   /* Restrictions from 3.3.10: Register Region Restrictions. */
153   /* 3. */
154   assert(execsize >= width);
155
156   /* 4. */
157   if (execsize == width && hstride != 0) {
158      assert(vstride == -1 || vstride == width * hstride);
159   }
160
161   /* 5. */
162   if (execsize == width && hstride == 0) {
163      /* no restriction on vstride. */
164   }
165
166   /* 6. */
167   if (width == 1) {
168      assert(hstride == 0);
169   }
170
171   /* 7. */
172   if (execsize == 1 && width == 1) {
173      assert(hstride == 0);
174      assert(vstride == 0);
175   }
176
177   /* 8. */
178   if (vstride == 0 && hstride == 0) {
179      assert(width == 1);
180   }
181
182   /* 10. Check destination issues. */
183}
184
185static void brw_set_src0( struct brw_instruction *insn,
186                          struct brw_reg reg )
187{
188   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
189      assert(reg.nr < 128);
190
191   validate_reg(insn, reg);
192
193   insn->bits1.da1.src0_reg_file = reg.file;
194   insn->bits1.da1.src0_reg_type = reg.type;
195   insn->bits2.da1.src0_abs = reg.abs;
196   insn->bits2.da1.src0_negate = reg.negate;
197   insn->bits2.da1.src0_address_mode = reg.address_mode;
198
199   if (reg.file == BRW_IMMEDIATE_VALUE) {
200      insn->bits3.ud = reg.dw1.ud;
201
202      /* Required to set some fields in src1 as well:
203       */
204      insn->bits1.da1.src1_reg_file = 0; /* arf */
205      insn->bits1.da1.src1_reg_type = reg.type;
206   }
207   else
208   {
209      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
210	 if (insn->header.access_mode == BRW_ALIGN_1) {
211	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
212	    insn->bits2.da1.src0_reg_nr = reg.nr;
213	 }
214	 else {
215	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
216	    insn->bits2.da16.src0_reg_nr = reg.nr;
217	 }
218      }
219      else {
220	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
221
222	 if (insn->header.access_mode == BRW_ALIGN_1) {
223	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
224	 }
225	 else {
226	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
227	 }
228      }
229
230      if (insn->header.access_mode == BRW_ALIGN_1) {
231	 if (reg.width == BRW_WIDTH_1 &&
232	     insn->header.execution_size == BRW_EXECUTE_1) {
233	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
234	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
235	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
236	 }
237	 else {
238	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
239	    insn->bits2.da1.src0_width = reg.width;
240	    insn->bits2.da1.src0_vert_stride = reg.vstride;
241	 }
242      }
243      else {
244	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
245	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
246	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
247	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
248
249	 /* This is an oddity of the fact we're using the same
250	  * descriptions for registers in align_16 as align_1:
251	  */
252	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
253	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
254	 else
255	    insn->bits2.da16.src0_vert_stride = reg.vstride;
256      }
257   }
258}
259
260
/* Encode the src1 operand fields of an instruction.
 *
 * Mirrors brw_set_src0 but targets the bits3 layouts.  src1 cannot be
 * an MRF, and indirect addressing is not supported for src1.
 */
void brw_set_src1( struct brw_instruction *insn,
                   struct brw_reg reg )
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
	 insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
	 /* Align16: subregister is in 16-byte units. */
	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
	 insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 /* A scalar source in a scalar instruction gets the canonical
	  * <0;1,0> region regardless of what the caller specified.
	  */
	 if (reg.width == BRW_WIDTH_1 &&
	     insn->header.execution_size == BRW_EXECUTE_1) {
	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
	 }
	 else {
	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
	    insn->bits3.da1.src1_width = reg.width;
	    insn->bits3.da1.src1_vert_stride = reg.vstride;
	 }
      }
      else {
	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

	 /* This is an oddity of the fact we're using the same
	  * descriptions for registers in align_16 as align_1:
	  */
	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
	 else
	    insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}
327
328
329
/* Fill in the SEND message descriptor for the math function unit.
 *
 * Gen5 uses the math_gen5 layout (with header_present/snapshot fields);
 * earlier parts use the original math layout with an explicit
 * msg_target.  src1 is cleared to immediate 0 first so the descriptor
 * bits start from a known state.
 */
static void brw_set_math_message( struct brw_context *brw,
				  struct brw_instruction *insn,
				  GLuint msg_length,
				  GLuint response_length,
				  GLuint function,
				  GLuint integer_type,
				  GLboolean low_precision,
				  GLboolean saturate,
				  GLuint dataType )
{
   struct intel_context *intel = &brw->intel;
   brw_set_src1(insn, brw_imm_d(0));

   if (intel->gen == 5) {
       insn->bits3.math_gen5.function = function;
       insn->bits3.math_gen5.int_type = integer_type;
       insn->bits3.math_gen5.precision = low_precision;
       insn->bits3.math_gen5.saturate = saturate;
       insn->bits3.math_gen5.data_type = dataType;
       insn->bits3.math_gen5.snapshot = 0;
       insn->bits3.math_gen5.header_present = 0;
       insn->bits3.math_gen5.response_length = response_length;
       insn->bits3.math_gen5.msg_length = msg_length;
       insn->bits3.math_gen5.end_of_thread = 0;
       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
       insn->bits2.send_gen5.end_of_thread = 0;
   } else {
       insn->bits3.math.function = function;
       insn->bits3.math.int_type = integer_type;
       insn->bits3.math.precision = low_precision;
       insn->bits3.math.saturate = saturate;
       insn->bits3.math.data_type = dataType;
       insn->bits3.math.response_length = response_length;
       insn->bits3.math.msg_length = msg_length;
       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
       insn->bits3.math.end_of_thread = 0;
   }
}
368
369
/* Fill in the SEND message descriptor for a gen5+ URB FF_SYNC message.
 *
 * Only the allocate / response_length / end_of_thread knobs vary;
 * the remaining urb_gen5 fields are unused by FF_SYNC and zeroed.
 */
static void brw_set_ff_sync_message(struct brw_context *brw,
				    struct brw_instruction *insn,
				    GLboolean allocate,
				    GLuint response_length,
				    GLboolean end_of_thread)
{
	struct intel_context *intel = &brw->intel;
	brw_set_src1(insn, brw_imm_d(0));

	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.allocate = allocate;
	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.header_present = 1;
	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
	insn->bits3.urb_gen5.msg_length = 1;
	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
	/* On gen6 the SFID lives in the condmod field; on gen5 it is in bits2. */
	if (intel->gen >= 6) {
	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
	} else {
	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
	}
}
396
/* Fill in the SEND message descriptor for a URB write.
 *
 * Gen5+ uses the urb_gen5 layout (with a message header); pre-gen5
 * uses the original urb layout with an explicit msg_target.
 */
static void brw_set_urb_message( struct brw_context *brw,
				 struct brw_instruction *insn,
				 GLboolean allocate,
				 GLboolean used,
				 GLuint msg_length,
				 GLuint response_length,
				 GLboolean end_of_thread,
				 GLboolean complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
    struct intel_context *intel = &brw->intel;
    brw_set_src1(insn, brw_imm_d(0));

    if (intel->gen >= 5) {
        insn->bits3.urb_gen5.opcode = 0;	/* ? */
        insn->bits3.urb_gen5.offset = offset;
        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
        insn->bits3.urb_gen5.allocate = allocate;
        insn->bits3.urb_gen5.used = used;	/* ? */
        insn->bits3.urb_gen5.complete = complete;
        insn->bits3.urb_gen5.header_present = 1;
        insn->bits3.urb_gen5.response_length = response_length;
        insn->bits3.urb_gen5.msg_length = msg_length;
        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
	if (intel->gen >= 6) {
	   /* For SNB, the SFID bits moved to the condmod bits, and
	    * EOT stayed in bits3 above.  Does the EOT bit setting
	    * below on Ironlake even do anything?
	    */
	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
	} else {
	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
	}
    } else {
        insn->bits3.urb.opcode = 0;	/* ? */
        insn->bits3.urb.offset = offset;
        insn->bits3.urb.swizzle_control = swizzle_control;
        insn->bits3.urb.allocate = allocate;
        insn->bits3.urb.used = used;	/* ? */
        insn->bits3.urb.complete = complete;
        insn->bits3.urb.response_length = response_length;
        insn->bits3.urb.msg_length = msg_length;
        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
        insn->bits3.urb.end_of_thread = end_of_thread;
    }
}
445
/* Fill in the SEND message descriptor for a dataport write.
 *
 * Three layouts depending on generation: gen6+ render cache, gen5
 * dp_write_gen5, and the original dp_write layout with an explicit
 * msg_target.
 */
static void brw_set_dp_write_message( struct brw_context *brw,
				      struct brw_instruction *insn,
				      GLuint binding_table_index,
				      GLuint msg_control,
				      GLuint msg_type,
				      GLuint msg_length,
				      GLboolean header_present,
				      GLuint pixel_scoreboard_clear,
				      GLuint response_length,
				      GLuint end_of_thread,
				      GLuint send_commit_msg)
{
   struct intel_context *intel = &brw->intel;
   brw_set_src1(insn, brw_imm_ud(0));

   if (intel->gen >= 6) {
       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
       insn->bits3.dp_render_cache.msg_control = msg_control;
       insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
       insn->bits3.dp_render_cache.msg_type = msg_type;
       insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
       insn->bits3.dp_render_cache.header_present = header_present;
       insn->bits3.dp_render_cache.response_length = response_length;
       insn->bits3.dp_render_cache.msg_length = msg_length;
       insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
	/* XXX really need below? */
       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
       insn->bits2.send_gen5.end_of_thread = end_of_thread;
   } else if (intel->gen == 5) {
       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
       insn->bits3.dp_write_gen5.msg_control = msg_control;
       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
       insn->bits3.dp_write_gen5.msg_type = msg_type;
       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
       insn->bits3.dp_write_gen5.header_present = header_present;
       insn->bits3.dp_write_gen5.response_length = response_length;
       insn->bits3.dp_write_gen5.msg_length = msg_length;
       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
       insn->bits2.send_gen5.end_of_thread = end_of_thread;
   } else {
       insn->bits3.dp_write.binding_table_index = binding_table_index;
       insn->bits3.dp_write.msg_control = msg_control;
       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
       insn->bits3.dp_write.msg_type = msg_type;
       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
       insn->bits3.dp_write.response_length = response_length;
       insn->bits3.dp_write.msg_length = msg_length;
       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
       insn->bits3.dp_write.end_of_thread = end_of_thread;
   }
}
499
/* Fill in the SEND message descriptor for a dataport read.
 *
 * Three layouts depending on generation: gen6+ render cache, gen5
 * dp_read_gen5, and the original dp_read layout (bit positions noted
 * in the trailing comments).  Reads never terminate the thread, so
 * end_of_thread is always 0 here.
 */
static void
brw_set_dp_read_message(struct brw_context *brw,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			GLuint response_length)
{
   struct intel_context *intel = &brw->intel;
   brw_set_src1(insn, brw_imm_d(0));

   if (intel->gen >= 6) {
       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
       insn->bits3.dp_render_cache.msg_control = msg_control;
       insn->bits3.dp_render_cache.pixel_scoreboard_clear = 0;
       insn->bits3.dp_render_cache.msg_type = msg_type;
       insn->bits3.dp_render_cache.send_commit_msg = 0;
       insn->bits3.dp_render_cache.header_present = 1;
       insn->bits3.dp_render_cache.response_length = response_length;
       insn->bits3.dp_render_cache.msg_length = msg_length;
       insn->bits3.dp_render_cache.end_of_thread = 0;
       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_READ;
	/* XXX really need below? */
       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
       insn->bits2.send_gen5.end_of_thread = 0;
   } else if (intel->gen == 5) {
       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
       insn->bits3.dp_read_gen5.msg_control = msg_control;
       insn->bits3.dp_read_gen5.msg_type = msg_type;
       insn->bits3.dp_read_gen5.target_cache = target_cache;
       insn->bits3.dp_read_gen5.header_present = 1;
       insn->bits3.dp_read_gen5.response_length = response_length;
       insn->bits3.dp_read_gen5.msg_length = msg_length;
       insn->bits3.dp_read_gen5.pad1 = 0;
       insn->bits3.dp_read_gen5.end_of_thread = 0;
       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
       insn->bits2.send_gen5.end_of_thread = 0;
   } else {
       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
       insn->bits3.dp_read.end_of_thread = 0;  /*31*/
   }
}
551
/* Fill in the SEND message descriptor for a sampler request.
 *
 * Layout varies by generation: gen5+ (sampler_gen5), G4x
 * (sampler_g4x), and the original layout.  The eot parameter must be
 * 0 -- sampler messages never terminate the thread.
 */
static void brw_set_sampler_message(struct brw_context *brw,
                                    struct brw_instruction *insn,
                                    GLuint binding_table_index,
                                    GLuint sampler,
                                    GLuint msg_type,
                                    GLuint response_length,
                                    GLuint msg_length,
                                    GLboolean eot,
                                    GLuint header_present,
                                    GLuint simd_mode)
{
   struct intel_context *intel = &brw->intel;
   assert(eot == 0);
   brw_set_src1(insn, brw_imm_d(0));

   if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
      insn->bits3.sampler_gen5.header_present = header_present;
      insn->bits3.sampler_gen5.response_length = response_length;
      insn->bits3.sampler_gen5.msg_length = msg_length;
      insn->bits3.sampler_gen5.end_of_thread = eot;
      /* On gen6 the SFID lives in the condmod field; on gen5 in bits2. */
      if (intel->gen >= 6)
	  insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
      else {
	  insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
	  insn->bits2.send_gen5.end_of_thread = eot;
      }
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
      insn->bits3.sampler_g4x.response_length = response_length;
      insn->bits3.sampler_g4x.msg_length = msg_length;
      insn->bits3.sampler_g4x.end_of_thread = eot;
      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      insn->bits3.sampler.response_length = response_length;
      insn->bits3.sampler.msg_length = msg_length;
      insn->bits3.sampler.end_of_thread = eot;
      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
   }
}
601
602
603
604static struct brw_instruction *next_insn( struct brw_compile *p,
605					  GLuint opcode )
606{
607   struct brw_instruction *insn;
608
609   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
610
611   insn = &p->store[p->nr_insn++];
612   memcpy(insn, p->current, sizeof(*insn));
613
614   /* Reset this one-shot flag:
615    */
616
617   if (p->current->header.destreg__conditionalmod) {
618      p->current->header.destreg__conditionalmod = 0;
619      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
620   }
621
622   insn->header.opcode = opcode;
623   return insn;
624}
625
626
627static struct brw_instruction *brw_alu1( struct brw_compile *p,
628					 GLuint opcode,
629					 struct brw_reg dest,
630					 struct brw_reg src )
631{
632   struct brw_instruction *insn = next_insn(p, opcode);
633   brw_set_dest(p, insn, dest);
634   brw_set_src0(insn, src);
635   return insn;
636}
637
638static struct brw_instruction *brw_alu2(struct brw_compile *p,
639					GLuint opcode,
640					struct brw_reg dest,
641					struct brw_reg src0,
642					struct brw_reg src1 )
643{
644   struct brw_instruction *insn = next_insn(p, opcode);
645   brw_set_dest(p, insn, dest);
646   brw_set_src0(insn, src0);
647   brw_set_src1(insn, src1);
648   return insn;
649}
650
651
/***********************************************************************
 * Convenience routines.
 *
 * ALU1/ALU2 stamp out the public one- and two-source instruction
 * emitters (brw_MOV, brw_ADD, ...) as thin wrappers around
 * brw_alu1/brw_alu2 with the matching BRW_OPCODE_* value.
 */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
671
/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * The destreg__conditionalmod value 0x7 below enables those
 * round-increment flag writes on the first instruction.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(rnd, src);						      \
   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
									      \
   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
   add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
}
691
692
/* Instantiate the public instruction emitters declared in brw_eu.h. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)

/* Two-instruction rounding sequences (see ROUND above). */
ROUND(RNDZ)
ROUND(RNDE)
719
720
721struct brw_instruction *brw_ADD(struct brw_compile *p,
722				struct brw_reg dest,
723				struct brw_reg src0,
724				struct brw_reg src1)
725{
726   /* 6.2.2: add */
727   if (src0.type == BRW_REGISTER_TYPE_F ||
728       (src0.file == BRW_IMMEDIATE_VALUE &&
729	src0.type == BRW_REGISTER_TYPE_VF)) {
730      assert(src1.type != BRW_REGISTER_TYPE_UD);
731      assert(src1.type != BRW_REGISTER_TYPE_D);
732   }
733
734   if (src1.type == BRW_REGISTER_TYPE_F ||
735       (src1.file == BRW_IMMEDIATE_VALUE &&
736	src1.type == BRW_REGISTER_TYPE_VF)) {
737      assert(src0.type != BRW_REGISTER_TYPE_UD);
738      assert(src0.type != BRW_REGISTER_TYPE_D);
739   }
740
741   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
742}
743
744struct brw_instruction *brw_MUL(struct brw_compile *p,
745				struct brw_reg dest,
746				struct brw_reg src0,
747				struct brw_reg src1)
748{
749   /* 6.32.38: mul */
750   if (src0.type == BRW_REGISTER_TYPE_D ||
751       src0.type == BRW_REGISTER_TYPE_UD ||
752       src1.type == BRW_REGISTER_TYPE_D ||
753       src1.type == BRW_REGISTER_TYPE_UD) {
754      assert(dest.type != BRW_REGISTER_TYPE_F);
755   }
756
757   if (src0.type == BRW_REGISTER_TYPE_F ||
758       (src0.file == BRW_IMMEDIATE_VALUE &&
759	src0.type == BRW_REGISTER_TYPE_VF)) {
760      assert(src1.type != BRW_REGISTER_TYPE_UD);
761      assert(src1.type != BRW_REGISTER_TYPE_D);
762   }
763
764   if (src1.type == BRW_REGISTER_TYPE_F ||
765       (src1.file == BRW_IMMEDIATE_VALUE &&
766	src1.type == BRW_REGISTER_TYPE_VF)) {
767      assert(src0.type != BRW_REGISTER_TYPE_UD);
768      assert(src0.type != BRW_REGISTER_TYPE_D);
769   }
770
771   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
772	  src0.nr != BRW_ARF_ACCUMULATOR);
773   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
774	  src1.nr != BRW_ARF_ACCUMULATOR);
775
776   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
777}
778
779
780void brw_NOP(struct brw_compile *p)
781{
782   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
783   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
784   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
785   brw_set_src1(insn, brw_imm_ud(0x0));
786}
787
788
789
790
791
792/***********************************************************************
793 * Comparisons, if/else/endif
794 */
795
796struct brw_instruction *brw_JMPI(struct brw_compile *p,
797                                 struct brw_reg dest,
798                                 struct brw_reg src0,
799                                 struct brw_reg src1)
800{
801   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
802
803   insn->header.execution_size = 1;
804   insn->header.compression_control = BRW_COMPRESSION_NONE;
805   insn->header.mask_control = BRW_MASK_DISABLE;
806
807   p->current->header.predicate_control = BRW_PREDICATE_NONE;
808
809   return insn;
810}
811
812/* EU takes the value from the flag register and pushes it onto some
813 * sort of a stack (presumably merging with any flag value already on
814 * the stack).  Within an if block, the flags at the top of the stack
815 * control execution on each channel of the unit, eg. on each of the
816 * 16 pixel values in our wm programs.
817 *
818 * When the matching 'else' instruction is reached (presumably by
819 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
821 *
822 * When the matching 'endif' instruction is reached, the flags are
823 * popped off.  If the stack is now empty, normal execution resumes.
824 *
825 * No attempt is made to deal with stack overflow (14 elements?).
826 */
/* Emit the start of an IF block.
 *
 * In single-program-flow mode (execute_size must be 1) the IF is
 * lowered to a predicated ADD with inverted predication instead of a
 * real IF instruction.  Gen6 uses a jump-count encoding in bits1 with
 * null sources; earlier gens patch the IP register.  The jump target
 * is filled in later by the matching ELSE/ENDIF emitters.
 */
struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   if (p->single_program_flow) {
      assert(execute_size == BRW_EXECUTE_1);

      insn = next_insn(p, BRW_OPCODE_ADD);
      insn->header.predicate_inverse = 1;
   } else {
      insn = next_insn(p, BRW_OPCODE_IF);
   }

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(insn, brw_ip_reg());
      brw_set_src1(insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0; /* patched by ELSE/ENDIF */
      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
       insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Predication was consumed by this instruction; reset the template. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
865
/* Gen6-only IF with an embedded comparison: the IF instruction itself
 * evaluates src0 <conditional> src1, so no separate CMP is required.
 * The jump count is left zero here and patched when the matching
 * ELSE/ENDIF is emitted.
 */
struct brw_instruction *
brw_IF_gen6(struct brw_compile *p, uint32_t conditional,
	    struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   /* Must come after brw_set_dest(), which guesses an execution size
    * from the destination register width.
    */
   insn->header.execution_size = BRW_EXECUTE_8;
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(insn, src0);
   brw_set_src1(insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   /* For gen6 IF, the destreg/cond-modifier field carries the condition. */
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
       insn->header.thread_control = BRW_THREAD_SWITCH;

   return insn;
}
889
/* Emit the ELSE of an IF/ELSE/ENDIF triple and back-patch the matching
 * IF so it jumps here (pre-gen6) or just past here (gen6) when the
 * condition fails.  The ELSE's own jump distance is patched later by
 * brw_ENDIF().
 */
struct brw_instruction *brw_ELSE(struct brw_compile *p,
				 struct brw_instruction *if_insn)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint br = 1;

   /* jump count is for 64bit data chunk each, so one 128bit
      instruction requires 2 chunks. */
   if (intel->gen >= 5)
      br = 2;

   if (p->single_program_flow) {
      /* Uniform control flow: the "IF" was an ADD to IP, and so is the
       * "ELSE"; its jump distance is patched by brw_ENDIF().
       */
      insn = next_insn(p, BRW_OPCODE_ADD);
   } else {
      insn = next_insn(p, BRW_OPCODE_ELSE);
   }

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(insn, brw_ip_reg());
      brw_set_src1(insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = if_insn->header.execution_size;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
       insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Patch the if instruction to point at this instruction.
    */
   if (p->single_program_flow) {
      assert(if_insn->header.opcode == BRW_OPCODE_ADD);

      /* IP-relative byte offset: instructions are 16 bytes each. */
      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
   } else {
      assert(if_insn->header.opcode == BRW_OPCODE_IF);

      if (intel->gen < 6) {
	 if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
	 if_insn->bits3.if_else.pop_count = 0;
	 if_insn->bits3.if_else.pad0 = 0;
      } else {
	 /* Gen6's IF jumps to the instruction after the ELSE. */
	 if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1);
      }
   }

   return insn;
}
945
/* Close an IF or IF/ELSE block: emit the ENDIF (when one is needed) and
 * back-patch the pending IF or ELSE instruction's jump fields so they
 * point at, or just past, this ENDIF.
 *
 * patch_insn is whichever instruction still has a zero jump count: the
 * IF when there was no ELSE, otherwise the ELSE.
 */
void brw_ENDIF(struct brw_compile *p,
	       struct brw_instruction *patch_insn)
{
   struct intel_context *intel = &p->brw->intel;
   GLuint br = 1;

   /* Pre-gen5 jump counts are in whole instructions; gen5+ counts
    * 64-bit chunks, two per 128-bit instruction.
    */
   if (intel->gen >= 5)
      br = 2;

   if (p->single_program_flow) {
      /* In single program flow mode, there's no need to execute an ENDIF,
       * since we don't need to do any stack operations, and if we're executing
       * currently, we want to just continue executing.
       */
      struct brw_instruction *next = &p->store[p->nr_insn];

      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);

      /* Byte offset of the IP-relative jump: 16 bytes per instruction. */
      patch_insn->bits3.ud = (next - patch_insn) * 16;
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);

      if (intel->gen < 6) {
	 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
	 brw_set_src1(insn, brw_imm_d(0x0));
      } else {
	 brw_set_dest(p, insn, brw_imm_w(0));
	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = patch_insn->header.execution_size;
      insn->header.mask_control = BRW_MASK_ENABLE;
      insn->header.thread_control = BRW_THREAD_SWITCH;

      /* The instruction being patched must still be unpatched. */
      if (intel->gen < 6)
	 assert(patch_insn->bits3.if_else.jump_count == 0);
      else
	 assert(patch_insn->bits1.branch_gen6.jump_count == 0);

      /* Patch the if or else instructions to point at this or the next
       * instruction respectively.
       */
      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
	 if (intel->gen < 6) {
	    /* Turn it into an IFF, which means no mask stack operations for
	     * all-false and jumping past the ENDIF.
	     */
	    patch_insn->header.opcode = BRW_OPCODE_IFF;
	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
	    patch_insn->bits3.if_else.pop_count = 0;
	    patch_insn->bits3.if_else.pad0 = 0;
	 } else {
	    /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
	 }
      } else {
	 assert(patch_insn->header.opcode == BRW_OPCODE_ELSE);
	 if (intel->gen < 6) {
	    /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	     * matching ENDIF.
	     */
	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
	    patch_insn->bits3.if_else.pop_count = 1;
	    patch_insn->bits3.if_else.pad0 = 0;
	 } else {
	    /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
	 }
      }

      /* Also pop item off the stack in the endif instruction:
       */
      if (intel->gen < 6) {
	 insn->bits3.if_else.jump_count = 0;
	 insn->bits3.if_else.pop_count = 1;
	 insn->bits3.if_else.pad0 = 0;
      } else {
	 insn->bits1.branch_gen6.jump_count = 2;
      }
   }
}
1030
/* Emit a BREAK instruction.  On pre-gen6, pop_count is recorded in the
 * instruction so the EU can pop that many mask-stack entries when the
 * break is taken; gen6 has no mask stack, so pop_count is unused there.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(insn, brw_ip_reg());
      brw_set_src1(insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = pop_count;
   }
   /* Set after brw_set_dest(), which guesses an execution size from the
    * destination register width.
    */
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1053
1054struct brw_instruction *brw_CONT_gen6(struct brw_compile *p,
1055				      struct brw_instruction *do_insn)
1056{
1057   struct brw_instruction *insn;
1058   int br = 2;
1059
1060   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1061   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1062   brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1063   brw_set_dest(p, insn, brw_ip_reg());
1064   brw_set_src0(insn, brw_ip_reg());
1065   brw_set_src1(insn, brw_imm_d(0x0));
1066
1067   insn->bits3.break_cont.uip = br * (do_insn - insn);
1068
1069   insn->header.compression_control = BRW_COMPRESSION_NONE;
1070   insn->header.execution_size = BRW_EXECUTE_8;
1071   return insn;
1072}
1073
1074struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1075{
1076   struct brw_instruction *insn;
1077   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1078   brw_set_dest(p, insn, brw_ip_reg());
1079   brw_set_src0(insn, brw_ip_reg());
1080   brw_set_src1(insn, brw_imm_d(0x0));
1081   insn->header.compression_control = BRW_COMPRESSION_NONE;
1082   insn->header.execution_size = BRW_EXECUTE_8;
1083   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1084   insn->bits3.if_else.pad0 = 0;
1085   insn->bits3.if_else.pop_count = pop_count;
1086   return insn;
1087}
1088
1089/* DO/WHILE loop:
1090 *
1091 * The DO/WHILE is just an unterminated loop -- break or continue are
1092 * used for control within the loop.  We have a few ways they can be
1093 * done.
1094 *
1095 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1096 * jip and no DO instruction.
1097 *
1098 * For non-uniform control flow pre-gen6, there's a DO instruction to
1099 * push the mask, and a WHILE to jump back, and BREAK to get out and
1100 * pop the mask.
1101 *
1102 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1103 * just points back to the first instruction of the loop.
1104 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; return the address of the next
       * instruction so brw_WHILE() can compute the loop-back distance.
       */
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(insn, brw_null_reg());
      brw_set_src1(insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1129
1130
1131
/* Emit the WHILE closing a brw_DO() loop and fill in its backwards jump
 * to do_insn (on gen6 / single-program-flow, do_insn is the first
 * instruction of the loop body rather than an actual DO).
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p,
                                  struct brw_instruction *do_insn)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint br = 1;

   /* gen5+ jump counts are in 64-bit chunks: two per 128-bit instruction. */
   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = do_insn->header.execution_size;
      assert(insn->header.execution_size == BRW_EXECUTE_8);
   } else {
      if (p->single_program_flow) {
	 /* Uniform control flow: a plain backwards ADD to IP.  The
	  * immediate is a byte offset (16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(insn, brw_ip_reg());
	 brw_set_src1(insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(insn, brw_ip_reg());
	 brw_set_src1(insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   /* Don't let the WHILE itself inherit a leftover predicate. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1180
1181
1182/* FORWARD JUMPS:
1183 */
1184void brw_land_fwd_jump(struct brw_compile *p,
1185		       struct brw_instruction *jmp_insn)
1186{
1187   struct intel_context *intel = &p->brw->intel;
1188   struct brw_instruction *landing = &p->store[p->nr_insn];
1189   GLuint jmpi = 1;
1190
1191   if (intel->gen >= 5)
1192       jmpi = 2;
1193
1194   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1195   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1196
1197   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1198}
1199
1200
1201
1202/* To integrate with the above, it makes sense that the comparison
1203 * instruction should populate the flag register.  It might be simpler
1204 * just to use the flag reg for most WM tasks?
1205 */
1206void brw_CMP(struct brw_compile *p,
1207	     struct brw_reg dest,
1208	     GLuint conditional,
1209	     struct brw_reg src0,
1210	     struct brw_reg src1)
1211{
1212   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1213
1214   insn->header.destreg__conditionalmod = conditional;
1215   brw_set_dest(p, insn, dest);
1216   brw_set_src0(insn, src0);
1217   brw_set_src1(insn, src1);
1218
1219/*    guess_execution_size(insn, src0); */
1220
1221
1222   /* Make it so that future instructions will use the computed flag
1223    * value until brw_set_predicate_control_flag_value() is called
1224    * again.
1225    */
1226   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1227       dest.nr == 0) {
1228      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1229      p->flag_value = 0xff;
1230   }
1231}
1232
/* Issue a 'wait' instruction on notification register n1; the host can
   write that register (e.g. via MMIO) to wake the thread up. */
1235void brw_WAIT (struct brw_compile *p)
1236{
1237   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1238   struct brw_reg src = brw_notification_1_reg();
1239
1240   brw_set_dest(p, insn, src);
1241   brw_set_src0(insn, src);
1242   brw_set_src1(insn, brw_null_reg());
1243   insn->header.execution_size = 0; /* must */
1244   insn->header.predicate_control = 0;
1245   insn->header.compression_control = 0;
1246}
1247
1248
1249/***********************************************************************
1250 * Helpers for the various SEND message types:
1251 */
1252
1253/** Extended math function, float[8].
1254 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint saturate,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      /* Gen6+: math is a native EU instruction; msg_reg_nr, data_type
       * and precision are only used by the pre-gen6 message path below.
       */
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      /* Everything except the integer-divide variants operates on floats. */
      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      brw_set_dest(p, insn, dest);
      brw_set_src0(insn, src);
      brw_set_src1(insn, brw_null_reg());
   } else {
      /* Pre-gen6: math is a SEND to the extended math shared function.
       * POW needs a two-register payload; SINCOS writes two result regs.
       */
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(insn, src);
      brw_set_math_message(p->brw,
			   insn,
			   msg_length, response_length,
			   function,
			   BRW_MATH_INTEGER_UNSIGNED,
			   precision,
			   saturate,
			   data_type);
   }
}
1315
1316/** Extended math function, float[8].
1317 */
/* Gen6+ only: two-source extended math (e.g. POW, the INT_DIV variants)
 * emitted as a native MATH instruction rather than a message.
 */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);
   (void) intel;


   /* Operands must be packed GRF registers. */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);

   /* Everything except the integer-divide variants operates on floats. */
   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions. */
   assert(!src0.negate);
   assert(!src0.abs);
   assert(!src1.negate);
   assert(!src1.abs);

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(insn, src0);
   brw_set_src1(insn, src1);
}
1360
1361/**
1362 * Extended math function, float[16].
1363 * Use 2 send instructions.
1364 */
void brw_math_16( struct brw_compile *p,
		  struct brw_reg dest,
		  GLuint function,
		  GLuint saturate,
		  GLuint msg_reg_nr,
		  struct brw_reg src,
		  GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;

   if (intel->gen >= 6) {
      /* Gen6+: a single native MATH instruction covers all 16 channels,
       * so no message splitting is required.
       */
      insn = next_insn(p, BRW_OPCODE_MATH);

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      brw_set_dest(p, insn, dest);
      brw_set_src0(insn, src);
      brw_set_src1(insn, brw_null_reg());
      return;
   }

   /* Pre-gen6: the math unit only handles SIMD8, so a SIMD16 operation
    * is split into two SENDs, the second covering the upper half.
    */
   /* First instruction:
    */
   brw_push_insn_state(p);
   brw_set_predicate_control_flag_value(p, 0xff);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(insn, src);
   brw_set_math_message(p->brw,
			insn,
			msg_length, response_length,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   /* Second instruction:
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
   insn->header.destreg__conditionalmod = msg_reg_nr+1;

   brw_set_dest(p, insn, offset(dest,1));
   brw_set_src0(insn, src);
   brw_set_math_message(p->brw,
			insn,
			msg_length, response_length,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   brw_pop_insn_state(p);
}
1436
1437
1438/**
1439 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1440 * using a constant offset per channel.
1441 *
1442 * The offset must be aligned to oword size (16 bytes).  Used for
1443 * register spilling.
1444 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control;
   int mlen;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Message length = one header MRF plus the data payload.  Only one-
    * and two-register writes are handled; presumably num_regs is always
    * 1 or 2 here — TODO confirm callers.
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      brw_set_src0(insn, brw_null_reg());

      brw_set_dp_write_message(p->brw,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
			       mlen,
			       GL_TRUE, /* header_present */
			       0, /* pixel scoreboard */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1534
1535
1536/**
1537 * Read a block of owords (half a GRF each) from the scratch buffer
1538 * using a constant index per channel.
1539 *
1540 * Offset must be aligned to oword size (16 bytes).  Used for register
1541 * spilling.
1542 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     GLuint offset)
{
   uint32_t msg_control;
   int rlen;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* Response length = number of GRFs read back.  Only one- and two-
    * register reads are handled; presumably num_regs is always 1 or 2
    * here — TODO confirm callers.
    */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in the MRF: a copy of g0 with the scratch
    * offset written into element 2 (see the matching write path above).
    */
   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest);	/* UW? */
      brw_set_src0(insn, brw_null_reg());

      brw_set_dp_read_message(p->brw,
			      insn,
			      255, /* binding table index (255=stateless) */
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      1, /* target cache (render/scratch) */
			      1, /* msg_length */
			      rlen);
   }
}
1601
1602/**
1603 * Read a float[4] vector from the data port Data Cache (const buffer).
1604 * Location (in buffer) should be a multiple of 16.
1605 * Used for fetching shader constants.
1606 */
void brw_oword_block_read(struct brw_compile *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;

   /* On newer hardware, offset is in units of owords. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   /* Message header: a copy of g0 with the buffer offset in element 2. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   /* Gen6 takes the header from src0 rather than the dest/conditionalmod
    * MRF field.
    */
   if (intel->gen >= 6) {
      brw_set_src0(insn, mrf);
   } else {
      brw_set_src0(insn, brw_null_reg());
   }

   brw_set_dp_read_message(p->brw,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   0, /* source cache = data cache */
			   1, /* msg_length */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
1659
1660/**
1661 * Read a set of dwords from the data port Data Cache (const buffer).
1662 *
1663 * Location (in buffer) appears as UD offsets in the register after
1664 * the provided mrf header reg.
1665 */
void brw_dword_scattered_read(struct brw_compile *p,
			      struct brw_reg dest,
			      struct brw_reg mrf,
			      uint32_t bind_table_index)
{
   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Message header is just a copy of g0; the per-channel dword offsets
    * are expected in the register following mrf (msg_length is 2).
    */
   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_pop_insn_state(p);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   brw_set_src0(insn, brw_null_reg());

   brw_set_dp_read_message(p->brw,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
			   0, /* source cache = data cache */
			   2, /* msg_length */
			   1); /* response_length */
}
1698
1699
1700
1701/**
1702 * Read float[4] constant(s) from VS constant buffer.
1703 * For relative addressing, two float[4] constants will be read into 'dest'.
1704 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1705 */
void brw_dp_READ_4_vs(struct brw_compile *p,
                      struct brw_reg dest,
                      GLuint location,
                      GLuint bind_table_index)
{
   struct brw_instruction *insn;
   GLuint msg_reg_nr = 1;

   /* Setup MRF[1] with location/offset into const buffer */
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   /* The offset goes in element 2 of the message header register. */
   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
		     BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(location));
   brw_pop_insn_state(p);

   insn = next_insn(p, BRW_OPCODE_SEND);

   insn->header.predicate_control = BRW_PREDICATE_NONE;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.destreg__conditionalmod = msg_reg_nr;
   insn->header.mask_control = BRW_MASK_DISABLE;

   brw_set_dest(p, insn, dest);
   brw_set_src0(insn, brw_null_reg());

   brw_set_dp_read_message(p->brw,
			   insn,
			   bind_table_index,
			   0,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			   0, /* source cache = data cache */
			   1, /* msg_length */
			   1); /* response_length (1 Oword) */
}
1743
1744/**
1745 * Read a float[4] constant per vertex from VS constant buffer, with
1746 * relative addressing.
1747 */
void brw_dp_READ_4_vs_relative(struct brw_compile *p,
			       struct brw_reg dest,
			       struct brw_reg addr_reg,
			       GLuint offset,
			       GLuint bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;
   int msg_type;

   /* Setup MRF[1] with offset into const buffer */
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
    * fields ignored.
    */
   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
	   addr_reg, brw_imm_d(offset));
   brw_pop_insn_state(p);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

   insn->header.predicate_control = BRW_PREDICATE_NONE;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.destreg__conditionalmod = 0;
   insn->header.mask_control = BRW_MASK_DISABLE;

   brw_set_dest(p, insn, dest);
   brw_set_src0(insn, brw_vec8_grf(0, 0));

   /* The oword dual-block read message type was renumbered on g4x and
    * again on gen6.
    */
   if (intel->gen == 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (intel->gen == 5 || intel->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   brw_set_dp_read_message(p->brw,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
			   msg_type,
			   0, /* source cache = data cache */
			   2, /* msg_length */
			   1); /* response_length */
}
1796
1797
1798
1799void brw_fb_WRITE(struct brw_compile *p,
1800		  int dispatch_width,
1801                  struct brw_reg dest,
1802                  GLuint msg_reg_nr,
1803                  struct brw_reg src0,
1804                  GLuint binding_table_index,
1805                  GLuint msg_length,
1806                  GLuint response_length,
1807                  GLboolean eot)
1808{
1809   struct intel_context *intel = &p->brw->intel;
1810   struct brw_instruction *insn;
1811   GLuint msg_control, msg_type;
1812   GLboolean header_present = GL_TRUE;
1813
1814   if (intel->gen >= 6 && binding_table_index == 0) {
1815      insn = next_insn(p, BRW_OPCODE_SENDC);
1816   } else {
1817      insn = next_insn(p, BRW_OPCODE_SEND);
1818   }
1819   /* The execution mask is ignored for render target writes. */
1820   insn->header.predicate_control = 0;
1821   insn->header.compression_control = BRW_COMPRESSION_NONE;
1822
1823   if (intel->gen >= 6) {
1824      if (msg_length == 4)
1825	 header_present = GL_FALSE;
1826
1827       /* headerless version, just submit color payload */
1828       src0 = brw_message_reg(msg_reg_nr);
1829
1830       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6;
1831   } else {
1832      insn->header.destreg__conditionalmod = msg_reg_nr;
1833
1834      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1835   }
1836
1837   if (dispatch_width == 16)
1838      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1839   else
1840      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1841
1842   brw_set_dest(p, insn, dest);
1843   brw_set_src0(insn, src0);
1844   brw_set_dp_write_message(p->brw,
1845			    insn,
1846			    binding_table_index,
1847			    msg_control,
1848			    msg_type,
1849			    msg_length,
1850			    header_present,
1851			    1,	/* pixel scoreboard */
1852			    response_length,
1853			    eot,
1854			    0 /* send_commit_msg */);
1855}
1856
1857
1858/**
1859 * Texture sample instruction.
1860 * Note: the msg_type plus msg_length values determine exactly what kind
1861 * of sampling operation is performed.  See volume 4, page 161 of docs.
1862 */
1863void brw_SAMPLE(struct brw_compile *p,
1864		struct brw_reg dest,
1865		GLuint msg_reg_nr,
1866		struct brw_reg src0,
1867		GLuint binding_table_index,
1868		GLuint sampler,
1869		GLuint writemask,
1870		GLuint msg_type,
1871		GLuint response_length,
1872		GLuint msg_length,
1873		GLboolean eot,
1874		GLuint header_present,
1875		GLuint simd_mode)
1876{
1877   struct intel_context *intel = &p->brw->intel;
1878   GLboolean need_stall = 0;
1879
1880   if (writemask == 0) {
1881      /*printf("%s: zero writemask??\n", __FUNCTION__); */
1882      return;
1883   }
1884
1885   /* Hardware doesn't do destination dependency checking on send
1886    * instructions properly.  Add a workaround which generates the
1887    * dependency by other means.  In practice it seems like this bug
1888    * only crops up for texture samples, and only where registers are
1889    * written by the send and then written again later without being
1890    * read in between.  Luckily for us, we already track that
1891    * information and use it to modify the writemask for the
1892    * instruction, so that is a guide for whether a workaround is
1893    * needed.
1894    */
1895   if (writemask != WRITEMASK_XYZW) {
1896      GLuint dst_offset = 0;
1897      GLuint i, newmask = 0, len = 0;
1898
1899      for (i = 0; i < 4; i++) {
1900	 if (writemask & (1<<i))
1901	    break;
1902	 dst_offset += 2;
1903      }
1904      for (; i < 4; i++) {
1905	 if (!(writemask & (1<<i)))
1906	    break;
1907	 newmask |= 1<<i;
1908	 len++;
1909      }
1910
1911      if (newmask != writemask) {
1912	 need_stall = 1;
1913         /* printf("need stall %x %x\n", newmask , writemask); */
1914      }
1915      else {
1916	 GLboolean dispatch_16 = GL_FALSE;
1917
1918	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
1919
1920	 guess_execution_size(p, p->current, dest);
1921	 if (p->current->header.execution_size == BRW_EXECUTE_16)
1922	    dispatch_16 = GL_TRUE;
1923
1924	 newmask = ~newmask & WRITEMASK_XYZW;
1925
1926	 brw_push_insn_state(p);
1927
1928	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1929	 brw_set_mask_control(p, BRW_MASK_DISABLE);
1930
1931	 brw_MOV(p, m1, brw_vec8_grf(0,0));
1932  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
1933
1934	 brw_pop_insn_state(p);
1935
1936  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
1937	 dest = offset(dest, dst_offset);
1938
1939	 /* For 16-wide dispatch, masked channels are skipped in the
1940	  * response.  For 8-wide, masked channels still take up slots,
1941	  * and are just not written to.
1942	  */
1943	 if (dispatch_16)
1944	    response_length = len * 2;
1945      }
1946   }
1947
1948   {
1949      struct brw_instruction *insn;
1950
1951      /* Sandybridge doesn't have the implied move for SENDs,
1952       * and the first message register index comes from src0.
1953       */
1954      if (intel->gen >= 6) {
1955	 if (src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1956	     src0.nr != BRW_ARF_NULL) {
1957	    brw_push_insn_state(p);
1958	    brw_set_mask_control( p, BRW_MASK_DISABLE );
1959	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1960	    brw_MOV(p, retype(brw_message_reg(msg_reg_nr), src0.type), src0);
1961	    brw_pop_insn_state(p);
1962	 }
1963	 src0 = brw_message_reg(msg_reg_nr);
1964      }
1965
1966      insn = next_insn(p, BRW_OPCODE_SEND);
1967      insn->header.predicate_control = 0; /* XXX */
1968      insn->header.compression_control = BRW_COMPRESSION_NONE;
1969      if (intel->gen < 6)
1970	  insn->header.destreg__conditionalmod = msg_reg_nr;
1971
1972      brw_set_dest(p, insn, dest);
1973      brw_set_src0(insn, src0);
1974      brw_set_sampler_message(p->brw, insn,
1975			      binding_table_index,
1976			      sampler,
1977			      msg_type,
1978			      response_length,
1979			      msg_length,
1980			      eot,
1981			      header_present,
1982			      simd_mode);
1983   }
1984
1985   if (need_stall) {
1986      struct brw_reg reg = vec8(offset(dest, response_length-1));
1987
1988      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
1989       */
1990      brw_push_insn_state(p);
1991      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1992      brw_MOV(p, reg, reg);
1993      brw_pop_insn_state(p);
1994   }
1995
1996}
1997
1998/* All these variables are pretty confusing - we might be better off
1999 * using bitmasks and macros for this, in the old style.  Or perhaps
2000 * just having the caller instantiate the fields in dword3 itself.
2001 */
2002void brw_urb_WRITE(struct brw_compile *p,
2003		   struct brw_reg dest,
2004		   GLuint msg_reg_nr,
2005		   struct brw_reg src0,
2006		   GLboolean allocate,
2007		   GLboolean used,
2008		   GLuint msg_length,
2009		   GLuint response_length,
2010		   GLboolean eot,
2011		   GLboolean writes_complete,
2012		   GLuint offset,
2013		   GLuint swizzle)
2014{
2015   struct intel_context *intel = &p->brw->intel;
2016   struct brw_instruction *insn;
2017
2018   /* Sandybridge doesn't have the implied move for SENDs,
2019    * and the first message register index comes from src0.
2020    */
2021   if (intel->gen >= 6) {
2022      brw_push_insn_state(p);
2023      brw_set_mask_control( p, BRW_MASK_DISABLE );
2024      brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
2025      brw_pop_insn_state(p);
2026      src0 = brw_message_reg(msg_reg_nr);
2027   }
2028
2029   insn = next_insn(p, BRW_OPCODE_SEND);
2030
2031   assert(msg_length < BRW_MAX_MRF);
2032
2033   brw_set_dest(p, insn, dest);
2034   brw_set_src0(insn, src0);
2035   brw_set_src1(insn, brw_imm_d(0));
2036
2037   if (intel->gen < 6)
2038      insn->header.destreg__conditionalmod = msg_reg_nr;
2039
2040   brw_set_urb_message(p->brw,
2041		       insn,
2042		       allocate,
2043		       used,
2044		       msg_length,
2045		       response_length,
2046		       eot,
2047		       writes_complete,
2048		       offset,
2049		       swizzle);
2050}
2051
2052static int
2053brw_find_next_block_end(struct brw_compile *p, int start)
2054{
2055   int ip;
2056
2057   for (ip = start + 1; ip < p->nr_insn; ip++) {
2058      struct brw_instruction *insn = &p->store[ip];
2059
2060      switch (insn->header.opcode) {
2061      case BRW_OPCODE_ENDIF:
2062      case BRW_OPCODE_ELSE:
2063      case BRW_OPCODE_WHILE:
2064	 return ip;
2065      }
2066   }
2067   assert(!"not reached");
2068   return start + 1;
2069}
2070
2071/* There is no DO instruction on gen6, so to find the end of the loop
2072 * we have to see if the loop is jumping back before our start
2073 * instruction.
2074 */
2075static int
2076brw_find_loop_end(struct brw_compile *p, int start)
2077{
2078   int ip;
2079   int br = 2;
2080
2081   for (ip = start + 1; ip < p->nr_insn; ip++) {
2082      struct brw_instruction *insn = &p->store[ip];
2083
2084      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2085	 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
2086	    return ip;
2087      }
2088   }
2089   assert(!"not reached");
2090   return start + 1;
2091}
2092
2093/* After program generation, go back and update the UIP and JIP of
2094 * BREAK and CONT instructions to their correct locations.
2095 */
2096void
2097brw_set_uip_jip(struct brw_compile *p)
2098{
2099   struct intel_context *intel = &p->brw->intel;
2100   int ip;
2101   int br = 2;
2102
2103   if (intel->gen < 6)
2104      return;
2105
2106   for (ip = 0; ip < p->nr_insn; ip++) {
2107      struct brw_instruction *insn = &p->store[ip];
2108
2109      switch (insn->header.opcode) {
2110      case BRW_OPCODE_BREAK:
2111	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2112	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
2113	 break;
2114      case BRW_OPCODE_CONTINUE:
2115	 /* JIP is set at CONTINUE emit time, since that's when we
2116	  * know where the start of the loop is.
2117	  */
2118	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2119	 assert(insn->bits3.break_cont.uip != 0);
2120	 assert(insn->bits3.break_cont.jip != 0);
2121	 break;
2122      }
2123   }
2124}
2125
2126void brw_ff_sync(struct brw_compile *p,
2127		   struct brw_reg dest,
2128		   GLuint msg_reg_nr,
2129		   struct brw_reg src0,
2130		   GLboolean allocate,
2131		   GLuint response_length,
2132		   GLboolean eot)
2133{
2134   struct intel_context *intel = &p->brw->intel;
2135   struct brw_instruction *insn;
2136
2137   /* Sandybridge doesn't have the implied move for SENDs,
2138    * and the first message register index comes from src0.
2139    */
2140   if (intel->gen >= 6) {
2141      brw_push_insn_state(p);
2142      brw_set_mask_control( p, BRW_MASK_DISABLE );
2143      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
2144	      retype(src0, BRW_REGISTER_TYPE_UD));
2145      brw_pop_insn_state(p);
2146      src0 = brw_message_reg(msg_reg_nr);
2147   }
2148
2149   insn = next_insn(p, BRW_OPCODE_SEND);
2150   brw_set_dest(p, insn, dest);
2151   brw_set_src0(insn, src0);
2152   brw_set_src1(insn, brw_imm_d(0));
2153
2154   if (intel->gen < 6)
2155       insn->header.destreg__conditionalmod = msg_reg_nr;
2156
2157   brw_set_ff_sync_message(p->brw,
2158			   insn,
2159			   allocate,
2160			   response_length,
2161			   eot);
2162}
2163