brw_eu_emit.c revision 59c6b775a6aacfe03c84dae62c2fd45d4af9d70b
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37
38
39
40/***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
44static void guess_execution_size(struct brw_compile *p,
45				 struct brw_instruction *insn,
46				 struct brw_reg reg)
47{
48   if (reg.width == BRW_WIDTH_8 && p->compressed)
49      insn->header.execution_size = BRW_EXECUTE_16;
50   else
51      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
52}
53
54
55/**
56 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
57 * registers, implicitly moving the operand to a message register.
58 *
59 * On Sandybridge, this is no longer the case.  This function performs the
60 * explicit move; it should be called before emitting a SEND instruction.
61 */
62static void
63gen6_resolve_implied_move(struct brw_compile *p,
64			  struct brw_reg *src,
65			  GLuint msg_reg_nr)
66{
67   struct intel_context *intel = &p->brw->intel;
68   if (intel->gen != 6)
69      return;
70
71   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
72      brw_push_insn_state(p);
73      brw_set_mask_control(p, BRW_MASK_DISABLE);
74      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
75      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
76	      retype(*src, BRW_REGISTER_TYPE_UD));
77      brw_pop_insn_state(p);
78   }
79   *src = brw_message_reg(msg_reg_nr);
80}
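/* Usage sketch (illustrative; the GRF and MRF numbers are hypothetical): a
 * caller about to emit a SEND whose payload still lives in a GRF resolves the
 * implied move first, after which 'src' refers to the message register:
 *
 *    struct brw_reg src = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &src, 1);
 *    ... emit the SEND using src, which on gen6 now points at m1 ...
 */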
81
82
83static void brw_set_dest(struct brw_compile *p,
84			 struct brw_instruction *insn,
85			 struct brw_reg dest)
86{
87   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
88       dest.file != BRW_MESSAGE_REGISTER_FILE)
89      assert(dest.nr < 128);
90
91   insn->bits1.da1.dest_reg_file = dest.file;
92   insn->bits1.da1.dest_reg_type = dest.type;
93   insn->bits1.da1.dest_address_mode = dest.address_mode;
94
95   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
96      insn->bits1.da1.dest_reg_nr = dest.nr;
97
98      if (insn->header.access_mode == BRW_ALIGN_1) {
99	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
100	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
101	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
102	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
103      }
104      else {
105	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
106	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
107	 /* even though it's ignored in da16, it still needs to be set to '01' */
108	 insn->bits1.da16.dest_horiz_stride = 1;
109      }
110   }
111   else {
112      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
113
114      /* These are different sizes in align1 vs align16:
115       */
116      if (insn->header.access_mode == BRW_ALIGN_1) {
117	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
118	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
119	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
120	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
121      }
122      else {
123	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
124	 /* even though it's ignored in ia16, it still needs to be set to '01' */
125	 insn->bits1.ia16.dest_horiz_stride = 1;
126      }
127   }
128
129   /* NEW: Set the execution size based on dest.width and
130    * insn->compression_control:
131    */
132   guess_execution_size(p, insn, dest);
133}
134
135extern int reg_type_size[];
136
137static void
138validate_reg(struct brw_instruction *insn, struct brw_reg reg)
139{
140   int hstride_for_reg[] = {0, 1, 2, 4};
141   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
142   int width_for_reg[] = {1, 2, 4, 8, 16};
143   int execsize_for_reg[] = {1, 2, 4, 8, 16};
144   int width, hstride, vstride, execsize;
145
146   if (reg.file == BRW_IMMEDIATE_VALUE) {
147      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
148       * mean the destination has to be 128-bit aligned and the
149       * destination horiz stride has to be a word.
150       */
151      if (reg.type == BRW_REGISTER_TYPE_V) {
152	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
153		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
154      }
155
156      return;
157   }
158
159   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
160       reg.nr == BRW_ARF_NULL)
161      return;
162
163   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
164   hstride = hstride_for_reg[reg.hstride];
165
166   if (reg.vstride == 0xf) {
167      vstride = -1;
168   } else {
169      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
170      vstride = vstride_for_reg[reg.vstride];
171   }
172
173   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
174   width = width_for_reg[reg.width];
175
176   assert(insn->header.execution_size >= 0 &&
177	  insn->header.execution_size < Elements(execsize_for_reg));
178   execsize = execsize_for_reg[insn->header.execution_size];
179
180   /* Restrictions from 3.3.10: Register Region Restrictions. */
181   /* 3. */
182   assert(execsize >= width);
183
184   /* 4. */
185   if (execsize == width && hstride != 0) {
186      assert(vstride == -1 || vstride == width * hstride);
187   }
188
189   /* 5. */
190   if (execsize == width && hstride == 0) {
191      /* no restriction on vstride. */
192   }
193
194   /* 6. */
195   if (width == 1) {
196      assert(hstride == 0);
197   }
198
199   /* 7. */
200   if (execsize == 1 && width == 1) {
201      assert(hstride == 0);
202      assert(vstride == 0);
203   }
204
205   /* 8. */
206   if (vstride == 0 && hstride == 0) {
207      assert(width == 1);
208   }
209
210   /* 10. Check destination issues. */
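   /* Worked example (illustrative only): a typical float[8] GRF source uses
    * the <8;8,1> region, i.e. execsize 8, width 8, hstride 1, vstride 8.
    * Rule 3 holds (execsize >= width) and rule 4 holds (vstride == width *
    * hstride).  A scalar, replicated source uses <0;1,0>: width 1 requires
    * hstride 0 (rule 6), and vstride 0 with hstride 0 requires width 1
    * (rule 8).
    */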
211}
212
213static void brw_set_src0( struct brw_instruction *insn,
214                          struct brw_reg reg )
215{
216   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
217      assert(reg.nr < 128);
218
219   validate_reg(insn, reg);
220
221   insn->bits1.da1.src0_reg_file = reg.file;
222   insn->bits1.da1.src0_reg_type = reg.type;
223   insn->bits2.da1.src0_abs = reg.abs;
224   insn->bits2.da1.src0_negate = reg.negate;
225   insn->bits2.da1.src0_address_mode = reg.address_mode;
226
227   if (reg.file == BRW_IMMEDIATE_VALUE) {
228      insn->bits3.ud = reg.dw1.ud;
229
230      /* Required to set some fields in src1 as well:
231       */
232      insn->bits1.da1.src1_reg_file = 0; /* arf */
233      insn->bits1.da1.src1_reg_type = reg.type;
234   }
235   else
236   {
237      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
238	 if (insn->header.access_mode == BRW_ALIGN_1) {
239	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
240	    insn->bits2.da1.src0_reg_nr = reg.nr;
241	 }
242	 else {
243	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
244	    insn->bits2.da16.src0_reg_nr = reg.nr;
245	 }
246      }
247      else {
248	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
249
250	 if (insn->header.access_mode == BRW_ALIGN_1) {
251	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
252	 }
253	 else {
254	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
255	 }
256      }
257
258      if (insn->header.access_mode == BRW_ALIGN_1) {
259	 if (reg.width == BRW_WIDTH_1 &&
260	     insn->header.execution_size == BRW_EXECUTE_1) {
261	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
262	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
263	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
264	 }
265	 else {
266	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
267	    insn->bits2.da1.src0_width = reg.width;
268	    insn->bits2.da1.src0_vert_stride = reg.vstride;
269	 }
270      }
271      else {
272	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
273	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
274	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
275	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
276
277	 /* This is an oddity that arises from using the same register
278	  * descriptions for align_16 as for align_1:
279	  */
280	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
281	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
282	 else
283	    insn->bits2.da16.src0_vert_stride = reg.vstride;
284      }
285   }
286}
287
288
289void brw_set_src1( struct brw_instruction *insn,
290                   struct brw_reg reg )
291{
292   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
293
294   assert(reg.nr < 128);
295
296   validate_reg(insn, reg);
297
298   insn->bits1.da1.src1_reg_file = reg.file;
299   insn->bits1.da1.src1_reg_type = reg.type;
300   insn->bits3.da1.src1_abs = reg.abs;
301   insn->bits3.da1.src1_negate = reg.negate;
302
303   /* Only src1 can be immediate in two-argument instructions.
304    */
305   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
306
307   if (reg.file == BRW_IMMEDIATE_VALUE) {
308      insn->bits3.ud = reg.dw1.ud;
309   }
310   else {
311      /* This is a hardware restriction, which may or may not be lifted
312       * in the future:
313       */
314      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
315      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
316
317      if (insn->header.access_mode == BRW_ALIGN_1) {
318	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
319	 insn->bits3.da1.src1_reg_nr = reg.nr;
320      }
321      else {
322	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
323	 insn->bits3.da16.src1_reg_nr = reg.nr;
324      }
325
326      if (insn->header.access_mode == BRW_ALIGN_1) {
327	 if (reg.width == BRW_WIDTH_1 &&
328	     insn->header.execution_size == BRW_EXECUTE_1) {
329	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
330	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
331	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
332	 }
333	 else {
334	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
335	    insn->bits3.da1.src1_width = reg.width;
336	    insn->bits3.da1.src1_vert_stride = reg.vstride;
337	 }
338      }
339      else {
340	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
341	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
342	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
343	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
344
345	 /* This is an oddity that arises from using the same register
346	  * descriptions for align_16 as for align_1:
347	  */
348	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
349	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
350	 else
351	    insn->bits3.da16.src1_vert_stride = reg.vstride;
352      }
353   }
354}
355
356
357
358static void brw_set_math_message( struct brw_context *brw,
359				  struct brw_instruction *insn,
360				  GLuint msg_length,
361				  GLuint response_length,
362				  GLuint function,
363				  GLuint integer_type,
364				  GLboolean low_precision,
365				  GLboolean saturate,
366				  GLuint dataType )
367{
368   struct intel_context *intel = &brw->intel;
369   brw_set_src1(insn, brw_imm_d(0));
370
371   if (intel->gen == 5) {
372       insn->bits3.math_gen5.function = function;
373       insn->bits3.math_gen5.int_type = integer_type;
374       insn->bits3.math_gen5.precision = low_precision;
375       insn->bits3.math_gen5.saturate = saturate;
376       insn->bits3.math_gen5.data_type = dataType;
377       insn->bits3.math_gen5.snapshot = 0;
378       insn->bits3.math_gen5.header_present = 0;
379       insn->bits3.math_gen5.response_length = response_length;
380       insn->bits3.math_gen5.msg_length = msg_length;
381       insn->bits3.math_gen5.end_of_thread = 0;
382       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
383       insn->bits2.send_gen5.end_of_thread = 0;
384   } else {
385       insn->bits3.math.function = function;
386       insn->bits3.math.int_type = integer_type;
387       insn->bits3.math.precision = low_precision;
388       insn->bits3.math.saturate = saturate;
389       insn->bits3.math.data_type = dataType;
390       insn->bits3.math.response_length = response_length;
391       insn->bits3.math.msg_length = msg_length;
392       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
393       insn->bits3.math.end_of_thread = 0;
394   }
395}
396
397
398static void brw_set_ff_sync_message(struct brw_context *brw,
399				    struct brw_instruction *insn,
400				    GLboolean allocate,
401				    GLuint response_length,
402				    GLboolean end_of_thread)
403{
404	struct intel_context *intel = &brw->intel;
405	brw_set_src1(insn, brw_imm_d(0));
406
407	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
408	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
409	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
410	insn->bits3.urb_gen5.allocate = allocate;
411	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
412	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
413	insn->bits3.urb_gen5.header_present = 1;
414	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
415	insn->bits3.urb_gen5.msg_length = 1;
416	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
417	if (intel->gen >= 6) {
418	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
419	} else {
420	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
421	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
422	}
423}
424
425static void brw_set_urb_message( struct brw_context *brw,
426				 struct brw_instruction *insn,
427				 GLboolean allocate,
428				 GLboolean used,
429				 GLuint msg_length,
430				 GLuint response_length,
431				 GLboolean end_of_thread,
432				 GLboolean complete,
433				 GLuint offset,
434				 GLuint swizzle_control )
435{
436    struct intel_context *intel = &brw->intel;
437    brw_set_src1(insn, brw_imm_d(0));
438
439    if (intel->gen >= 5) {
440        insn->bits3.urb_gen5.opcode = 0;	/* ? */
441        insn->bits3.urb_gen5.offset = offset;
442        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
443        insn->bits3.urb_gen5.allocate = allocate;
444        insn->bits3.urb_gen5.used = used;	/* ? */
445        insn->bits3.urb_gen5.complete = complete;
446        insn->bits3.urb_gen5.header_present = 1;
447        insn->bits3.urb_gen5.response_length = response_length;
448        insn->bits3.urb_gen5.msg_length = msg_length;
449        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
450	if (intel->gen >= 6) {
451	   /* For SNB, the SFID bits moved to the condmod bits, and
452	    * EOT stayed in bits3 above.  Does the EOT bit setting
453	    * below on Ironlake even do anything?
454	    */
455	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
456	} else {
457	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
458	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
459	}
460    } else {
461        insn->bits3.urb.opcode = 0;	/* ? */
462        insn->bits3.urb.offset = offset;
463        insn->bits3.urb.swizzle_control = swizzle_control;
464        insn->bits3.urb.allocate = allocate;
465        insn->bits3.urb.used = used;	/* ? */
466        insn->bits3.urb.complete = complete;
467        insn->bits3.urb.response_length = response_length;
468        insn->bits3.urb.msg_length = msg_length;
469        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
470        insn->bits3.urb.end_of_thread = end_of_thread;
471    }
472}
473
474static void brw_set_dp_write_message( struct brw_context *brw,
475				      struct brw_instruction *insn,
476				      GLuint binding_table_index,
477				      GLuint msg_control,
478				      GLuint msg_type,
479				      GLuint msg_length,
480				      GLboolean header_present,
481				      GLuint pixel_scoreboard_clear,
482				      GLuint response_length,
483				      GLuint end_of_thread,
484				      GLuint send_commit_msg)
485{
486   struct intel_context *intel = &brw->intel;
487   brw_set_src1(insn, brw_imm_ud(0));
488
489   if (intel->gen >= 6) {
490       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
491       insn->bits3.dp_render_cache.msg_control = msg_control;
492       insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
493       insn->bits3.dp_render_cache.msg_type = msg_type;
494       insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
495       insn->bits3.dp_render_cache.header_present = header_present;
496       insn->bits3.dp_render_cache.response_length = response_length;
497       insn->bits3.dp_render_cache.msg_length = msg_length;
498       insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
499
500       /* We always use the render cache for write messages */
501       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
502	/* XXX really need below? */
503       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
504       insn->bits2.send_gen5.end_of_thread = end_of_thread;
505   } else if (intel->gen == 5) {
506       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
507       insn->bits3.dp_write_gen5.msg_control = msg_control;
508       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
509       insn->bits3.dp_write_gen5.msg_type = msg_type;
510       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
511       insn->bits3.dp_write_gen5.header_present = header_present;
512       insn->bits3.dp_write_gen5.response_length = response_length;
513       insn->bits3.dp_write_gen5.msg_length = msg_length;
514       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
515       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
516       insn->bits2.send_gen5.end_of_thread = end_of_thread;
517   } else {
518       insn->bits3.dp_write.binding_table_index = binding_table_index;
519       insn->bits3.dp_write.msg_control = msg_control;
520       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
521       insn->bits3.dp_write.msg_type = msg_type;
522       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
523       insn->bits3.dp_write.response_length = response_length;
524       insn->bits3.dp_write.msg_length = msg_length;
525       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
526       insn->bits3.dp_write.end_of_thread = end_of_thread;
527   }
528}
529
530static void
531brw_set_dp_read_message(struct brw_context *brw,
532			struct brw_instruction *insn,
533			GLuint binding_table_index,
534			GLuint msg_control,
535			GLuint msg_type,
536			GLuint target_cache,
537			GLuint msg_length,
538			GLuint response_length)
539{
540   struct intel_context *intel = &brw->intel;
541   brw_set_src1(insn, brw_imm_d(0));
542
543   if (intel->gen >= 6) {
544       uint32_t target_function;
545
546       if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
547	  target_function = BRW_MESSAGE_TARGET_DATAPORT_READ; /* data cache */
548       else
549	  target_function = BRW_MESSAGE_TARGET_DATAPORT_WRITE; /* render cache */
550
551       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
552       insn->bits3.dp_render_cache.msg_control = msg_control;
553       insn->bits3.dp_render_cache.pixel_scoreboard_clear = 0;
554       insn->bits3.dp_render_cache.msg_type = msg_type;
555       insn->bits3.dp_render_cache.send_commit_msg = 0;
556       insn->bits3.dp_render_cache.header_present = 1;
557       insn->bits3.dp_render_cache.response_length = response_length;
558       insn->bits3.dp_render_cache.msg_length = msg_length;
559       insn->bits3.dp_render_cache.end_of_thread = 0;
560       insn->header.destreg__conditionalmod = target_function;
561	/* XXX really need below? */
562       insn->bits2.send_gen5.sfid = target_function;
563       insn->bits2.send_gen5.end_of_thread = 0;
564   } else if (intel->gen == 5) {
565       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
566       insn->bits3.dp_read_gen5.msg_control = msg_control;
567       insn->bits3.dp_read_gen5.msg_type = msg_type;
568       insn->bits3.dp_read_gen5.target_cache = target_cache;
569       insn->bits3.dp_read_gen5.header_present = 1;
570       insn->bits3.dp_read_gen5.response_length = response_length;
571       insn->bits3.dp_read_gen5.msg_length = msg_length;
572       insn->bits3.dp_read_gen5.pad1 = 0;
573       insn->bits3.dp_read_gen5.end_of_thread = 0;
574       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
575       insn->bits2.send_gen5.end_of_thread = 0;
576   } else if (intel->is_g4x) {
577       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
578       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
579       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
580       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
581       insn->bits3.dp_read_g4x.response_length = response_length;  /*16:19*/
582       insn->bits3.dp_read_g4x.msg_length = msg_length;  /*20:23*/
583       insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
584       insn->bits3.dp_read_g4x.pad1 = 0;
585       insn->bits3.dp_read_g4x.end_of_thread = 0;
586   } else {
587       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
588       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
589       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
590       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
591       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
592       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
593       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
594       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
595       insn->bits3.dp_read.end_of_thread = 0;  /*31*/
596   }
597}
598
599static void brw_set_sampler_message(struct brw_context *brw,
600                                    struct brw_instruction *insn,
601                                    GLuint binding_table_index,
602                                    GLuint sampler,
603                                    GLuint msg_type,
604                                    GLuint response_length,
605                                    GLuint msg_length,
606                                    GLboolean eot,
607                                    GLuint header_present,
608                                    GLuint simd_mode)
609{
610   struct intel_context *intel = &brw->intel;
611   assert(eot == 0);
612   brw_set_src1(insn, brw_imm_d(0));
613
614   if (intel->gen >= 5) {
615      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
616      insn->bits3.sampler_gen5.sampler = sampler;
617      insn->bits3.sampler_gen5.msg_type = msg_type;
618      insn->bits3.sampler_gen5.simd_mode = simd_mode;
619      insn->bits3.sampler_gen5.header_present = header_present;
620      insn->bits3.sampler_gen5.response_length = response_length;
621      insn->bits3.sampler_gen5.msg_length = msg_length;
622      insn->bits3.sampler_gen5.end_of_thread = eot;
623      if (intel->gen >= 6)
624	  insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
625      else {
626	  insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
627	  insn->bits2.send_gen5.end_of_thread = eot;
628      }
629   } else if (intel->is_g4x) {
630      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
631      insn->bits3.sampler_g4x.sampler = sampler;
632      insn->bits3.sampler_g4x.msg_type = msg_type;
633      insn->bits3.sampler_g4x.response_length = response_length;
634      insn->bits3.sampler_g4x.msg_length = msg_length;
635      insn->bits3.sampler_g4x.end_of_thread = eot;
636      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
637   } else {
638      insn->bits3.sampler.binding_table_index = binding_table_index;
639      insn->bits3.sampler.sampler = sampler;
640      insn->bits3.sampler.msg_type = msg_type;
641      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
642      insn->bits3.sampler.response_length = response_length;
643      insn->bits3.sampler.msg_length = msg_length;
644      insn->bits3.sampler.end_of_thread = eot;
645      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
646   }
647}
648
649
650
651static struct brw_instruction *next_insn( struct brw_compile *p,
652					  GLuint opcode )
653{
654   struct brw_instruction *insn;
655
656   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
657
658   insn = &p->store[p->nr_insn++];
659   memcpy(insn, p->current, sizeof(*insn));
660
661   /* Reset this one-shot flag:
662    */
663
664   if (p->current->header.destreg__conditionalmod) {
665      p->current->header.destreg__conditionalmod = 0;
666      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
667   }
668
669   insn->header.opcode = opcode;
670   return insn;
671}
672
673
674static struct brw_instruction *brw_alu1( struct brw_compile *p,
675					 GLuint opcode,
676					 struct brw_reg dest,
677					 struct brw_reg src )
678{
679   struct brw_instruction *insn = next_insn(p, opcode);
680   brw_set_dest(p, insn, dest);
681   brw_set_src0(insn, src);
682   return insn;
683}
684
685static struct brw_instruction *brw_alu2(struct brw_compile *p,
686					GLuint opcode,
687					struct brw_reg dest,
688					struct brw_reg src0,
689					struct brw_reg src1 )
690{
691   struct brw_instruction *insn = next_insn(p, opcode);
692   brw_set_dest(p, insn, dest);
693   brw_set_src0(insn, src0);
694   brw_set_src1(insn, src1);
695   return insn;
696}
697
698
699/***********************************************************************
700 * Convenience routines.
701 */
702#define ALU1(OP)					\
703struct brw_instruction *brw_##OP(struct brw_compile *p,	\
704	      struct brw_reg dest,			\
705	      struct brw_reg src0)   			\
706{							\
707   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
708}
709
710#define ALU2(OP)					\
711struct brw_instruction *brw_##OP(struct brw_compile *p,	\
712	      struct brw_reg dest,			\
713	      struct brw_reg src0,			\
714	      struct brw_reg src1)   			\
715{							\
716   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
717}
718
719/* Rounding operations (other than RNDD) require two instructions - the first
720 * stores a rounded value (possibly the wrong way) in the dest register, but
721 * also sets a per-channel "increment bit" in the flag register.  A predicated
722 * add of 1.0 fixes dest to contain the desired result.
723 */
724#define ROUND(OP)							      \
725void brw_##OP(struct brw_compile *p,					      \
726	      struct brw_reg dest,					      \
727	      struct brw_reg src)					      \
728{									      \
729   struct brw_instruction *rnd, *add;					      \
730   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
731   brw_set_dest(p, rnd, dest);						      \
732   brw_set_src0(rnd, src);						      \
733   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
734									      \
735   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
736   add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
737}
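/* Usage sketch (register choices are hypothetical): rounding a float GRF
 * toward zero emits the RNDZ plus the predicated fix-up ADD described above:
 *
 *    brw_RNDZ(p, brw_vec8_grf(4, 0), brw_vec8_grf(2, 0));
 */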
738
739
740ALU1(MOV)
741ALU2(SEL)
742ALU1(NOT)
743ALU2(AND)
744ALU2(OR)
745ALU2(XOR)
746ALU2(SHR)
747ALU2(SHL)
748ALU2(RSR)
749ALU2(RSL)
750ALU2(ASR)
751ALU1(FRC)
752ALU1(RNDD)
753ALU2(MAC)
754ALU2(MACH)
755ALU1(LZD)
756ALU2(DP4)
757ALU2(DPH)
758ALU2(DP3)
759ALU2(DP2)
760ALU2(LINE)
761ALU2(PLN)
762
763
764ROUND(RNDZ)
765ROUND(RNDE)
766
767
768struct brw_instruction *brw_ADD(struct brw_compile *p,
769				struct brw_reg dest,
770				struct brw_reg src0,
771				struct brw_reg src1)
772{
773   /* 6.2.2: add */
774   if (src0.type == BRW_REGISTER_TYPE_F ||
775       (src0.file == BRW_IMMEDIATE_VALUE &&
776	src0.type == BRW_REGISTER_TYPE_VF)) {
777      assert(src1.type != BRW_REGISTER_TYPE_UD);
778      assert(src1.type != BRW_REGISTER_TYPE_D);
779   }
780
781   if (src1.type == BRW_REGISTER_TYPE_F ||
782       (src1.file == BRW_IMMEDIATE_VALUE &&
783	src1.type == BRW_REGISTER_TYPE_VF)) {
784      assert(src0.type != BRW_REGISTER_TYPE_UD);
785      assert(src0.type != BRW_REGISTER_TYPE_D);
786   }
787
788   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
789}
790
791struct brw_instruction *brw_MUL(struct brw_compile *p,
792				struct brw_reg dest,
793				struct brw_reg src0,
794				struct brw_reg src1)
795{
796   /* 6.32.38: mul */
797   if (src0.type == BRW_REGISTER_TYPE_D ||
798       src0.type == BRW_REGISTER_TYPE_UD ||
799       src1.type == BRW_REGISTER_TYPE_D ||
800       src1.type == BRW_REGISTER_TYPE_UD) {
801      assert(dest.type != BRW_REGISTER_TYPE_F);
802   }
803
804   if (src0.type == BRW_REGISTER_TYPE_F ||
805       (src0.file == BRW_IMMEDIATE_VALUE &&
806	src0.type == BRW_REGISTER_TYPE_VF)) {
807      assert(src1.type != BRW_REGISTER_TYPE_UD);
808      assert(src1.type != BRW_REGISTER_TYPE_D);
809   }
810
811   if (src1.type == BRW_REGISTER_TYPE_F ||
812       (src1.file == BRW_IMMEDIATE_VALUE &&
813	src1.type == BRW_REGISTER_TYPE_VF)) {
814      assert(src0.type != BRW_REGISTER_TYPE_UD);
815      assert(src0.type != BRW_REGISTER_TYPE_D);
816   }
817
818   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
819	  src0.nr != BRW_ARF_ACCUMULATOR);
820   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
821	  src1.nr != BRW_ARF_ACCUMULATOR);
822
823   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
824}
825
826
827void brw_NOP(struct brw_compile *p)
828{
829   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
830   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
831   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
832   brw_set_src1(insn, brw_imm_ud(0x0));
833}
834
835
836
837
838
839/***********************************************************************
840 * Comparisons, if/else/endif
841 */
842
843struct brw_instruction *brw_JMPI(struct brw_compile *p,
844                                 struct brw_reg dest,
845                                 struct brw_reg src0,
846                                 struct brw_reg src1)
847{
848   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
849
850   insn->header.execution_size = 1;
851   insn->header.compression_control = BRW_COMPRESSION_NONE;
852   insn->header.mask_control = BRW_MASK_DISABLE;
853
854   p->current->header.predicate_control = BRW_PREDICATE_NONE;
855
856   return insn;
857}
858
859/* EU takes the value from the flag register and pushes it onto some
860 * sort of a stack (presumably merging with any flag value already on
861 * the stack).  Within an if block, the flags at the top of the stack
862 * control execution on each channel of the unit, eg. on each of the
863 * 16 pixel values in our wm programs.
864 *
865 * When the matching 'else' instruction is reached (presumably by
866 * countdown of the instruction count patched in by our ELSE/ENDIF
867 * functions), the relevant flags are inverted.
868 *
869 * When the matching 'endif' instruction is reached, the flags are
870 * popped off.  If the stack is now empty, normal execution resumes.
871 *
872 * No attempt is made to deal with stack overflow (14 elements?).
873 */
874struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
875{
876   struct intel_context *intel = &p->brw->intel;
877   struct brw_instruction *insn;
878
879   if (p->single_program_flow) {
880      assert(execute_size == BRW_EXECUTE_1);
881
882      insn = next_insn(p, BRW_OPCODE_ADD);
883      insn->header.predicate_inverse = 1;
884   } else {
885      insn = next_insn(p, BRW_OPCODE_IF);
886   }
887
888   /* Override the defaults for this instruction:
889    */
890   if (intel->gen < 6) {
891      brw_set_dest(p, insn, brw_ip_reg());
892      brw_set_src0(insn, brw_ip_reg());
893      brw_set_src1(insn, brw_imm_d(0x0));
894   } else {
895      brw_set_dest(p, insn, brw_imm_w(0));
896      insn->bits1.branch_gen6.jump_count = 0;
897      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
898      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
899   }
900
901   insn->header.execution_size = execute_size;
902   insn->header.compression_control = BRW_COMPRESSION_NONE;
903   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
904   insn->header.mask_control = BRW_MASK_ENABLE;
905   if (!p->single_program_flow)
906       insn->header.thread_control = BRW_THREAD_SWITCH;
907
908   p->current->header.predicate_control = BRW_PREDICATE_NONE;
909
910   return insn;
911}
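/* Usage sketch (registers, variables and condition are hypothetical): the
 * usual pre-gen6 pattern is a CMP into the null register to set the flags,
 * then IF/ELSE/ENDIF around the two arms; ENDIF patches the jump counts of
 * the earlier instructions:
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src_x, brw_imm_f(0.0f));
 *    if_insn = brw_IF(p, BRW_EXECUTE_8);
 *       ... "then" instructions ...
 *    else_insn = brw_ELSE(p, if_insn);
 *       ... "else" instructions ...
 *    brw_ENDIF(p, else_insn);
 */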
912
913struct brw_instruction *
914gen6_IF(struct brw_compile *p, uint32_t conditional,
915	struct brw_reg src0, struct brw_reg src1)
916{
917   struct brw_instruction *insn;
918
919   insn = next_insn(p, BRW_OPCODE_IF);
920
921   brw_set_dest(p, insn, brw_imm_w(0));
922   insn->header.execution_size = BRW_EXECUTE_8;
923   insn->bits1.branch_gen6.jump_count = 0;
924   brw_set_src0(insn, src0);
925   brw_set_src1(insn, src1);
926
927   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
928   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
929   insn->header.destreg__conditionalmod = conditional;
930
931   if (!p->single_program_flow)
932       insn->header.thread_control = BRW_THREAD_SWITCH;
933
934   return insn;
935}
936
937struct brw_instruction *brw_ELSE(struct brw_compile *p,
938				 struct brw_instruction *if_insn)
939{
940   struct intel_context *intel = &p->brw->intel;
941   struct brw_instruction *insn;
942   GLuint br = 1;
943
944   /* The jump count is in units of 64-bit chunks, so one 128-bit
945      instruction requires 2 chunks. */
946   if (intel->gen >= 5)
947      br = 2;
948
949   if (p->single_program_flow) {
950      insn = next_insn(p, BRW_OPCODE_ADD);
951   } else {
952      insn = next_insn(p, BRW_OPCODE_ELSE);
953   }
954
955   if (intel->gen < 6) {
956      brw_set_dest(p, insn, brw_ip_reg());
957      brw_set_src0(insn, brw_ip_reg());
958      brw_set_src1(insn, brw_imm_d(0x0));
959   } else {
960      brw_set_dest(p, insn, brw_imm_w(0));
961      insn->bits1.branch_gen6.jump_count = 0;
962      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
963      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
964   }
965
966   insn->header.compression_control = BRW_COMPRESSION_NONE;
967   insn->header.execution_size = if_insn->header.execution_size;
968   insn->header.mask_control = BRW_MASK_ENABLE;
969   if (!p->single_program_flow)
970       insn->header.thread_control = BRW_THREAD_SWITCH;
971
972   /* Patch the if instruction to point at this instruction.
973    */
974   if (p->single_program_flow) {
975      assert(if_insn->header.opcode == BRW_OPCODE_ADD);
976
977      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
978   } else {
979      assert(if_insn->header.opcode == BRW_OPCODE_IF);
980
981      if (intel->gen < 6) {
982	 if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
983	 if_insn->bits3.if_else.pop_count = 0;
984	 if_insn->bits3.if_else.pad0 = 0;
985      } else {
986	 if_insn->bits1.branch_gen6.jump_count = br * (insn - if_insn + 1);
987      }
988   }
989
990   return insn;
991}
992
993void brw_ENDIF(struct brw_compile *p,
994	       struct brw_instruction *patch_insn)
995{
996   struct intel_context *intel = &p->brw->intel;
997   GLuint br = 1;
998
999   if (intel->gen >= 5)
1000      br = 2;
1001
1002   if (p->single_program_flow) {
1003      /* In single program flow mode, there's no need to execute an ENDIF,
1004       * since we don't need to do any stack operations, and if we're executing
1005       * currently, we want to just continue executing.
1006       */
1007      struct brw_instruction *next = &p->store[p->nr_insn];
1008
1009      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
1010
1011      patch_insn->bits3.ud = (next - patch_insn) * 16;
1012   } else {
1013      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
1014
1015      if (intel->gen < 6) {
1016	 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1017	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1018	 brw_set_src1(insn, brw_imm_d(0x0));
1019      } else {
1020	 brw_set_dest(p, insn, brw_imm_w(0));
1021	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1022	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1023      }
1024
1025      insn->header.compression_control = BRW_COMPRESSION_NONE;
1026      insn->header.execution_size = patch_insn->header.execution_size;
1027      insn->header.mask_control = BRW_MASK_ENABLE;
1028      insn->header.thread_control = BRW_THREAD_SWITCH;
1029
1030      if (intel->gen < 6)
1031	 assert(patch_insn->bits3.if_else.jump_count == 0);
1032      else
1033	 assert(patch_insn->bits1.branch_gen6.jump_count == 0);
1034
1035      /* Patch the if or else instructions to point at this or the next
1036       * instruction respectively.
1037       */
1038      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
1039	 if (intel->gen < 6) {
1040	    /* Turn it into an IFF, which means no mask stack operations for
1041	     * all-false and jumping past the ENDIF.
1042	     */
1043	    patch_insn->header.opcode = BRW_OPCODE_IFF;
1044	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
1045	    patch_insn->bits3.if_else.pop_count = 0;
1046	    patch_insn->bits3.if_else.pad0 = 0;
1047	 } else {
1048	    /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1049	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1050	 }
1051      } else {
1052	 assert(patch_insn->header.opcode == BRW_OPCODE_ELSE);
1053	 if (intel->gen < 6) {
1054	    /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1055	     * matching ENDIF.
1056	     */
1057	    patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
1058	    patch_insn->bits3.if_else.pop_count = 1;
1059	    patch_insn->bits3.if_else.pad0 = 0;
1060	 } else {
1061	    /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1062	    patch_insn->bits1.branch_gen6.jump_count = br * (insn - patch_insn);
1063	 }
1064      }
1065
1066      /* Also pop item off the stack in the endif instruction:
1067       */
1068      if (intel->gen < 6) {
1069	 insn->bits3.if_else.jump_count = 0;
1070	 insn->bits3.if_else.pop_count = 1;
1071	 insn->bits3.if_else.pad0 = 0;
1072      } else {
1073	 insn->bits1.branch_gen6.jump_count = 2;
1074      }
1075   }
1076}
1077
1078struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1079{
1080   struct intel_context *intel = &p->brw->intel;
1081   struct brw_instruction *insn;
1082
1083   insn = next_insn(p, BRW_OPCODE_BREAK);
1084   if (intel->gen >= 6) {
1085      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1086      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1087      brw_set_src1(insn, brw_imm_d(0x0));
1088   } else {
1089      brw_set_dest(p, insn, brw_ip_reg());
1090      brw_set_src0(insn, brw_ip_reg());
1091      brw_set_src1(insn, brw_imm_d(0x0));
1092      insn->bits3.if_else.pad0 = 0;
1093      insn->bits3.if_else.pop_count = pop_count;
1094   }
1095   insn->header.compression_control = BRW_COMPRESSION_NONE;
1096   insn->header.execution_size = BRW_EXECUTE_8;
1097
1098   return insn;
1099}
1100
1101struct brw_instruction *gen6_CONT(struct brw_compile *p,
1102				  struct brw_instruction *do_insn)
1103{
1104   struct brw_instruction *insn;
1105   int br = 2;
1106
1107   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1108   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1109   brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1110   brw_set_dest(p, insn, brw_ip_reg());
1111   brw_set_src0(insn, brw_ip_reg());
1112   brw_set_src1(insn, brw_imm_d(0x0));
1113
1114   insn->bits3.break_cont.uip = br * (do_insn - insn);
1115
1116   insn->header.compression_control = BRW_COMPRESSION_NONE;
1117   insn->header.execution_size = BRW_EXECUTE_8;
1118   return insn;
1119}
1120
1121struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1122{
1123   struct brw_instruction *insn;
1124   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1125   brw_set_dest(p, insn, brw_ip_reg());
1126   brw_set_src0(insn, brw_ip_reg());
1127   brw_set_src1(insn, brw_imm_d(0x0));
1128   insn->header.compression_control = BRW_COMPRESSION_NONE;
1129   insn->header.execution_size = BRW_EXECUTE_8;
1130   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1131   insn->bits3.if_else.pad0 = 0;
1132   insn->bits3.if_else.pop_count = pop_count;
1133   return insn;
1134}
1135
1136/* DO/WHILE loop:
1137 *
1138 * The DO/WHILE is just an unterminated loop -- break or continue are
1139 * used for control within the loop.  We have a few ways they can be
1140 * done.
1141 *
1142 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1143 * jip and no DO instruction.
1144 *
1145 * For non-uniform control flow pre-gen6, there's a DO instruction to
1146 * push the mask, and a WHILE to jump back, and BREAK to get out and
1147 * pop the mask.
1148 *
1149 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1150 * just points back to the first instruction of the loop.
1151 */
1152struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1153{
1154   struct intel_context *intel = &p->brw->intel;
1155
1156   if (intel->gen >= 6 || p->single_program_flow) {
1157      return &p->store[p->nr_insn];
1158   } else {
1159      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1160
1161      /* Override the defaults for this instruction:
1162       */
1163      brw_set_dest(p, insn, brw_null_reg());
1164      brw_set_src0(insn, brw_null_reg());
1165      brw_set_src1(insn, brw_null_reg());
1166
1167      insn->header.compression_control = BRW_COMPRESSION_NONE;
1168      insn->header.execution_size = execute_size;
1169      insn->header.predicate_control = BRW_PREDICATE_NONE;
1170      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1171      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1172
1173      return insn;
1174   }
1175}
1176
1177
1178
1179struct brw_instruction *brw_WHILE(struct brw_compile *p,
1180                                  struct brw_instruction *do_insn)
1181{
1182   struct intel_context *intel = &p->brw->intel;
1183   struct brw_instruction *insn;
1184   GLuint br = 1;
1185
1186   if (intel->gen >= 5)
1187      br = 2;
1188
1189   if (intel->gen >= 6) {
1190      insn = next_insn(p, BRW_OPCODE_WHILE);
1191
1192      brw_set_dest(p, insn, brw_imm_w(0));
1193      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1194      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1195      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1196
1197      insn->header.execution_size = do_insn->header.execution_size;
1198      assert(insn->header.execution_size == BRW_EXECUTE_8);
1199   } else {
1200      if (p->single_program_flow) {
1201	 insn = next_insn(p, BRW_OPCODE_ADD);
1202
1203	 brw_set_dest(p, insn, brw_ip_reg());
1204	 brw_set_src0(insn, brw_ip_reg());
1205	 brw_set_src1(insn, brw_imm_d((do_insn - insn) * 16));
1206	 insn->header.execution_size = BRW_EXECUTE_1;
1207      } else {
1208	 insn = next_insn(p, BRW_OPCODE_WHILE);
1209
1210	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1211
1212	 brw_set_dest(p, insn, brw_ip_reg());
1213	 brw_set_src0(insn, brw_ip_reg());
1214	 brw_set_src1(insn, brw_imm_d(0));
1215
1216	 insn->header.execution_size = do_insn->header.execution_size;
1217	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1218	 insn->bits3.if_else.pop_count = 0;
1219	 insn->bits3.if_else.pad0 = 0;
1220      }
1221   }
1222   insn->header.compression_control = BRW_COMPRESSION_NONE;
1223   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1224
1225   return insn;
1226}
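/* Usage sketch (execution size and variable names are illustrative): the loop
 * helpers above are used as a matched pair, with BREAK/CONT emitted inside
 * the body as needed (the pop_count argument depends on how many IF blocks
 * enclose the BREAK/CONT within the loop):
 *
 *    do_insn = brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body, possibly containing brw_BREAK(p, 0) / brw_CONT(p, 0) ...
 *    brw_WHILE(p, do_insn);
 */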
1227
1228
1229/* FORWARD JUMPS:
1230 */
1231void brw_land_fwd_jump(struct brw_compile *p,
1232		       struct brw_instruction *jmp_insn)
1233{
1234   struct intel_context *intel = &p->brw->intel;
1235   struct brw_instruction *landing = &p->store[p->nr_insn];
1236   GLuint jmpi = 1;
1237
1238   if (intel->gen >= 5)
1239       jmpi = 2;
1240
1241   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1242   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1243
1244   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1245}
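/* Usage sketch (hypothetical variable names): a JMPI, typically emitted with
 * predication already set up, starts out with a zero offset; the instructions
 * to be skipped follow, and the jump is then patched to land on the next
 * instruction to be emitted:
 *
 *    jmp = brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    ... instructions to skip ...
 *    brw_land_fwd_jump(p, jmp);
 */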
1246
1247
1248
1249/* To integrate with the above, it makes sense that the comparison
1250 * instruction should populate the flag register.  It might be simpler
1251 * just to use the flag reg for most WM tasks?
1252 */
1253void brw_CMP(struct brw_compile *p,
1254	     struct brw_reg dest,
1255	     GLuint conditional,
1256	     struct brw_reg src0,
1257	     struct brw_reg src1)
1258{
1259   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1260
1261   insn->header.destreg__conditionalmod = conditional;
1262   brw_set_dest(p, insn, dest);
1263   brw_set_src0(insn, src0);
1264   brw_set_src1(insn, src1);
1265
1266/*    guess_execution_size(insn, src0); */
1267
1268
1269   /* Make it so that future instructions will use the computed flag
1270    * value until brw_set_predicate_control_flag_value() is called
1271    * again.
1272    */
1273   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1274       dest.nr == 0) {
1275      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1276      p->flag_value = 0xff;
1277   }
1278}
1279
1280/* Issue a 'wait' instruction on notification register n1; the host can
1281   program MMIO to wake up the thread. */
1282void brw_WAIT (struct brw_compile *p)
1283{
1284   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1285   struct brw_reg src = brw_notification_1_reg();
1286
1287   brw_set_dest(p, insn, src);
1288   brw_set_src0(insn, src);
1289   brw_set_src1(insn, brw_null_reg());
1290   insn->header.execution_size = 0; /* must */
1291   insn->header.predicate_control = 0;
1292   insn->header.compression_control = 0;
1293}
1294
1295
1296/***********************************************************************
1297 * Helpers for the various SEND message types:
1298 */
1299
1300/** Extended math function, float[8].
1301 */
1302void brw_math( struct brw_compile *p,
1303	       struct brw_reg dest,
1304	       GLuint function,
1305	       GLuint saturate,
1306	       GLuint msg_reg_nr,
1307	       struct brw_reg src,
1308	       GLuint data_type,
1309	       GLuint precision )
1310{
1311   struct intel_context *intel = &p->brw->intel;
1312
1313   if (intel->gen >= 6) {
1314      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1315
1316      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1317      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1318
1319      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1320      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1321
1322      /* Source modifiers are ignored for extended math instructions. */
1323      assert(!src.negate);
1324      assert(!src.abs);
1325
1326      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1327	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1328	 assert(src.type == BRW_REGISTER_TYPE_F);
1329      }
1330
1331      /* Math is the same ISA format as other opcodes, except that CondModifier
1332       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1333       */
1334      insn->header.destreg__conditionalmod = function;
1335      insn->header.saturate = saturate;
1336
1337      brw_set_dest(p, insn, dest);
1338      brw_set_src0(insn, src);
1339      brw_set_src1(insn, brw_null_reg());
1340   } else {
1341      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1342      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1343      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1344      /* Example code doesn't set predicate_control for send
1345       * instructions.
1346       */
1347      insn->header.predicate_control = 0;
1348      insn->header.destreg__conditionalmod = msg_reg_nr;
1349
1350      brw_set_dest(p, insn, dest);
1351      brw_set_src0(insn, src);
1352      brw_set_math_message(p->brw,
1353			   insn,
1354			   msg_length, response_length,
1355			   function,
1356			   BRW_MATH_INTEGER_UNSIGNED,
1357			   precision,
1358			   saturate,
1359			   data_type);
1360   }
1361}
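/* Usage sketch (dst/src and the message register number are hypothetical):
 * a float[8] reciprocal square root, staged through m2 on pre-gen6 hardware
 * (the MRF number is ignored on gen6):
 *
 *    brw_math(p, dst, BRW_MATH_FUNCTION_RSQ, BRW_MATH_SATURATE_NONE, 2, src,
 *             BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
 */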
1362
1363/** Extended math function, float[8].
1364 */
1365void brw_math2(struct brw_compile *p,
1366	       struct brw_reg dest,
1367	       GLuint function,
1368	       struct brw_reg src0,
1369	       struct brw_reg src1)
1370{
1371   struct intel_context *intel = &p->brw->intel;
1372   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1373
1374   assert(intel->gen >= 6);
1375   (void) intel;
1376
1377
1378   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1379   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1380   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1381
1382   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1383   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1384   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1385
1386   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1387       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1388      assert(src0.type == BRW_REGISTER_TYPE_F);
1389      assert(src1.type == BRW_REGISTER_TYPE_F);
1390   }
1391
1392   /* Source modifiers are ignored for extended math instructions. */
1393   assert(!src0.negate);
1394   assert(!src0.abs);
1395   assert(!src1.negate);
1396   assert(!src1.abs);
1397
1398   /* Math is the same ISA format as other opcodes, except that CondModifier
1399    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1400    */
1401   insn->header.destreg__conditionalmod = function;
1402
1403   brw_set_dest(p, insn, dest);
1404   brw_set_src0(insn, src0);
1405   brw_set_src1(insn, src1);
1406}
1407
1408/**
1409 * Extended math function, float[16].
1410 * Use 2 send instructions.
1411 */
1412void brw_math_16( struct brw_compile *p,
1413		  struct brw_reg dest,
1414		  GLuint function,
1415		  GLuint saturate,
1416		  GLuint msg_reg_nr,
1417		  struct brw_reg src,
1418		  GLuint precision )
1419{
1420   struct intel_context *intel = &p->brw->intel;
1421   struct brw_instruction *insn;
1422   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1423   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1424
1425   if (intel->gen >= 6) {
1426      insn = next_insn(p, BRW_OPCODE_MATH);
1427
1428      /* Math is the same ISA format as other opcodes, except that CondModifier
1429       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1430       */
1431      insn->header.destreg__conditionalmod = function;
1432      insn->header.saturate = saturate;
1433
1434      /* Source modifiers are ignored for extended math instructions. */
1435      assert(!src.negate);
1436      assert(!src.abs);
1437
1438      brw_set_dest(p, insn, dest);
1439      brw_set_src0(insn, src);
1440      brw_set_src1(insn, brw_null_reg());
1441      return;
1442   }
1443
1444   /* First instruction:
1445    */
1446   brw_push_insn_state(p);
1447   brw_set_predicate_control_flag_value(p, 0xff);
1448   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1449
1450   insn = next_insn(p, BRW_OPCODE_SEND);
1451   insn->header.destreg__conditionalmod = msg_reg_nr;
1452
1453   brw_set_dest(p, insn, dest);
1454   brw_set_src0(insn, src);
1455   brw_set_math_message(p->brw,
1456			insn,
1457			msg_length, response_length,
1458			function,
1459			BRW_MATH_INTEGER_UNSIGNED,
1460			precision,
1461			saturate,
1462			BRW_MATH_DATA_VECTOR);
1463
1464   /* Second instruction:
1465    */
1466   insn = next_insn(p, BRW_OPCODE_SEND);
1467   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1468   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1469
1470   brw_set_dest(p, insn, offset(dest,1));
1471   brw_set_src0(insn, src);
1472   brw_set_math_message(p->brw,
1473			insn,
1474			msg_length, response_length,
1475			function,
1476			BRW_MATH_INTEGER_UNSIGNED,
1477			precision,
1478			saturate,
1479			BRW_MATH_DATA_VECTOR);
1480
1481   brw_pop_insn_state(p);
1482}
1483
1484
1485/**
1486 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
1487 * using a constant offset per channel.
1488 *
1489 * The offset must be aligned to oword size (16 bytes).  Used for
1490 * register spilling.
1491 */
1492void brw_oword_block_write_scratch(struct brw_compile *p,
1493				   struct brw_reg mrf,
1494				   int num_regs,
1495				   GLuint offset)
1496{
1497   struct intel_context *intel = &p->brw->intel;
1498   uint32_t msg_control, msg_type;
1499   int mlen;
1500
1501   if (intel->gen >= 6)
1502      offset /= 16;
1503
1504   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1505
1506   if (num_regs == 1) {
1507      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1508      mlen = 2;
1509   } else {
1510      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1511      mlen = 3;
1512   }
1513
1514   /* Set up the message header.  This is g0, with g0.2 filled with
1515    * the offset.  We don't want to leave our offset around in g0 or
1516    * it'll screw up texture samples, so set it up inside the message
1517    * reg.
1518    */
1519   {
1520      brw_push_insn_state(p);
1521      brw_set_mask_control(p, BRW_MASK_DISABLE);
1522      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1523
1524      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1525
1526      /* set message header global offset field (reg 0, element 2) */
1527      brw_MOV(p,
1528	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1529				  mrf.nr,
1530				  2), BRW_REGISTER_TYPE_UD),
1531	      brw_imm_ud(offset));
1532
1533      brw_pop_insn_state(p);
1534   }
1535
1536   {
1537      struct brw_reg dest;
1538      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1539      int send_commit_msg;
1540      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1541					 BRW_REGISTER_TYPE_UW);
1542
1543      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1544	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1545	 src_header = vec16(src_header);
1546      }
1547      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1548      insn->header.destreg__conditionalmod = mrf.nr;
1549
1550      /* Until gen6, writes followed by reads from the same location
1551       * are not guaranteed to be ordered unless write_commit is set.
1552       * If set, then a no-op write is issued to the destination
1553       * register to set a dependency, and a read from the destination
1554       * can be used to ensure the ordering.
1555       *
1556       * For gen6, only writes between different threads need ordering
1557       * protection.  Our use of DP writes is all about register
1558       * spilling within a thread.
1559       */
1560      if (intel->gen >= 6) {
1561	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1562	 send_commit_msg = 0;
1563      } else {
1564	 dest = src_header;
1565	 send_commit_msg = 1;
1566      }
1567
1568      brw_set_dest(p, insn, dest);
1569      if (intel->gen >= 6) {
1570	 brw_set_src0(insn, mrf);
1571      } else {
1572	 brw_set_src0(insn, brw_null_reg());
1573      }
1574
1575      if (intel->gen >= 6)
1576	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1577      else
1578	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1579
1580      brw_set_dp_write_message(p->brw,
1581			       insn,
1582			       255, /* binding table index (255=stateless) */
1583			       msg_control,
1584			       msg_type,
1585			       mlen,
1586			       GL_TRUE, /* header_present */
1587			       0, /* pixel scoreboard */
1588			       send_commit_msg, /* response_length */
1589			       0, /* eot */
1590			       send_commit_msg);
1591   }
1592}
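
/* An illustrative call (not part of this file): spill one GRF to the scratch
 * buffer at byte offset 32, staging the message through m2 (header) and m3
 * (data).  The message register and offset are assumptions for this sketch;
 * the caller is assumed to have copied the value being spilled into m3 first.
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(2), 1, 32);
 */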
1593
1594
1595/**
1596 * Read a block of OWORDs (half a GRF each) from the scratch buffer,
1597 * using a constant offset per channel.
1598 *
1599 * Offset must be aligned to oword size (16 bytes).  Used for register
1600 * spilling.
1601 */
1602void
1603brw_oword_block_read_scratch(struct brw_compile *p,
1604			     struct brw_reg dest,
1605			     struct brw_reg mrf,
1606			     int num_regs,
1607			     GLuint offset)
1608{
1609   struct intel_context *intel = &p->brw->intel;
1610   uint32_t msg_control;
1611   int rlen;
1612
1613   if (intel->gen >= 6)
1614      offset /= 16;
1615
1616   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1617   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1618
1619   if (num_regs == 1) {
1620      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1621      rlen = 1;
1622   } else {
1623      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1624      rlen = 2;
1625   }
1626
1627   {
1628      brw_push_insn_state(p);
1629      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1630      brw_set_mask_control(p, BRW_MASK_DISABLE);
1631
1632      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1633
1634      /* set message header global offset field (reg 0, element 2) */
1635      brw_MOV(p,
1636	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1637				  mrf.nr,
1638				  2), BRW_REGISTER_TYPE_UD),
1639	      brw_imm_ud(offset));
1640
1641      brw_pop_insn_state(p);
1642   }
1643
1644   {
1645      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1646
1647      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1648      insn->header.compression_control = BRW_COMPRESSION_NONE;
1649      insn->header.destreg__conditionalmod = mrf.nr;
1650
1651      brw_set_dest(p, insn, dest);	/* dest was retyped to UW above */
1652      if (intel->gen >= 6) {
1653	 brw_set_src0(insn, mrf);
1654      } else {
1655	 brw_set_src0(insn, brw_null_reg());
1656      }
1657
1658      brw_set_dp_read_message(p->brw,
1659			      insn,
1660			      255, /* binding table index (255=stateless) */
1661			      msg_control,
1662			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1663			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1664			      1, /* msg_length */
1665			      rlen);
1666   }
1667}
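
/* An illustrative call (not part of this file): fill a previously spilled
 * value back from byte offset 32 into a destination GRF, reusing m2 for the
 * message header.  The destination register, message register and offset are
 * assumptions for this sketch.
 *
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(10, 0), brw_message_reg(2), 1, 32);
 */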
1668
1669/**
1670 * Read a float[4] vector from the data port Data Cache (const buffer).
1671 * Location (in buffer) should be a multiple of 16.
1672 * Used for fetching shader constants.
1673 */
1674void brw_oword_block_read(struct brw_compile *p,
1675			  struct brw_reg dest,
1676			  struct brw_reg mrf,
1677			  uint32_t offset,
1678			  uint32_t bind_table_index)
1679{
1680   struct intel_context *intel = &p->brw->intel;
1681
1682   /* On gen6+, the offset is expressed in owords (16 bytes) rather than bytes. */
1683   if (intel->gen >= 6)
1684      offset /= 16;
1685
1686   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1687
1688   brw_push_insn_state(p);
1689   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1690   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1691   brw_set_mask_control(p, BRW_MASK_DISABLE);
1692
1693   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1694
1695   /* set message header global offset field (reg 0, element 2) */
1696   brw_MOV(p,
1697	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1698			       mrf.nr,
1699			       2), BRW_REGISTER_TYPE_UD),
1700	   brw_imm_ud(offset));
1701
1702   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1703   insn->header.destreg__conditionalmod = mrf.nr;
1704
1705   /* cast dest to a uword[8] vector */
1706   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1707
1708   brw_set_dest(p, insn, dest);
1709   if (intel->gen >= 6) {
1710      brw_set_src0(insn, mrf);
1711   } else {
1712      brw_set_src0(insn, brw_null_reg());
1713   }
1714
1715   brw_set_dp_read_message(p->brw,
1716			   insn,
1717			   bind_table_index,
1718			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1719			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1720			   0, /* source cache = data cache */
1721			   1, /* msg_length */
1722			   1); /* response_length (1 reg, 2 owords!) */
1723
1724   brw_pop_insn_state(p);
1725}
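
/* An illustrative call (not part of this file): fetch one float[4] constant
 * from the start of the surface in binding-table slot 'const_surf', staging
 * the header in m1.  The destination register, message register and
 * 'const_surf' are assumptions for this sketch.
 *
 *    brw_oword_block_read(p, brw_vec8_grf(2, 0), brw_message_reg(1), 0, const_surf);
 */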
1726
1727/**
1728 * Read a set of dwords from the data port Data Cache (const buffer).
1729 *
1730 * Location (in buffer) appears as UD offsets in the register after
1731 * the provided mrf header reg.
1732 */
1733void brw_dword_scattered_read(struct brw_compile *p,
1734			      struct brw_reg dest,
1735			      struct brw_reg mrf,
1736			      uint32_t bind_table_index)
1737{
1738   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1739
1740   brw_push_insn_state(p);
1741   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1742   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1743   brw_set_mask_control(p, BRW_MASK_DISABLE);
1744   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1745   brw_pop_insn_state(p);
1746
1747   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1748   insn->header.destreg__conditionalmod = mrf.nr;
1749
1750   /* cast dest to a uword[8] vector */
1751   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1752
1753   brw_set_dest(p, insn, dest);
1754   brw_set_src0(insn, brw_null_reg());
1755
1756   brw_set_dp_read_message(p->brw,
1757			   insn,
1758			   bind_table_index,
1759			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1760			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1761			   0, /* source cache = data cache */
1762			   2, /* msg_length */
1763			   1); /* response_length */
1764}
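
/* An illustrative call (not part of this file): gather eight dwords whose
 * byte offsets the caller is assumed to have written into m3 (the register
 * after the m2 header), from the surface in slot 'const_surf'.  The
 * destination register, message registers and 'const_surf' are assumptions.
 *
 *    brw_dword_scattered_read(p, brw_vec8_grf(12, 0), brw_message_reg(2), const_surf);
 */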
1765
1766
1767
1768/**
1769 * Read a float[4] constant from the VS constant buffer at an absolute
1770 * location; it is read into the lower half of 'dest'.  For relative
1771 * addressing, see brw_dp_READ_4_vs_relative() below.
1772 */
1773void brw_dp_READ_4_vs(struct brw_compile *p,
1774                      struct brw_reg dest,
1775                      GLuint location,
1776                      GLuint bind_table_index)
1777{
1778   struct intel_context *intel = &p->brw->intel;
1779   struct brw_instruction *insn;
1780   GLuint msg_reg_nr = 1;
1781
1782   if (intel->gen >= 6)
1783      location /= 16;
1784
1785   /* Set up MRF[1] with the location/offset into the const buffer */
1786   brw_push_insn_state(p);
1787   brw_set_access_mode(p, BRW_ALIGN_1);
1788   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1789   brw_set_mask_control(p, BRW_MASK_DISABLE);
1790   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1791   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1792		     BRW_REGISTER_TYPE_UD),
1793	   brw_imm_ud(location));
1794   brw_pop_insn_state(p);
1795
1796   insn = next_insn(p, BRW_OPCODE_SEND);
1797
1798   insn->header.predicate_control = BRW_PREDICATE_NONE;
1799   insn->header.compression_control = BRW_COMPRESSION_NONE;
1800   insn->header.destreg__conditionalmod = msg_reg_nr;
1801   insn->header.mask_control = BRW_MASK_DISABLE;
1802
1803   brw_set_dest(p, insn, dest);
1804   if (intel->gen >= 6) {
1805      brw_set_src0(insn, brw_message_reg(msg_reg_nr));
1806   } else {
1807      brw_set_src0(insn, brw_null_reg());
1808   }
1809
1810   brw_set_dp_read_message(p->brw,
1811			   insn,
1812			   bind_table_index,
1813			   0,
1814			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1815			   0, /* source cache = data cache */
1816			   1, /* msg_length */
1817			   1); /* response_length (1 Oword) */
1818}
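
/* An illustrative call (not part of this file): fetch the VS constant at
 * 'const_location' from binding-table slot 'const_surf' into g2.  Both names
 * are hypothetical stand-ins for the caller's offset and surface index.
 *
 *    brw_dp_READ_4_vs(p, brw_vec8_grf(2, 0), const_location, const_surf);
 */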
1819
1820/**
1821 * Read a float[4] constant per vertex from VS constant buffer, with
1822 * relative addressing.
1823 */
1824void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1825			       struct brw_reg dest,
1826			       struct brw_reg addr_reg,
1827			       GLuint offset,
1828			       GLuint bind_table_index)
1829{
1830   struct intel_context *intel = &p->brw->intel;
1831   struct brw_reg src = brw_vec8_grf(0, 0);
1832   int msg_type;
1833
1834   /* Set up MRF[1] with the offset into the const buffer */
1835   brw_push_insn_state(p);
1836   brw_set_access_mode(p, BRW_ALIGN_1);
1837   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1838   brw_set_mask_control(p, BRW_MASK_DISABLE);
1839   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1840
1841   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1842    * fields ignored.
1843    */
1844   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
1845	   addr_reg, brw_imm_d(offset));
1846   brw_pop_insn_state(p);
1847
1848   gen6_resolve_implied_move(p, &src, 0);
1849   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1850
1851   insn->header.predicate_control = BRW_PREDICATE_NONE;
1852   insn->header.compression_control = BRW_COMPRESSION_NONE;
1853   insn->header.destreg__conditionalmod = 0;
1854   insn->header.mask_control = BRW_MASK_DISABLE;
1855
1856   brw_set_dest(p, insn, dest);
1857   brw_set_src0(insn, src);
1858
1859   if (intel->gen == 6)
1860      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1861   else if (intel->gen == 5 || intel->is_g4x)
1862      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1863   else
1864      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1865
1866   brw_set_dp_read_message(p->brw,
1867			   insn,
1868			   bind_table_index,
1869			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1870			   msg_type,
1871			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1872			   2, /* msg_length */
1873			   1); /* response_length */
1874}
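
/* An illustrative call (not part of this file): fetch two float[4] constants
 * addressed by the per-vertex indices in 'addr_reg' plus 'reg_offset', from
 * binding-table slot 'const_surf'.  'addr_reg', 'reg_offset' and 'const_surf'
 * are hypothetical names for the caller's values.
 *
 *    brw_dp_READ_4_vs_relative(p, brw_vec8_grf(2, 0), addr_reg, reg_offset, const_surf);
 */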
1875
1876
1877
1878void brw_fb_WRITE(struct brw_compile *p,
1879		  int dispatch_width,
1880                  struct brw_reg dest,
1881                  GLuint msg_reg_nr,
1882                  struct brw_reg src0,
1883                  GLuint binding_table_index,
1884                  GLuint msg_length,
1885                  GLuint response_length,
1886                  GLboolean eot,
1887                  GLboolean header_present)
1888{
1889   struct intel_context *intel = &p->brw->intel;
1890   struct brw_instruction *insn;
1891   GLuint msg_control, msg_type;
1892
1893   if (intel->gen >= 6 && binding_table_index == 0) {
1894      insn = next_insn(p, BRW_OPCODE_SENDC);
1895   } else {
1896      insn = next_insn(p, BRW_OPCODE_SEND);
1897   }
1898   /* The execution mask is ignored for render target writes. */
1899   insn->header.predicate_control = 0;
1900   insn->header.compression_control = BRW_COMPRESSION_NONE;
1901
1902   if (intel->gen >= 6) {
1903       /* headerless version, just submit color payload */
1904       src0 = brw_message_reg(msg_reg_nr);
1905
1906       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1907   } else {
1908      insn->header.destreg__conditionalmod = msg_reg_nr;
1909
1910      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1911   }
1912
1913   if (dispatch_width == 16)
1914      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1915   else
1916      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1917
1918   brw_set_dest(p, insn, dest);
1919   brw_set_src0(insn, src0);
1920   brw_set_dp_write_message(p->brw,
1921			    insn,
1922			    binding_table_index,
1923			    msg_control,
1924			    msg_type,
1925			    msg_length,
1926			    header_present,
1927			    1,	/* pixel scoreboard */
1928			    response_length,
1929			    eot,
1930			    0 /* send_commit_msg */);
1931}
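
/* An illustrative call (not part of this file): a final 16-wide color write
 * to render target 0 with no header, ending the thread.  The message
 * register, payload length (8 MRFs of color data) and headerless setup are
 * assumptions for this sketch.
 *
 *    brw_fb_WRITE(p, 16, retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW),
 *                 2, brw_message_reg(2), 0, 8, 0, GL_TRUE, GL_FALSE);
 */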
1932
1933
1934/**
1935 * Texture sample instruction.
1936 * Note: the msg_type plus msg_length values determine exactly what kind
1937 * of sampling operation is performed.  See volume 4, page 161 of docs.
1938 */
1939void brw_SAMPLE(struct brw_compile *p,
1940		struct brw_reg dest,
1941		GLuint msg_reg_nr,
1942		struct brw_reg src0,
1943		GLuint binding_table_index,
1944		GLuint sampler,
1945		GLuint writemask,
1946		GLuint msg_type,
1947		GLuint response_length,
1948		GLuint msg_length,
1949		GLboolean eot,
1950		GLuint header_present,
1951		GLuint simd_mode)
1952{
1953   struct intel_context *intel = &p->brw->intel;
1954   GLboolean need_stall = 0;
1955
1956   if (writemask == 0) {
1957      /*printf("%s: zero writemask??\n", __FUNCTION__); */
1958      return;
1959   }
1960
1961   /* Hardware doesn't do destination dependency checking on send
1962    * instructions properly.  Add a workaround which generates the
1963    * dependency by other means.  In practice it seems like this bug
1964    * only crops up for texture samples, and only where registers are
1965    * written by the send and then written again later without being
1966    * read in between.  Luckily for us, we already track that
1967    * information and use it to modify the writemask for the
1968    * instruction, so that is a guide for whether a workaround is
1969    * needed.
1970    */
1971   if (writemask != WRITEMASK_XYZW) {
1972      GLuint dst_offset = 0;
1973      GLuint i, newmask = 0, len = 0;
1974
1975      for (i = 0; i < 4; i++) {
1976	 if (writemask & (1<<i))
1977	    break;
1978	 dst_offset += 2;
1979      }
1980      for (; i < 4; i++) {
1981	 if (!(writemask & (1<<i)))
1982	    break;
1983	 newmask |= 1<<i;
1984	 len++;
1985      }
1986
1987      if (newmask != writemask) {
1988	 need_stall = 1;
1989         /* printf("need stall %x %x\n", newmask , writemask); */
1990      }
1991      else {
1992	 GLboolean dispatch_16 = GL_FALSE;
1993
1994	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
1995
1996	 guess_execution_size(p, p->current, dest);
1997	 if (p->current->header.execution_size == BRW_EXECUTE_16)
1998	    dispatch_16 = GL_TRUE;
1999
2000	 newmask = ~newmask & WRITEMASK_XYZW;
2001
2002	 brw_push_insn_state(p);
2003
2004	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2005	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2006
2007	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2008		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2009  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2010
2011	 brw_pop_insn_state(p);
2012
2013  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2014	 dest = offset(dest, dst_offset);
2015
2016	 /* For 16-wide dispatch, masked channels are skipped in the
2017	  * response.  For 8-wide, masked channels still take up slots,
2018	  * and are just not written to.
2019	  */
2020	 if (dispatch_16)
2021	    response_length = len * 2;
2022      }
2023   }
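
   /* Worked example of the writemask handling above (illustrative only):
    * writemask = WRITEMASK_YZ (0x6) skips X in the first loop (dst_offset
    * becomes 2), then collects Y and Z in the second (newmask == writemask,
    * len == 2).  Since the enabled channels are contiguous, no stall is
    * needed: the complement of the writemask is placed in bits 12..15 of the
    * message header (m1.2), dest is advanced by two registers, and a 16-wide
    * response shrinks to len * 2 = 4 registers.  A non-contiguous mask such
    * as WRITEMASK_XZ (0x5) instead falls back to the need_stall path.
    */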
2024
2025   {
2026      struct brw_instruction *insn;
2027
2028      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2029
2030      insn = next_insn(p, BRW_OPCODE_SEND);
2031      insn->header.predicate_control = 0; /* XXX */
2032      insn->header.compression_control = BRW_COMPRESSION_NONE;
2033      if (intel->gen < 6)
2034	  insn->header.destreg__conditionalmod = msg_reg_nr;
2035
2036      brw_set_dest(p, insn, dest);
2037      brw_set_src0(insn, src0);
2038      brw_set_sampler_message(p->brw, insn,
2039			      binding_table_index,
2040			      sampler,
2041			      msg_type,
2042			      response_length,
2043			      msg_length,
2044			      eot,
2045			      header_present,
2046			      simd_mode);
2047   }
2048
2049   if (need_stall) {
2050      struct brw_reg reg = vec8(offset(dest, response_length-1));
2051
2052      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2053       */
2054      brw_push_insn_state(p);
2055      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2056      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2057	      retype(reg, BRW_REGISTER_TYPE_UD));
2058      brw_pop_insn_state(p);
2059   }
2060
2061}
2062
2063/* All these variables are pretty confusing - we might be better off
2064 * using bitmasks and macros for this, in the old style.  Or perhaps
2065 * just having the caller instantiate the fields in dword3 itself.
2066 */
2067void brw_urb_WRITE(struct brw_compile *p,
2068		   struct brw_reg dest,
2069		   GLuint msg_reg_nr,
2070		   struct brw_reg src0,
2071		   GLboolean allocate,
2072		   GLboolean used,
2073		   GLuint msg_length,
2074		   GLuint response_length,
2075		   GLboolean eot,
2076		   GLboolean writes_complete,
2077		   GLuint offset,
2078		   GLuint swizzle)
2079{
2080   struct intel_context *intel = &p->brw->intel;
2081   struct brw_instruction *insn;
2082
2083   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2084
2085   insn = next_insn(p, BRW_OPCODE_SEND);
2086
2087   assert(msg_length < BRW_MAX_MRF);
2088
2089   brw_set_dest(p, insn, dest);
2090   brw_set_src0(insn, src0);
2091   brw_set_src1(insn, brw_imm_d(0));
2092
2093   if (intel->gen < 6)
2094      insn->header.destreg__conditionalmod = msg_reg_nr;
2095
2096   brw_set_urb_message(p->brw,
2097		       insn,
2098		       allocate,
2099		       used,
2100		       msg_length,
2101		       response_length,
2102		       eot,
2103		       writes_complete,
2104		       offset,
2105		       swizzle);
2106}
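
/* An illustrative call (not part of this file): write 'msg_length' MRFs of
 * URB data starting at m1, expect no response, and terminate the thread.
 * The g0-based src0 and the zero offset/swizzle arguments are assumptions
 * for this sketch.
 *
 *    brw_urb_WRITE(p, brw_null_reg(), 1, brw_vec8_grf(0, 0),
 *                  GL_FALSE, GL_TRUE,        (allocate = false, used = true)
 *                  msg_length, 0,            (no response expected)
 *                  GL_TRUE, GL_TRUE,         (eot, writes_complete)
 *                  0, 0);                    (offset, swizzle)
 */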
2107
2108static int
2109brw_find_next_block_end(struct brw_compile *p, int start)
2110{
2111   int ip;
2112
2113   for (ip = start + 1; ip < p->nr_insn; ip++) {
2114      struct brw_instruction *insn = &p->store[ip];
2115
2116      switch (insn->header.opcode) {
2117      case BRW_OPCODE_ENDIF:
2118      case BRW_OPCODE_ELSE:
2119      case BRW_OPCODE_WHILE:
2120	 return ip;
2121      }
2122   }
2123   assert(!"not reached");
2124   return start + 1;
2125}
2126
2127/* There is no DO instruction on gen6, so to find the end of the loop
2128 * we scan forward for a WHILE instruction that jumps back to before our
2129 * start instruction.
2130 */
2131static int
2132brw_find_loop_end(struct brw_compile *p, int start)
2133{
2134   int ip;
2135   int br = 2;
2136
2137   for (ip = start + 1; ip < p->nr_insn; ip++) {
2138      struct brw_instruction *insn = &p->store[ip];
2139
2140      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2141	 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
2142	    return ip;
2143      }
2144   }
2145   assert(!"not reached");
2146   return start + 1;
2147}
2148
2149/* After program generation, go back and update the UIP and JIP of
2150 * BREAK and CONTINUE instructions to their correct locations.
2151 */
2152void
2153brw_set_uip_jip(struct brw_compile *p)
2154{
2155   struct intel_context *intel = &p->brw->intel;
2156   int ip;
2157   int br = 2;
2158
2159   if (intel->gen < 6)
2160      return;
2161
2162   for (ip = 0; ip < p->nr_insn; ip++) {
2163      struct brw_instruction *insn = &p->store[ip];
2164
2165      switch (insn->header.opcode) {
2166      case BRW_OPCODE_BREAK:
2167	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2168	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
2169	 break;
2170      case BRW_OPCODE_CONTINUE:
2171	 /* JIP is set at CONTINUE emit time, since that's when we
2172	  * know where the start of the loop is.
2173	  */
2174	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2175	 assert(insn->bits3.break_cont.uip != 0);
2176	 assert(insn->bits3.break_cont.jip != 0);
2177	 break;
2178      }
2179   }
2180}
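
/* Worked example (illustrative only): a BREAK at ip 10 whose block ends at
 * the next ENDIF/ELSE/WHILE at ip 14, inside a loop whose WHILE sits at
 * ip 20, gets jip = 2 * (14 - 10) = 8 and uip = 2 * (20 - 10 + 1) = 22.
 */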
2181
2182void brw_ff_sync(struct brw_compile *p,
2183		   struct brw_reg dest,
2184		   GLuint msg_reg_nr,
2185		   struct brw_reg src0,
2186		   GLboolean allocate,
2187		   GLuint response_length,
2188		   GLboolean eot)
2189{
2190   struct intel_context *intel = &p->brw->intel;
2191   struct brw_instruction *insn;
2192
2193   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2194
2195   insn = next_insn(p, BRW_OPCODE_SEND);
2196   brw_set_dest(p, insn, dest);
2197   brw_set_src0(insn, src0);
2198   brw_set_src1(insn, brw_imm_d(0));
2199
2200   if (intel->gen < 6)
2201       insn->header.destreg__conditionalmod = msg_reg_nr;
2202
2203   brw_set_ff_sync_message(p->brw,
2204			   insn,
2205			   allocate,
2206			   response_length,
2207			   eot);
2208}
2209