brw_eu_emit.c revision c638180fc715aff84422c1092926120af966d417
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "../glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
static void
gen6_resolve_implied_move(struct brw_compile *p,
			  struct brw_reg *src,
			  GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   /* Pre-gen6 hardware does the move implicitly; nothing to do. */
   if (intel->gen != 6)
      return;

   /* Unless the source is already null (nothing to copy), move it into
    * the message register.  The MOV runs with masking disabled and
    * uncompressed so the full payload is written regardless of the
    * current channel enables.
    */
   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
	      retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Rewrite the caller's source to point at the message register. */
   *src = brw_message_reg(msg_reg_nr);
}
80
81
/**
 * Encode the destination operand (file, type, register/subregister
 * number or indirect offset, and horizontal stride) into bits1 of the
 * instruction, then derive the execution size from the dest width.
 */
static void brw_set_dest(struct brw_compile *p,
			 struct brw_instruction *insn,
			 struct brw_reg dest)
{
   /* The <128 register bound is only checked for files with plain
    * register indices; ARF/MRF numbers are skipped.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
	 /* hstride 0 is not usable for a destination; encode it as 1. */
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
	 /* Align16: subreg is counted in 16-byte units and channels are
	  * chosen with a writemask rather than strides.
	  */
	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
	 /* Stride is ignored in align16, but still needs to be set to '01'. */
	 insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      /* Register-indirect addressing: offset is relative to the address
       * register.
       */
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
	 /* Stride is ignored in align16, but still needs to be set to '01'. */
	 insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
133
134extern int reg_type_size[];
135
136static void
137validate_reg(struct brw_instruction *insn, struct brw_reg reg)
138{
139   int hstride_for_reg[] = {0, 1, 2, 4};
140   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
141   int width_for_reg[] = {1, 2, 4, 8, 16};
142   int execsize_for_reg[] = {1, 2, 4, 8, 16};
143   int width, hstride, vstride, execsize;
144
145   if (reg.file == BRW_IMMEDIATE_VALUE) {
146      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
147       * mean the destination has to be 128-bit aligned and the
148       * destination horiz stride has to be a word.
149       */
150      if (reg.type == BRW_REGISTER_TYPE_V) {
151	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
152		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
153      }
154
155      return;
156   }
157
158   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
159       reg.file == BRW_ARF_NULL)
160      return;
161
162   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
163   hstride = hstride_for_reg[reg.hstride];
164
165   if (reg.vstride == 0xf) {
166      vstride = -1;
167   } else {
168      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
169      vstride = vstride_for_reg[reg.vstride];
170   }
171
172   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
173   width = width_for_reg[reg.width];
174
175   assert(insn->header.execution_size >= 0 &&
176	  insn->header.execution_size < Elements(execsize_for_reg));
177   execsize = execsize_for_reg[insn->header.execution_size];
178
179   /* Restrictions from 3.3.10: Register Region Restrictions. */
180   /* 3. */
181   assert(execsize >= width);
182
183   /* 4. */
184   if (execsize == width && hstride != 0) {
185      assert(vstride == -1 || vstride == width * hstride);
186   }
187
188   /* 5. */
189   if (execsize == width && hstride == 0) {
190      /* no restriction on vstride. */
191   }
192
193   /* 6. */
194   if (width == 1) {
195      assert(hstride == 0);
196   }
197
198   /* 7. */
199   if (execsize == 1 && width == 1) {
200      assert(hstride == 0);
201      assert(vstride == 0);
202   }
203
204   /* 8. */
205   if (vstride == 0 && hstride == 0) {
206      assert(width == 1);
207   }
208
209   /* 10. Check destination issues. */
210}
211
212static void brw_set_src0(struct brw_compile *p,
213			 struct brw_instruction *insn,
214			 struct brw_reg reg)
215{
216   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
217      assert(reg.nr < 128);
218
219   validate_reg(insn, reg);
220
221   insn->bits1.da1.src0_reg_file = reg.file;
222   insn->bits1.da1.src0_reg_type = reg.type;
223   insn->bits2.da1.src0_abs = reg.abs;
224   insn->bits2.da1.src0_negate = reg.negate;
225   insn->bits2.da1.src0_address_mode = reg.address_mode;
226
227   if (reg.file == BRW_IMMEDIATE_VALUE) {
228      insn->bits3.ud = reg.dw1.ud;
229
230      /* Required to set some fields in src1 as well:
231       */
232      insn->bits1.da1.src1_reg_file = 0; /* arf */
233      insn->bits1.da1.src1_reg_type = reg.type;
234   }
235   else
236   {
237      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
238	 if (insn->header.access_mode == BRW_ALIGN_1) {
239	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
240	    insn->bits2.da1.src0_reg_nr = reg.nr;
241	 }
242	 else {
243	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
244	    insn->bits2.da16.src0_reg_nr = reg.nr;
245	 }
246      }
247      else {
248	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
249
250	 if (insn->header.access_mode == BRW_ALIGN_1) {
251	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
252	 }
253	 else {
254	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
255	 }
256      }
257
258      if (insn->header.access_mode == BRW_ALIGN_1) {
259	 if (reg.width == BRW_WIDTH_1 &&
260	     insn->header.execution_size == BRW_EXECUTE_1) {
261	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
262	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
263	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
264	 }
265	 else {
266	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
267	    insn->bits2.da1.src0_width = reg.width;
268	    insn->bits2.da1.src0_vert_stride = reg.vstride;
269	 }
270      }
271      else {
272	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
273	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
274	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
275	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
276
277	 /* This is an oddity of the fact we're using the same
278	  * descriptions for registers in align_16 as align_1:
279	  */
280	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
281	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
282	 else
283	    insn->bits2.da16.src0_vert_stride = reg.vstride;
284      }
285   }
286}
287
288
/**
 * Encode src1 into the instruction.  src1 is the only source slot that
 * may hold an immediate in a two-source instruction; register sources
 * must be direct-addressed (no indirect mode in the src1 encoding).
 */
void brw_set_src1(struct brw_compile *p,
		  struct brw_instruction *insn,
		  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   assert(reg.nr < 128);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
	 insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
	 /* Align16: subreg is counted in 16-byte units. */
	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
	 insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
	 /* A width-1 source in a SIMD1 instruction gets the canonical
	  * scalar region <0;1,0>.
	  */
	 if (reg.width == BRW_WIDTH_1 &&
	     insn->header.execution_size == BRW_EXECUTE_1) {
	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
	 }
	 else {
	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
	    insn->bits3.da1.src1_width = reg.width;
	    insn->bits3.da1.src1_vert_stride = reg.vstride;
	 }
      }
      else {
	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

	 /* This is an oddity of the fact we're using the same
	  * descriptions for registers in align_16 as align_1:
	  */
	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
	 else
	    insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}
356
357
358
/**
 * Fill in the SEND message descriptor for the math function unit.
 *
 * Gen5 uses the extended descriptor layout (header_present, SFID in
 * bits2); earlier gens carry the message target in bits3.  The non-gen5
 * branch is also what this revision uses for other gens.
 */
static void brw_set_math_message( struct brw_compile *p,
				  struct brw_instruction *insn,
				  GLuint msg_length,
				  GLuint response_length,
				  GLuint function,
				  GLuint integer_type,
				  GLboolean low_precision,
				  GLboolean saturate,
				  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   /* Clear src1; the descriptor fields below overlay bits2/bits3. */
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen == 5) {
       insn->bits3.math_gen5.function = function;
       insn->bits3.math_gen5.int_type = integer_type;
       insn->bits3.math_gen5.precision = low_precision;
       insn->bits3.math_gen5.saturate = saturate;
       insn->bits3.math_gen5.data_type = dataType;
       insn->bits3.math_gen5.snapshot = 0;
       insn->bits3.math_gen5.header_present = 0;
       insn->bits3.math_gen5.response_length = response_length;
       insn->bits3.math_gen5.msg_length = msg_length;
       insn->bits3.math_gen5.end_of_thread = 0;
       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
       insn->bits2.send_gen5.end_of_thread = 0;
   } else {
       insn->bits3.math.function = function;
       insn->bits3.math.int_type = integer_type;
       insn->bits3.math.precision = low_precision;
       insn->bits3.math.saturate = saturate;
       insn->bits3.math.data_type = dataType;
       insn->bits3.math.response_length = response_length;
       insn->bits3.math.msg_length = msg_length;
       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
       insn->bits3.math.end_of_thread = 0;
   }
}
398
399
/**
 * Fill in the descriptor for an FF_SYNC URB message (gen5+ layout;
 * fixed-function thread synchronization before URB writes).
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
				    struct brw_instruction *insn,
				    GLboolean allocate,
				    GLuint response_length,
				    GLboolean end_of_thread)
{
	struct brw_context *brw = p->brw;
	struct intel_context *intel = &brw->intel;
	/* Clear src1; the descriptor fields below overlay bits2/bits3. */
	brw_set_src1(p, insn, brw_imm_d(0));

	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.allocate = allocate;
	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
	insn->bits3.urb_gen5.header_present = 1;
	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
	insn->bits3.urb_gen5.msg_length = 1;
	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
	if (intel->gen >= 6) {
	   /* On gen6 the SFID lives in the destreg/condmod header field. */
	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
	} else {
	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
	}
}
427
/**
 * Fill in the SEND descriptor for a URB write message (vertex/GS
 * output).  Gen5+ uses the header-present layout; older gens use the
 * compact bits3-only layout.
 */
static void brw_set_urb_message( struct brw_compile *p,
				 struct brw_instruction *insn,
				 GLboolean allocate,
				 GLboolean used,
				 GLuint msg_length,
				 GLuint response_length,
				 GLboolean end_of_thread,
				 GLboolean complete,
				 GLuint offset,
				 GLuint swizzle_control )
{
    struct brw_context *brw = p->brw;
    struct intel_context *intel = &brw->intel;
    /* Clear src1; the descriptor fields below overlay bits2/bits3. */
    brw_set_src1(p, insn, brw_imm_d(0));

    if (intel->gen >= 5) {
        insn->bits3.urb_gen5.opcode = 0;	/* ? */
        insn->bits3.urb_gen5.offset = offset;
        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
        insn->bits3.urb_gen5.allocate = allocate;
        insn->bits3.urb_gen5.used = used;	/* ? */
        insn->bits3.urb_gen5.complete = complete;
        insn->bits3.urb_gen5.header_present = 1;
        insn->bits3.urb_gen5.response_length = response_length;
        insn->bits3.urb_gen5.msg_length = msg_length;
        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
	if (intel->gen >= 6) {
	   /* For SNB, the SFID bits moved to the condmod bits, and
	    * EOT stayed in bits3 above.  Does the EOT bit setting
	    * below on Ironlake even do anything?
	    */
	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
	} else {
	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
	}
    } else {
        insn->bits3.urb.opcode = 0;	/* ? */
        insn->bits3.urb.offset = offset;
        insn->bits3.urb.swizzle_control = swizzle_control;
        insn->bits3.urb.allocate = allocate;
        insn->bits3.urb.used = used;	/* ? */
        insn->bits3.urb.complete = complete;
        insn->bits3.urb.response_length = response_length;
        insn->bits3.urb.msg_length = msg_length;
        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
        insn->bits3.urb.end_of_thread = end_of_thread;
    }
}
477
/**
 * Fill in the SEND descriptor for a data port write message.  Three
 * layouts: gen6 (SFID in the condmod field, always the render cache),
 * gen5 (extended layout with SFID in bits2), and gen4/g4x (compact
 * bits3 layout with msg_target inline).
 */
static void brw_set_dp_write_message( struct brw_compile *p,
				      struct brw_instruction *insn,
				      GLuint binding_table_index,
				      GLuint msg_control,
				      GLuint msg_type,
				      GLuint msg_length,
				      GLboolean header_present,
				      GLuint pixel_scoreboard_clear,
				      GLuint response_length,
				      GLuint end_of_thread,
				      GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   /* Clear src1; the descriptor fields below overlay bits2/bits3. */
   brw_set_src1(p, insn, brw_imm_ud(0));

   if (intel->gen >= 6) {
       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
       insn->bits3.gen6_dp.msg_control = msg_control;
       insn->bits3.gen6_dp.pixel_scoreboard_clear = pixel_scoreboard_clear;
       insn->bits3.gen6_dp.msg_type = msg_type;
       insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
       insn->bits3.gen6_dp.header_present = header_present;
       insn->bits3.gen6_dp.response_length = response_length;
       insn->bits3.gen6_dp.msg_length = msg_length;
       insn->bits3.gen6_dp.end_of_thread = end_of_thread;

       /* We always use the render cache for write messages */
       insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
   } else if (intel->gen == 5) {
       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
       insn->bits3.dp_write_gen5.msg_control = msg_control;
       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
       insn->bits3.dp_write_gen5.msg_type = msg_type;
       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
       insn->bits3.dp_write_gen5.header_present = header_present;
       insn->bits3.dp_write_gen5.response_length = response_length;
       insn->bits3.dp_write_gen5.msg_length = msg_length;
       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
       insn->bits2.send_gen5.end_of_thread = end_of_thread;
   } else {
       insn->bits3.dp_write.binding_table_index = binding_table_index;
       insn->bits3.dp_write.msg_control = msg_control;
       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
       insn->bits3.dp_write.msg_type = msg_type;
       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
       insn->bits3.dp_write.response_length = response_length;
       insn->bits3.dp_write.msg_length = msg_length;
       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
       insn->bits3.dp_write.end_of_thread = end_of_thread;
   }
}
531
/**
 * Fill in the SEND descriptor for a data port read message.  Four
 * layouts: gen6, gen5, g4x, and original gen4 (which differ in field
 * widths — see the bit-position comments below).
 */
static void
brw_set_dp_read_message(struct brw_compile *p,
			struct brw_instruction *insn,
			GLuint binding_table_index,
			GLuint msg_control,
			GLuint msg_type,
			GLuint target_cache,
			GLuint msg_length,
			GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   /* Clear src1; the descriptor fields below overlay bits2/bits3. */
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen >= 6) {
       uint32_t target_function;

       /* NOTE(review): gen6 routes "data cache" reads to the sampler
	* cache shared function here — presumably intentional for
	* constant loads; confirm against the gen6 dataport docs.
	*/
       if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
	  target_function = GEN6_MESSAGE_TARGET_DP_SAMPLER_CACHE;
       else
	  target_function = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;

       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
       insn->bits3.gen6_dp.msg_control = msg_control;
       insn->bits3.gen6_dp.pixel_scoreboard_clear = 0;
       insn->bits3.gen6_dp.msg_type = msg_type;
       insn->bits3.gen6_dp.send_commit_msg = 0;
       insn->bits3.gen6_dp.header_present = 1;
       insn->bits3.gen6_dp.response_length = response_length;
       insn->bits3.gen6_dp.msg_length = msg_length;
       insn->bits3.gen6_dp.end_of_thread = 0;
       insn->bits3.gen6_dp.end_of_thread = 0;
       insn->header.destreg__conditionalmod = target_function;
   } else if (intel->gen == 5) {
       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
       insn->bits3.dp_read_gen5.msg_control = msg_control;
       insn->bits3.dp_read_gen5.msg_type = msg_type;
       insn->bits3.dp_read_gen5.target_cache = target_cache;
       insn->bits3.dp_read_gen5.header_present = 1;
       insn->bits3.dp_read_gen5.response_length = response_length;
       insn->bits3.dp_read_gen5.msg_length = msg_length;
       insn->bits3.dp_read_gen5.pad1 = 0;
       insn->bits3.dp_read_gen5.end_of_thread = 0;
       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
       insn->bits2.send_gen5.end_of_thread = 0;
   } else if (intel->is_g4x) {
       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
       insn->bits3.dp_read_g4x.response_length = response_length;  /*16:19*/
       insn->bits3.dp_read_g4x.msg_length = msg_length;  /*20:23*/
       insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
       insn->bits3.dp_read_g4x.pad1 = 0;
       insn->bits3.dp_read_g4x.end_of_thread = 0;
   } else {
       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
       insn->bits3.dp_read.end_of_thread = 0;  /*31*/
   }
}
598
/**
 * Fill in the SEND descriptor for a sampler message.  Three layouts:
 * gen5+ (header_present, SFID placement varies by gen), g4x, and
 * original gen4 (which also carries a return-format field).
 */
static void brw_set_sampler_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    GLuint binding_table_index,
                                    GLuint sampler,
                                    GLuint msg_type,
                                    GLuint response_length,
                                    GLuint msg_length,
                                    GLboolean eot,
                                    GLuint header_present,
                                    GLuint simd_mode)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   /* End-of-thread sampler messages are not emitted by this driver. */
   assert(eot == 0);
   /* Clear src1; the descriptor fields below overlay bits2/bits3. */
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
      insn->bits3.sampler_gen5.header_present = header_present;
      insn->bits3.sampler_gen5.response_length = response_length;
      insn->bits3.sampler_gen5.msg_length = msg_length;
      insn->bits3.sampler_gen5.end_of_thread = eot;
      if (intel->gen >= 6)
	  insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
      else {
	  insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
	  insn->bits2.send_gen5.end_of_thread = eot;
      }
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
      insn->bits3.sampler_g4x.response_length = response_length;
      insn->bits3.sampler_g4x.msg_length = msg_length;
      insn->bits3.sampler_g4x.end_of_thread = eot;
      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      insn->bits3.sampler.response_length = response_length;
      insn->bits3.sampler.msg_length = msg_length;
      insn->bits3.sampler.end_of_thread = eot;
      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
   }
}
649
650
651
/**
 * Allocate the next slot in the instruction store and initialize it
 * from the current default instruction state (p->current), then stamp
 * in the opcode.
 */
static struct brw_instruction *next_insn( struct brw_compile *p,
					  GLuint opcode )
{
   struct brw_instruction *insn;

   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);

   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    * destreg__conditionalmod applies only to the instruction that set
    * it; clear it (and restore normal predication) in the default state
    * so it doesn't leak into subsequent instructions.
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
673
674
675static struct brw_instruction *brw_alu1( struct brw_compile *p,
676					 GLuint opcode,
677					 struct brw_reg dest,
678					 struct brw_reg src )
679{
680   struct brw_instruction *insn = next_insn(p, opcode);
681   brw_set_dest(p, insn, dest);
682   brw_set_src0(p, insn, src);
683   return insn;
684}
685
686static struct brw_instruction *brw_alu2(struct brw_compile *p,
687					GLuint opcode,
688					struct brw_reg dest,
689					struct brw_reg src0,
690					struct brw_reg src1 )
691{
692   struct brw_instruction *insn = next_insn(p, opcode);
693   brw_set_dest(p, insn, dest);
694   brw_set_src0(p, insn, src0);
695   brw_set_src1(p, insn, src1);
696   return insn;
697}
698
699
700/***********************************************************************
701 * Convenience routines.
702 */
/* Define the public emitter for a one-source ALU opcode (brw_MOV etc). */
#define ALU1(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Define the public emitter for a two-source ALU opcode (brw_AND etc). */
#define ALU2(OP)					\
struct brw_instruction *brw_##OP(struct brw_compile *p,	\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}
719
720/* Rounding operations (other than RNDD) require two instructions - the first
721 * stores a rounded value (possibly the wrong way) in the dest register, but
722 * also sets a per-channel "increment bit" in the flag register.  A predicated
723 * add of 1.0 fixes dest to contain the desired result.
724 */
/* Define a two-instruction rounding emitter: the round op itself (with
 * the round-increment condmod bits set), then a predicated ADD of 1.0
 * that applies the per-channel fixup.  See the comment above.
 */
#define ROUND(OP)							      \
void brw_##OP(struct brw_compile *p,					      \
	      struct brw_reg dest,					      \
	      struct brw_reg src)					      \
{									      \
   struct brw_instruction *rnd, *add;					      \
   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
   brw_set_dest(p, rnd, dest);						      \
   brw_set_src0(p, rnd, src);						      \
   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
									      \
   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
   add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
}
739
740
/* Instantiate the emitters for the simple one- and two-source ops. */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)


/* RNDZ/RNDE need the two-instruction round-and-fixup sequence. */
ROUND(RNDZ)
ROUND(RNDE)
767
768
769struct brw_instruction *brw_ADD(struct brw_compile *p,
770				struct brw_reg dest,
771				struct brw_reg src0,
772				struct brw_reg src1)
773{
774   /* 6.2.2: add */
775   if (src0.type == BRW_REGISTER_TYPE_F ||
776       (src0.file == BRW_IMMEDIATE_VALUE &&
777	src0.type == BRW_REGISTER_TYPE_VF)) {
778      assert(src1.type != BRW_REGISTER_TYPE_UD);
779      assert(src1.type != BRW_REGISTER_TYPE_D);
780   }
781
782   if (src1.type == BRW_REGISTER_TYPE_F ||
783       (src1.file == BRW_IMMEDIATE_VALUE &&
784	src1.type == BRW_REGISTER_TYPE_VF)) {
785      assert(src0.type != BRW_REGISTER_TYPE_UD);
786      assert(src0.type != BRW_REGISTER_TYPE_D);
787   }
788
789   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
790}
791
792struct brw_instruction *brw_MUL(struct brw_compile *p,
793				struct brw_reg dest,
794				struct brw_reg src0,
795				struct brw_reg src1)
796{
797   /* 6.32.38: mul */
798   if (src0.type == BRW_REGISTER_TYPE_D ||
799       src0.type == BRW_REGISTER_TYPE_UD ||
800       src1.type == BRW_REGISTER_TYPE_D ||
801       src1.type == BRW_REGISTER_TYPE_UD) {
802      assert(dest.type != BRW_REGISTER_TYPE_F);
803   }
804
805   if (src0.type == BRW_REGISTER_TYPE_F ||
806       (src0.file == BRW_IMMEDIATE_VALUE &&
807	src0.type == BRW_REGISTER_TYPE_VF)) {
808      assert(src1.type != BRW_REGISTER_TYPE_UD);
809      assert(src1.type != BRW_REGISTER_TYPE_D);
810   }
811
812   if (src1.type == BRW_REGISTER_TYPE_F ||
813       (src1.file == BRW_IMMEDIATE_VALUE &&
814	src1.type == BRW_REGISTER_TYPE_VF)) {
815      assert(src0.type != BRW_REGISTER_TYPE_UD);
816      assert(src0.type != BRW_REGISTER_TYPE_D);
817   }
818
819   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
820	  src0.nr != BRW_ARF_ACCUMULATOR);
821   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
822	  src1.nr != BRW_ARF_ACCUMULATOR);
823
824   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
825}
826
827
828void brw_NOP(struct brw_compile *p)
829{
830   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
831   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
832   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
833   brw_set_src1(p, insn, brw_imm_ud(0x0));
834}
835
836
837
838
839
840/***********************************************************************
841 * Comparisons, if/else/endif
842 */
843
/**
 * Emit a JMPI (jump indexed) instruction: a scalar control-flow op, so
 * it always runs single-channel, unmasked, and uncompressed.
 */
struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   /* Predication applies to the jump itself; clear it from the default
    * instruction state so following instructions are not predicated.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
859
/**
 * Push an IF/ELSE instruction onto the if-stack for later patching by
 * the matching ELSE/ENDIF emitters, doubling the stack allocation when
 * it fills up.
 */
static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst;

   p->if_stack_depth++;
   /* Grow after the write: the invariant maintained here is that there
    * is always room for at least one more entry on return.
    */
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, struct brw_instruction *,
			     p->if_stack_array_size);
   }
}
872
873/* EU takes the value from the flag register and pushes it onto some
874 * sort of a stack (presumably merging with any flag value already on
875 * the stack).  Within an if block, the flags at the top of the stack
876 * control execution on each channel of the unit, eg. on each of the
877 * 16 pixel values in our wm programs.
878 *
879 * When the matching 'else' instruction is reached (presumably by
880 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
882 *
883 * When the matching 'endif' instruction is reached, the flags are
884 * popped off.  If the stack is now empty, normal execution resumes.
885 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      /* Pre-gen6: IP-relative branch; the jump count in src1 is
       * patched later by brw_ENDIF() via patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      /* Gen6: the jump target lives in bits1 and is also patched later. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
       insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Don't let the sticky predicate leak onto subsequent instructions. */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   return insn;
}
919
/* Emit a gen6 IF with an embedded comparison: the conditional modifier
 * compares src0 against src1, and the branch target in bits1 is filled
 * in later by brw_ENDIF() via patch_IF_ELSE().
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   insn->header.execution_size = BRW_EXECUTE_8;
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded comparison replaces predication on this IF. */
   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
       insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}
944
945/**
946 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
947 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
		       struct brw_instruction *if_inst,
		       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* IP deltas are in bytes; a full-size instruction is 16 bytes. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
984
985/**
986 * Patch IF and ELSE instructions with appropriate jump targets.
987 */
static void
patch_IF_ELSE(struct brw_compile *p,
	      struct brw_instruction *if_inst,
	      struct brw_instruction *else_inst,
	      struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   assert(!p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* Gen6 IF jumps to the instruction after the ELSE. */
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      }
   }
}
1051
/* Emit an ELSE for the innermost open IF and push it on the if-stack;
 * its jump target is patched later by brw_ENDIF() via patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      /* Pre-gen6: IP-relative branch, jump count patched later. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      /* Gen6: jump target lives in bits1, patched later. */
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
       insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1078
/* Close the innermost IF/ELSE: pop the IF (and optional ELSE) off the
 * if-stack, emit an ENDIF (except in single-program-flow mode, where
 * the IF/ELSE are rewritten into ADDs instead), and patch all jump
 * targets via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_stack_depth--;
   if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = p->if_stack[p->if_stack_depth];
      p->if_stack_depth--;
   }
   if_inst = p->if_stack[p->if_stack_depth];

   if (p->single_program_flow) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   insn = next_insn(p, BRW_OPCODE_ENDIF);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else {
      insn->bits1.branch_gen6.jump_count = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1127
/* Emit a BREAK out of the innermost DO/WHILE loop.
 *
 * pop_count is the number of pre-gen6 mask-stack entries to pop;
 * gen6+ has no mask stack, so it is unused there.
 * NOTE(review): the branch offsets are not written here — presumably
 * they are patched later by the caller; confirm against loop emission.
 */
struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = pop_count;
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}
1150
1151struct brw_instruction *gen6_CONT(struct brw_compile *p,
1152				  struct brw_instruction *do_insn)
1153{
1154   struct brw_instruction *insn;
1155   int br = 2;
1156
1157   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1158   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1159   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1160   brw_set_dest(p, insn, brw_ip_reg());
1161   brw_set_src0(p, insn, brw_ip_reg());
1162   brw_set_src1(p, insn, brw_imm_d(0x0));
1163
1164   insn->bits3.break_cont.uip = br * (do_insn - insn);
1165
1166   insn->header.compression_control = BRW_COMPRESSION_NONE;
1167   insn->header.execution_size = BRW_EXECUTE_8;
1168   return insn;
1169}
1170
/* Emit a pre-gen6 CONTINUE back to the top of the innermost loop.
 *
 * pop_count is the number of mask-stack entries to pop (one per
 * enclosing IF/ELSE inside the loop body).
 */
struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = pop_count;
   return insn;
}
1185
1186/* DO/WHILE loop:
1187 *
1188 * The DO/WHILE is just an unterminated loop -- break or continue are
1189 * used for control within the loop.  We have a few ways they can be
1190 * done.
1191 *
1192 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1193 * jip and no DO instruction.
1194 *
1195 * For non-uniform control flow pre-gen6, there's a DO instruction to
1196 * push the mask, and a WHILE to jump back, and BREAK to get out and
1197 * pop the mask.
1198 *
1199 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1200 * just points back to the first instruction of the loop.
1201 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      /* No DO instruction needed; return a pointer to where the loop
       * body will start so brw_WHILE() can compute its branch offset.
       */
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1226
1227
1228
/* Close a DO/WHILE loop: emit the backward branch to do_insn.
 *
 * Gen6+ uses a WHILE with a bits1 jump count; pre-gen6 SPF mode uses a
 * plain ADD to the IP register; otherwise a WHILE paired with the DO.
 * Branch distances are in 64-bit chunks on gen5+ (br == 2).
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p,
                                  struct brw_instruction *do_insn)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = do_insn->header.execution_size;
      assert(insn->header.execution_size == BRW_EXECUTE_8);
   } else {
      if (p->single_program_flow) {
	 /* SPF: a single-channel ADD moving IP back to the loop top
	  * (16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
1277
1278
1279/* FORWARD JUMPS:
1280 */
/* Patch a previously-emitted JMPI (with an immediate src1) so that it
 * lands on the next instruction to be emitted.  Jump distances are in
 * 64-bit chunks on gen5+ (jmpi == 2), whole instructions earlier.
 */
void brw_land_fwd_jump(struct brw_compile *p,
		       struct brw_instruction *jmp_insn)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *landing = &p->store[p->nr_insn];
   GLuint jmpi = 1;

   if (intel->gen >= 5)
       jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   /* -1 because the jump is relative to the instruction after the JMPI. */
   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
}
1296
1297
1298
1299/* To integrate with the above, it makes sense that the comparison
1300 * instruction should populate the flag register.  It might be simpler
1301 * just to use the flag reg for most WM tasks?
1302 */
/* Emit a CMP writing the flag register (and optionally dest) with the
 * result of comparing src0 against src1 under `conditional`.
 */
void brw_CMP(struct brw_compile *p,
	     struct brw_reg dest,
	     GLuint conditional,
	     struct brw_reg src0,
	     struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/*    guess_execution_size(insn, src0); */


   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      /* Destination is the null register: CMP only updates the flags. */
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}
1329
1330/* Issue 'wait' instruction for n1, host could program MMIO
1331   to wake up thread. */
1332void brw_WAIT (struct brw_compile *p)
1333{
1334   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1335   struct brw_reg src = brw_notification_1_reg();
1336
1337   brw_set_dest(p, insn, src);
1338   brw_set_src0(p, insn, src);
1339   brw_set_src1(p, insn, brw_null_reg());
1340   insn->header.execution_size = 0; /* must */
1341   insn->header.predicate_control = 0;
1342   insn->header.compression_control = 0;
1343}
1344
1345
1346/***********************************************************************
1347 * Helpers for the various SEND message types:
1348 */
1349
1350/** Extended math function, float[8].
1351 */
/* On gen6+ MATH is an ordinary EU instruction; earlier generations
 * implement extended math as a SEND message to the shared math unit.
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint saturate,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      /* Only the integer-divide functions take non-float sources. */
      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      /* POW sends two payload regs; SINCOS returns two result regs. */
      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   msg_length, response_length,
			   function,
			   BRW_MATH_INTEGER_UNSIGNED,
			   precision,
			   saturate,
			   data_type);
   }
}
1412
1413/** Extended math function, float[8].
1414 */
/* Two-source extended math (gen6+ only), e.g. POW and INT_DIV. */
void brw_math2(struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);
   (void) intel;


   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);

   /* Only the integer-divide functions take non-float sources. */
   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions. */
   assert(!src0.negate);
   assert(!src0.abs);
   assert(!src1.negate);
   assert(!src1.abs);

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1457
1458/**
1459 * Extended math function, float[16].
1460 * Use 2 send instructions.
1461 */
void brw_math_16( struct brw_compile *p,
		  struct brw_reg dest,
		  GLuint function,
		  GLuint saturate,
		  GLuint msg_reg_nr,
		  struct brw_reg src,
		  GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   /* POW sends two payload regs; SINCOS returns two result regs. */
   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;

   if (intel->gen >= 6) {
      /* Gen6+ handles 16-wide math as a single MATH instruction. */
      insn = next_insn(p, BRW_OPCODE_MATH);

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;
      insn->header.saturate = saturate;

      /* Source modifiers are ignored for extended math instructions. */
      assert(!src.negate);
      assert(!src.abs);

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
      return;
   }

   /* First instruction:
    */
   brw_push_insn_state(p);
   brw_set_predicate_control_flag_value(p, 0xff);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			msg_length, response_length,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   /* Second instruction:
    */
   /* Second half of the 16-wide operation: next message register,
    * writing the following destination register.
    */
   insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
   insn->header.destreg__conditionalmod = msg_reg_nr+1;

   brw_set_dest(p, insn, offset(dest,1));
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
			insn,
			msg_length, response_length,
			function,
			BRW_MATH_INTEGER_UNSIGNED,
			precision,
			saturate,
			BRW_MATH_DATA_VECTOR);

   brw_pop_insn_state(p);
}
1533
1534
1535/**
1536 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1537 * using a constant offset per channel.
1538 *
1539 * The offset must be aligned to oword size (16 bytes).  Used for
1540 * register spilling.
1541 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   /* On gen6+ the message header offset is in owords, not bytes. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* mlen = one header register plus the data payload. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* The SEND itself is never compressed. */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (intel->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       GL_TRUE, /* header_present */
			       0, /* pixel scoreboard */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1643
1644
1645/**
1646 * Read a block of owords (half a GRF each) from the scratch buffer
1647 * using a constant index per channel.
1648 *
1649 * Offset must be aligned to oword size (16 bytes).  Used for register
1650 * spilling.
1651 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control;
   int rlen;

   /* On gen6+ the message header offset is in owords, not bytes. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* rlen = number of destination registers returned. */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   /* Build the message header in the MRF: g0 with g0.2 replaced by the
    * scratch offset (same scheme as brw_oword_block_write_scratch()).
    */
   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest);	/* UW? */
      if (intel->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
			      insn,
			      255, /* binding table index (255=stateless) */
			      msg_control,
			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
			      1, /* msg_length */
			      rlen);
   }
}
1718
1719/**
1720 * Read a float[4] vector from the data port Data Cache (const buffer).
1721 * Location (in buffer) should be a multiple of 16.
1722 * Used for fetching shader constants.
1723 */
void brw_oword_block_read(struct brw_compile *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;

   /* On newer hardware, offset is in units of owords. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   /* Message header: g0 with the buffer offset in element 2. */
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (intel->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   1, /* msg_length */
			   1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
1776
1777/**
1778 * Read a set of dwords from the data port Data Cache (const buffer).
1779 *
1780 * Location (in buffer) appears as UD offsets in the register after
1781 * the provided mrf header reg.
1782 */
void brw_dword_scattered_read(struct brw_compile *p,
			      struct brw_reg dest,
			      struct brw_reg mrf,
			      uint32_t bind_table_index)
{
   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Message header is a plain copy of g0; the per-channel dword
    * offsets are expected in the register after the header (mlen 2).
    */
   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_pop_insn_state(p);

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, brw_null_reg());

   brw_set_dp_read_message(p,
			   insn,
			   bind_table_index,
			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
			   2, /* msg_length */
			   1); /* response_length */
}
1815
1816
1817
1818/**
1819 * Read float[4] constant(s) from VS constant buffer.
1820 * For relative addressing, two float[4] constants will be read into 'dest'.
1821 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1822 */
1823void brw_dp_READ_4_vs(struct brw_compile *p,
1824                      struct brw_reg dest,
1825                      GLuint location,
1826                      GLuint bind_table_index)
1827{
1828   struct intel_context *intel = &p->brw->intel;
1829   struct brw_instruction *insn;
1830   GLuint msg_reg_nr = 1;
1831
1832   if (intel->gen >= 6)
1833      location /= 16;
1834
1835   /* Setup MRF[1] with location/offset into const buffer */
1836   brw_push_insn_state(p);
1837   brw_set_access_mode(p, BRW_ALIGN_1);
1838   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1839   brw_set_mask_control(p, BRW_MASK_DISABLE);
1840   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1841   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1842		     BRW_REGISTER_TYPE_UD),
1843	   brw_imm_ud(location));
1844   brw_pop_insn_state(p);
1845
1846   insn = next_insn(p, BRW_OPCODE_SEND);
1847
1848   insn->header.predicate_control = BRW_PREDICATE_NONE;
1849   insn->header.compression_control = BRW_COMPRESSION_NONE;
1850   insn->header.destreg__conditionalmod = msg_reg_nr;
1851   insn->header.mask_control = BRW_MASK_DISABLE;
1852
1853   brw_set_dest(p, insn, dest);
1854   if (intel->gen >= 6) {
1855      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
1856   } else {
1857      brw_set_src0(p, insn, brw_null_reg());
1858   }
1859
1860   brw_set_dp_read_message(p,
1861			   insn,
1862			   bind_table_index,
1863			   0,
1864			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1865			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1866			   1, /* msg_length */
1867			   1); /* response_length (1 Oword) */
1868}
1869
1870/**
1871 * Read a float[4] constant per vertex from VS constant buffer, with
1872 * relative addressing.
1873 */
1874void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1875			       struct brw_reg dest,
1876			       struct brw_reg addr_reg,
1877			       GLuint offset,
1878			       GLuint bind_table_index)
1879{
1880   struct intel_context *intel = &p->brw->intel;
1881   struct brw_reg src = brw_vec8_grf(0, 0);
1882   int msg_type;
1883
1884   /* Setup MRF[1] with offset into const buffer */
1885   brw_push_insn_state(p);
1886   brw_set_access_mode(p, BRW_ALIGN_1);
1887   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1888   brw_set_mask_control(p, BRW_MASK_DISABLE);
1889   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1890
1891   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1892    * fields ignored.
1893    */
1894   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
1895	   addr_reg, brw_imm_d(offset));
1896   brw_pop_insn_state(p);
1897
1898   gen6_resolve_implied_move(p, &src, 0);
1899   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1900
1901   insn->header.predicate_control = BRW_PREDICATE_NONE;
1902   insn->header.compression_control = BRW_COMPRESSION_NONE;
1903   insn->header.destreg__conditionalmod = 0;
1904   insn->header.mask_control = BRW_MASK_DISABLE;
1905
1906   brw_set_dest(p, insn, dest);
1907   brw_set_src0(p, insn, src);
1908
1909   if (intel->gen == 6)
1910      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1911   else if (intel->gen == 5 || intel->is_g4x)
1912      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1913   else
1914      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1915
1916   brw_set_dp_read_message(p,
1917			   insn,
1918			   bind_table_index,
1919			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1920			   msg_type,
1921			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1922			   2, /* msg_length */
1923			   1); /* response_length */
1924}
1925
1926
1927
/**
 * Emit a render-target (framebuffer) write message.
 *
 * \param dispatch_width  8 or 16; selects the SIMD8/SIMD16 message control
 *                        and the width of the (null) destination.
 * \param msg_reg_nr      first MRF of the color payload.
 * \param src0            send source; on gen6+ it is replaced by the
 *                        message register directly (headerless payload).
 * \param eot             end-of-thread flag for the final FB write.
 * \param header_present  whether the payload begins with a header.
 */
void brw_fb_WRITE(struct brw_compile *p,
		  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  GLboolean eot,
                  GLboolean header_present)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_control, msg_type;
   struct brw_reg dest;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   /* NOTE(review): SENDC is used for binding table index 0 on gen6+ —
    * presumably to order the write against a dependent flush; confirm
    * against the hardware docs for SENDC semantics.
    */
   if (intel->gen >= 6 && binding_table_index == 0) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (intel->gen >= 6) {
       /* headerless version, just submit color payload */
       src0 = brw_message_reg(msg_reg_nr);

       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   if (dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
			    insn,
			    binding_table_index,
			    msg_control,
			    msg_type,
			    msg_length,
			    header_present,
			    1,	/* pixel scoreboard */
			    response_length,
			    eot,
			    0 /* send_commit_msg */);
}
1987
1988
1989/**
1990 * Texture sample instruction.
1991 * Note: the msg_type plus msg_length values determine exactly what kind
1992 * of sampling operation is performed.  See volume 4, page 161 of docs.
1993 */
1994void brw_SAMPLE(struct brw_compile *p,
1995		struct brw_reg dest,
1996		GLuint msg_reg_nr,
1997		struct brw_reg src0,
1998		GLuint binding_table_index,
1999		GLuint sampler,
2000		GLuint writemask,
2001		GLuint msg_type,
2002		GLuint response_length,
2003		GLuint msg_length,
2004		GLboolean eot,
2005		GLuint header_present,
2006		GLuint simd_mode)
2007{
2008   struct intel_context *intel = &p->brw->intel;
2009   GLboolean need_stall = 0;
2010
2011   if (writemask == 0) {
2012      /*printf("%s: zero writemask??\n", __FUNCTION__); */
2013      return;
2014   }
2015
2016   /* Hardware doesn't do destination dependency checking on send
2017    * instructions properly.  Add a workaround which generates the
2018    * dependency by other means.  In practice it seems like this bug
2019    * only crops up for texture samples, and only where registers are
2020    * written by the send and then written again later without being
2021    * read in between.  Luckily for us, we already track that
2022    * information and use it to modify the writemask for the
2023    * instruction, so that is a guide for whether a workaround is
2024    * needed.
2025    */
2026   if (writemask != WRITEMASK_XYZW) {
2027      GLuint dst_offset = 0;
2028      GLuint i, newmask = 0, len = 0;
2029
2030      for (i = 0; i < 4; i++) {
2031	 if (writemask & (1<<i))
2032	    break;
2033	 dst_offset += 2;
2034      }
2035      for (; i < 4; i++) {
2036	 if (!(writemask & (1<<i)))
2037	    break;
2038	 newmask |= 1<<i;
2039	 len++;
2040      }
2041
2042      if (newmask != writemask) {
2043	 need_stall = 1;
2044         /* printf("need stall %x %x\n", newmask , writemask); */
2045      }
2046      else {
2047	 GLboolean dispatch_16 = GL_FALSE;
2048
2049	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2050
2051	 guess_execution_size(p, p->current, dest);
2052	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2053	    dispatch_16 = GL_TRUE;
2054
2055	 newmask = ~newmask & WRITEMASK_XYZW;
2056
2057	 brw_push_insn_state(p);
2058
2059	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2060	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2061
2062	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2063		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2064  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2065
2066	 brw_pop_insn_state(p);
2067
2068  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2069	 dest = offset(dest, dst_offset);
2070
2071	 /* For 16-wide dispatch, masked channels are skipped in the
2072	  * response.  For 8-wide, masked channels still take up slots,
2073	  * and are just not written to.
2074	  */
2075	 if (dispatch_16)
2076	    response_length = len * 2;
2077      }
2078   }
2079
2080   {
2081      struct brw_instruction *insn;
2082
2083      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2084
2085      insn = next_insn(p, BRW_OPCODE_SEND);
2086      insn->header.predicate_control = 0; /* XXX */
2087      insn->header.compression_control = BRW_COMPRESSION_NONE;
2088      if (intel->gen < 6)
2089	  insn->header.destreg__conditionalmod = msg_reg_nr;
2090
2091      brw_set_dest(p, insn, dest);
2092      brw_set_src0(p, insn, src0);
2093      brw_set_sampler_message(p, insn,
2094			      binding_table_index,
2095			      sampler,
2096			      msg_type,
2097			      response_length,
2098			      msg_length,
2099			      eot,
2100			      header_present,
2101			      simd_mode);
2102   }
2103
2104   if (need_stall) {
2105      struct brw_reg reg = vec8(offset(dest, response_length-1));
2106
2107      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2108       */
2109      brw_push_insn_state(p);
2110      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2111      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2112	      retype(reg, BRW_REGISTER_TYPE_UD));
2113      brw_pop_insn_state(p);
2114   }
2115
2116}
2117
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
/**
 * Emit a URB write message.
 *
 * \param msg_reg_nr       first MRF of the payload.
 * \param src0             send source (resolved through
 *                         gen6_resolve_implied_move for gen6+).
 * \param allocate/used/writes_complete/offset/swizzle
 *                         URB message descriptor fields, passed straight
 *                         through to brw_set_urb_message().
 * \param eot              end-of-thread flag.
 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   GLboolean allocate,
		   GLboolean used,
		   GLuint msg_length,
		   GLuint response_length,
		   GLboolean eot,
		   GLboolean writes_complete,
		   GLuint offset,
		   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);

   /* The payload must fit in the message register file. */
   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the starting MRF is encoded in the instruction itself. */
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       allocate,
		       used,
		       msg_length,
		       response_length,
		       eot,
		       writes_complete,
		       offset,
		       swizzle);
}
2162
2163static int
2164brw_find_next_block_end(struct brw_compile *p, int start)
2165{
2166   int ip;
2167
2168   for (ip = start + 1; ip < p->nr_insn; ip++) {
2169      struct brw_instruction *insn = &p->store[ip];
2170
2171      switch (insn->header.opcode) {
2172      case BRW_OPCODE_ENDIF:
2173      case BRW_OPCODE_ELSE:
2174      case BRW_OPCODE_WHILE:
2175	 return ip;
2176      }
2177   }
2178   assert(!"not reached");
2179   return start + 1;
2180}
2181
2182/* There is no DO instruction on gen6, so to find the end of the loop
2183 * we have to see if the loop is jumping back before our start
2184 * instruction.
2185 */
2186static int
2187brw_find_loop_end(struct brw_compile *p, int start)
2188{
2189   int ip;
2190   int br = 2;
2191
2192   for (ip = start + 1; ip < p->nr_insn; ip++) {
2193      struct brw_instruction *insn = &p->store[ip];
2194
2195      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2196	 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
2197	    return ip;
2198      }
2199   }
2200   assert(!"not reached");
2201   return start + 1;
2202}
2203
2204/* After program generation, go back and update the UIP and JIP of
2205 * BREAK and CONT instructions to their correct locations.
2206 */
2207void
2208brw_set_uip_jip(struct brw_compile *p)
2209{
2210   struct intel_context *intel = &p->brw->intel;
2211   int ip;
2212   int br = 2;
2213
2214   if (intel->gen < 6)
2215      return;
2216
2217   for (ip = 0; ip < p->nr_insn; ip++) {
2218      struct brw_instruction *insn = &p->store[ip];
2219
2220      switch (insn->header.opcode) {
2221      case BRW_OPCODE_BREAK:
2222	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2223	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
2224	 break;
2225      case BRW_OPCODE_CONTINUE:
2226	 /* JIP is set at CONTINUE emit time, since that's when we
2227	  * know where the start of the loop is.
2228	  */
2229	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2230	 assert(insn->bits3.break_cont.uip != 0);
2231	 assert(insn->bits3.break_cont.jip != 0);
2232	 break;
2233      }
2234   }
2235}
2236
/**
 * Emit a SEND carrying an FF_SYNC message (descriptor built by
 * brw_set_ff_sync_message).
 *
 * \param msg_reg_nr  first MRF of the payload.
 * \param src0        send source (resolved through
 *                    gen6_resolve_implied_move for gen6+).
 */
void brw_ff_sync(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   GLboolean allocate,
		   GLuint response_length,
		   GLboolean eot)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-gen6, the starting MRF is encoded in the instruction itself. */
   if (intel->gen < 6)
       insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2264