brw_eu_emit.c revision 09d881bf7420c97a0f684283c24b8ec3e42404ff
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37#include "../glsl/ralloc.h"
38
39/***********************************************************************
40 * Internal helper for constructing instructions
41 */
42
43static void guess_execution_size(struct brw_compile *p,
44				 struct brw_instruction *insn,
45				 struct brw_reg reg)
46{
47   if (reg.width == BRW_WIDTH_8 && p->compressed)
48      insn->header.execution_size = BRW_EXECUTE_16;
49   else
50      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
51}
52
53
54/**
55 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
56 * registers, implicitly moving the operand to a message register.
57 *
58 * On Sandybridge, this is no longer the case.  This function performs the
59 * explicit move; it should be called before emitting a SEND instruction.
60 */
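/* An illustrative call pattern (a sketch of how the SEND emitters use this
 * helper; msg_reg_nr is whatever message register the caller has chosen):
 *
 *    struct brw_reg payload = ...;                 // operand currently in a GRF
 *    gen6_resolve_implied_move(p, &payload, msg_reg_nr);
 *    // On gen6+, payload now refers to m<msg_reg_nr> and the data has been
 *    // MOVed there; on earlier gens the register is left untouched.
 */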
61static void
62gen6_resolve_implied_move(struct brw_compile *p,
63			  struct brw_reg *src,
64			  GLuint msg_reg_nr)
65{
66   struct intel_context *intel = &p->brw->intel;
67   if (intel->gen < 6)
68      return;
69
70   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
71      brw_push_insn_state(p);
72      brw_set_mask_control(p, BRW_MASK_DISABLE);
73      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
74      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
75	      retype(*src, BRW_REGISTER_TYPE_UD));
76      brw_pop_insn_state(p);
77   }
78   *src = brw_message_reg(msg_reg_nr);
79}
80
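/* Explanatory note: gen7 has no separate MRF file; message payloads are
 * assembled in the top GRFs instead, so an MRF reference mN is rewritten
 * here to g(111 + N), matching the "+= 111" below.
 */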
81static void
82gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
83{
84   struct intel_context *intel = &p->brw->intel;
85   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
86      reg->file = BRW_GENERAL_REGISTER_FILE;
87      reg->nr += 111;
88   }
89}
90
91
92static void brw_set_dest(struct brw_compile *p,
93			 struct brw_instruction *insn,
94			 struct brw_reg dest)
95{
96   struct intel_context *intel = &p->brw->intel;
97
98   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
99       dest.file != BRW_MESSAGE_REGISTER_FILE)
100      assert(dest.nr < 128);
101
102   gen7_convert_mrf_to_grf(p, &dest);
103
104   insn->bits1.da1.dest_reg_file = dest.file;
105   insn->bits1.da1.dest_reg_type = dest.type;
106   insn->bits1.da1.dest_address_mode = dest.address_mode;
107
108   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
109      insn->bits1.da1.dest_reg_nr = dest.nr;
110
111      if (insn->header.access_mode == BRW_ALIGN_1) {
112	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
113	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
114	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
115	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
116      }
117      else {
118	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
119	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
120	 /* even though it's ignored in da16, this still must be set to '01' */
121	 insn->bits1.da16.dest_horiz_stride = 1;
122      }
123   }
124   else {
125      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
126
127      /* These are different sizes in align1 vs align16:
128       */
129      if (insn->header.access_mode == BRW_ALIGN_1) {
130	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
131	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
132	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
133	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
134      }
135      else {
136	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
137	 /* even though it's ignored in ia16, this still must be set to '01' */
138	 insn->bits1.ia16.dest_horiz_stride = 1;
139      }
140   }
141
142   /* Set the execution size based on dest.width and the compile's
143    * current compression state:
144    */
145   guess_execution_size(p, insn, dest);
146}
147
148extern int reg_type_size[];
149
150static void
151validate_reg(struct brw_instruction *insn, struct brw_reg reg)
152{
153   int hstride_for_reg[] = {0, 1, 2, 4};
154   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
155   int width_for_reg[] = {1, 2, 4, 8, 16};
156   int execsize_for_reg[] = {1, 2, 4, 8, 16};
157   int width, hstride, vstride, execsize;
158
159   if (reg.file == BRW_IMMEDIATE_VALUE) {
160      /* 3.3.6: Region Parameters.  Restriction: immediate vectors
161       * require the destination to be 128-bit aligned and the
162       * destination horizontal stride to be a word.
163       */
164      if (reg.type == BRW_REGISTER_TYPE_V) {
165	 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
166		reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
167      }
168
169      return;
170   }
171
172   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
173       reg.nr == BRW_ARF_NULL)
174      return;
175
176   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
177   hstride = hstride_for_reg[reg.hstride];
178
179   if (reg.vstride == 0xf) {
180      vstride = -1;
181   } else {
182      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
183      vstride = vstride_for_reg[reg.vstride];
184   }
185
186   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
187   width = width_for_reg[reg.width];
188
189   assert(insn->header.execution_size >= 0 &&
190	  insn->header.execution_size < Elements(execsize_for_reg));
191   execsize = execsize_for_reg[insn->header.execution_size];
192
193   /* Restrictions from 3.3.10: Register Region Restrictions. */
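   /* Worked example (added for clarity): a <8;8,1> region with execsize 8
    * satisfies rule 4 below, since vstride (8) equals width (8) times
    * hstride (1); a <0;1,0> scalar satisfies rules 6 and 8.
    */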
194   /* 3. */
195   assert(execsize >= width);
196
197   /* 4. */
198   if (execsize == width && hstride != 0) {
199      assert(vstride == -1 || vstride == width * hstride);
200   }
201
202   /* 5. */
203   if (execsize == width && hstride == 0) {
204      /* no restriction on vstride. */
205   }
206
207   /* 6. */
208   if (width == 1) {
209      assert(hstride == 0);
210   }
211
212   /* 7. */
213   if (execsize == 1 && width == 1) {
214      assert(hstride == 0);
215      assert(vstride == 0);
216   }
217
218   /* 8. */
219   if (vstride == 0 && hstride == 0) {
220      assert(width == 1);
221   }
222
223   /* 10. Check destination issues. */
224}
225
226static void brw_set_src0(struct brw_compile *p,
227			 struct brw_instruction *insn,
228			 struct brw_reg reg)
229{
230   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
231      assert(reg.nr < 128);
232
233   gen7_convert_mrf_to_grf(p, &reg);
234
235   validate_reg(insn, reg);
236
237   insn->bits1.da1.src0_reg_file = reg.file;
238   insn->bits1.da1.src0_reg_type = reg.type;
239   insn->bits2.da1.src0_abs = reg.abs;
240   insn->bits2.da1.src0_negate = reg.negate;
241   insn->bits2.da1.src0_address_mode = reg.address_mode;
242
243   if (reg.file == BRW_IMMEDIATE_VALUE) {
244      insn->bits3.ud = reg.dw1.ud;
245
246      /* Required to set some fields in src1 as well:
247       */
248      insn->bits1.da1.src1_reg_file = 0; /* arf */
249      insn->bits1.da1.src1_reg_type = reg.type;
250   }
251   else
252   {
253      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
254	 if (insn->header.access_mode == BRW_ALIGN_1) {
255	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
256	    insn->bits2.da1.src0_reg_nr = reg.nr;
257	 }
258	 else {
259	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
260	    insn->bits2.da16.src0_reg_nr = reg.nr;
261	 }
262      }
263      else {
264	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
265
266	 if (insn->header.access_mode == BRW_ALIGN_1) {
267	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
268	 }
269	 else {
270	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
271	 }
272      }
273
274      if (insn->header.access_mode == BRW_ALIGN_1) {
275	 if (reg.width == BRW_WIDTH_1 &&
276	     insn->header.execution_size == BRW_EXECUTE_1) {
277	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
278	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
279	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
280	 }
281	 else {
282	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
283	    insn->bits2.da1.src0_width = reg.width;
284	    insn->bits2.da1.src0_vert_stride = reg.vstride;
285	 }
286      }
287      else {
288	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
289	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
290	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
291	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
292
293	 /* This is an oddity of using the same register descriptions in
294	  * align16 mode as in align1:
295	  */
296	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
297	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
298	 else
299	    insn->bits2.da16.src0_vert_stride = reg.vstride;
300      }
301   }
302}
303
304
305void brw_set_src1(struct brw_compile *p,
306		  struct brw_instruction *insn,
307		  struct brw_reg reg)
308{
309   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
310
311   assert(reg.nr < 128);
312
313   gen7_convert_mrf_to_grf(p, &reg);
314
315   validate_reg(insn, reg);
316
317   insn->bits1.da1.src1_reg_file = reg.file;
318   insn->bits1.da1.src1_reg_type = reg.type;
319   insn->bits3.da1.src1_abs = reg.abs;
320   insn->bits3.da1.src1_negate = reg.negate;
321
322   /* Only src1 can be immediate in two-argument instructions.
323    */
324   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
325
326   if (reg.file == BRW_IMMEDIATE_VALUE) {
327      insn->bits3.ud = reg.dw1.ud;
328   }
329   else {
330      /* This is a hardware restriction, which may or may not be lifted
331       * in the future:
332       */
333      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
334      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
335
336      if (insn->header.access_mode == BRW_ALIGN_1) {
337	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
338	 insn->bits3.da1.src1_reg_nr = reg.nr;
339      }
340      else {
341	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
342	 insn->bits3.da16.src1_reg_nr = reg.nr;
343      }
344
345      if (insn->header.access_mode == BRW_ALIGN_1) {
346	 if (reg.width == BRW_WIDTH_1 &&
347	     insn->header.execution_size == BRW_EXECUTE_1) {
348	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
349	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
350	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
351	 }
352	 else {
353	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
354	    insn->bits3.da1.src1_width = reg.width;
355	    insn->bits3.da1.src1_vert_stride = reg.vstride;
356	 }
357      }
358      else {
359	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
360	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
361	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
362	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
363
364	 /* This is an oddity of using the same register descriptions in
365	  * align16 mode as in align1:
366	  */
367	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
368	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
369	 else
370	    insn->bits3.da16.src1_vert_stride = reg.vstride;
371      }
372   }
373}
374
375
376
377static void brw_set_math_message( struct brw_compile *p,
378				  struct brw_instruction *insn,
379				  GLuint msg_length,
380				  GLuint response_length,
381				  GLuint function,
382				  GLuint integer_type,
383				  GLboolean low_precision,
384				  GLboolean saturate,
385				  GLuint dataType )
386{
387   struct brw_context *brw = p->brw;
388   struct intel_context *intel = &brw->intel;
389   brw_set_src1(p, insn, brw_imm_d(0));
390
391   if (intel->gen == 5) {
392       insn->bits3.math_gen5.function = function;
393       insn->bits3.math_gen5.int_type = integer_type;
394       insn->bits3.math_gen5.precision = low_precision;
395       insn->bits3.math_gen5.saturate = saturate;
396       insn->bits3.math_gen5.data_type = dataType;
397       insn->bits3.math_gen5.snapshot = 0;
398       insn->bits3.math_gen5.header_present = 0;
399       insn->bits3.math_gen5.response_length = response_length;
400       insn->bits3.math_gen5.msg_length = msg_length;
401       insn->bits3.math_gen5.end_of_thread = 0;
402       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
403       insn->bits2.send_gen5.end_of_thread = 0;
404   } else {
405       insn->bits3.math.function = function;
406       insn->bits3.math.int_type = integer_type;
407       insn->bits3.math.precision = low_precision;
408       insn->bits3.math.saturate = saturate;
409       insn->bits3.math.data_type = dataType;
410       insn->bits3.math.response_length = response_length;
411       insn->bits3.math.msg_length = msg_length;
412       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
413       insn->bits3.math.end_of_thread = 0;
414   }
415}
416
417
418static void brw_set_ff_sync_message(struct brw_compile *p,
419				    struct brw_instruction *insn,
420				    GLboolean allocate,
421				    GLuint response_length,
422				    GLboolean end_of_thread)
423{
424	struct brw_context *brw = p->brw;
425	struct intel_context *intel = &brw->intel;
426	brw_set_src1(p, insn, brw_imm_d(0));
427
428	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
429	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
430	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
431	insn->bits3.urb_gen5.allocate = allocate;
432	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
433	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
434	insn->bits3.urb_gen5.header_present = 1;
435	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
436	insn->bits3.urb_gen5.msg_length = 1;
437	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
438	if (intel->gen >= 6) {
439	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
440	} else {
441	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
442	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
443	}
444}
445
446static void brw_set_urb_message( struct brw_compile *p,
447				 struct brw_instruction *insn,
448				 GLboolean allocate,
449				 GLboolean used,
450				 GLuint msg_length,
451				 GLuint response_length,
452				 GLboolean end_of_thread,
453				 GLboolean complete,
454				 GLuint offset,
455				 GLuint swizzle_control )
456{
457    struct brw_context *brw = p->brw;
458    struct intel_context *intel = &brw->intel;
459    brw_set_src1(p, insn, brw_imm_d(0));
460
461    if (intel->gen == 7) {
462        insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
463        insn->bits3.urb_gen7.offset = offset;
464        assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
465        insn->bits3.urb_gen7.swizzle_control = swizzle_control;
466        /* per_slot_offset = 0 makes it ignore offsets in message header */
467        insn->bits3.urb_gen7.per_slot_offset = 0;
468        insn->bits3.urb_gen7.complete = complete;
469        insn->bits3.urb_gen7.header_present = 1;
470        insn->bits3.urb_gen7.response_length = response_length;
471        insn->bits3.urb_gen7.msg_length = msg_length;
472        insn->bits3.urb_gen7.end_of_thread = end_of_thread;
473	insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
474    } else if (intel->gen >= 5) {
475        insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
476        insn->bits3.urb_gen5.offset = offset;
477        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
478        insn->bits3.urb_gen5.allocate = allocate;
479        insn->bits3.urb_gen5.used = used;	/* ? */
480        insn->bits3.urb_gen5.complete = complete;
481        insn->bits3.urb_gen5.header_present = 1;
482        insn->bits3.urb_gen5.response_length = response_length;
483        insn->bits3.urb_gen5.msg_length = msg_length;
484        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
485	if (intel->gen >= 6) {
486	   /* For SNB, the SFID bits moved to the condmod bits, and
487	    * EOT stayed in bits3 above.  Does the EOT bit setting
488	    * below on Ironlake even do anything?
489	    */
490	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
491	} else {
492	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
493	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
494	}
495    } else {
496        insn->bits3.urb.opcode = 0;	/* ? */
497        insn->bits3.urb.offset = offset;
498        insn->bits3.urb.swizzle_control = swizzle_control;
499        insn->bits3.urb.allocate = allocate;
500        insn->bits3.urb.used = used;	/* ? */
501        insn->bits3.urb.complete = complete;
502        insn->bits3.urb.response_length = response_length;
503        insn->bits3.urb.msg_length = msg_length;
504        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
505        insn->bits3.urb.end_of_thread = end_of_thread;
506    }
507}
508
509static void brw_set_dp_write_message( struct brw_compile *p,
510				      struct brw_instruction *insn,
511				      GLuint binding_table_index,
512				      GLuint msg_control,
513				      GLuint msg_type,
514				      GLuint msg_length,
515				      GLboolean header_present,
516				      GLuint pixel_scoreboard_clear,
517				      GLuint response_length,
518				      GLuint end_of_thread,
519				      GLuint send_commit_msg)
520{
521   struct brw_context *brw = p->brw;
522   struct intel_context *intel = &brw->intel;
523   brw_set_src1(p, insn, brw_imm_ud(0));
524
525   if (intel->gen >= 7) {
526       insn->bits3.gen7_dp.binding_table_index = binding_table_index;
527       insn->bits3.gen7_dp.msg_control = msg_control;
528       insn->bits3.gen7_dp.pixel_scoreboard_clear = pixel_scoreboard_clear;
529       insn->bits3.gen7_dp.msg_type = msg_type;
530       insn->bits3.gen7_dp.header_present = header_present;
531       insn->bits3.gen7_dp.response_length = response_length;
532       insn->bits3.gen7_dp.msg_length = msg_length;
533       insn->bits3.gen7_dp.end_of_thread = end_of_thread;
534
535       /* We always use the render cache for write messages */
536       insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
537   } else if (intel->gen == 6) {
538       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
539       insn->bits3.gen6_dp.msg_control = msg_control;
540       insn->bits3.gen6_dp.pixel_scoreboard_clear = pixel_scoreboard_clear;
541       insn->bits3.gen6_dp.msg_type = msg_type;
542       insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
543       insn->bits3.gen6_dp.header_present = header_present;
544       insn->bits3.gen6_dp.response_length = response_length;
545       insn->bits3.gen6_dp.msg_length = msg_length;
546       insn->bits3.gen6_dp.end_of_thread = end_of_thread;
547
548       /* We always use the render cache for write messages */
549       insn->header.destreg__conditionalmod = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
550   } else if (intel->gen == 5) {
551       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
552       insn->bits3.dp_write_gen5.msg_control = msg_control;
553       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
554       insn->bits3.dp_write_gen5.msg_type = msg_type;
555       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
556       insn->bits3.dp_write_gen5.header_present = header_present;
557       insn->bits3.dp_write_gen5.response_length = response_length;
558       insn->bits3.dp_write_gen5.msg_length = msg_length;
559       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
560       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
561       insn->bits2.send_gen5.end_of_thread = end_of_thread;
562   } else {
563       insn->bits3.dp_write.binding_table_index = binding_table_index;
564       insn->bits3.dp_write.msg_control = msg_control;
565       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
566       insn->bits3.dp_write.msg_type = msg_type;
567       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
568       insn->bits3.dp_write.response_length = response_length;
569       insn->bits3.dp_write.msg_length = msg_length;
570       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
571       insn->bits3.dp_write.end_of_thread = end_of_thread;
572   }
573}
574
575static void
576brw_set_dp_read_message(struct brw_compile *p,
577			struct brw_instruction *insn,
578			GLuint binding_table_index,
579			GLuint msg_control,
580			GLuint msg_type,
581			GLuint target_cache,
582			GLuint msg_length,
583			GLuint response_length)
584{
585   struct brw_context *brw = p->brw;
586   struct intel_context *intel = &brw->intel;
587   brw_set_src1(p, insn, brw_imm_d(0));
588
589   if (intel->gen >= 6) {
590       uint32_t target_function;
591
592       if (target_cache == BRW_DATAPORT_READ_TARGET_DATA_CACHE)
593	  target_function = GEN6_MESSAGE_TARGET_DP_SAMPLER_CACHE;
594       else
595	  target_function = GEN6_MESSAGE_TARGET_DP_RENDER_CACHE;
596
597       insn->bits3.gen6_dp.binding_table_index = binding_table_index;
598       insn->bits3.gen6_dp.msg_control = msg_control;
599       insn->bits3.gen6_dp.pixel_scoreboard_clear = 0;
600       insn->bits3.gen6_dp.msg_type = msg_type;
601       insn->bits3.gen6_dp.send_commit_msg = 0;
602       insn->bits3.gen6_dp.header_present = 1;
603       insn->bits3.gen6_dp.response_length = response_length;
604       insn->bits3.gen6_dp.msg_length = msg_length;
605       insn->bits3.gen6_dp.end_of_thread = 0;
606       insn->header.destreg__conditionalmod = target_function;
607   } else if (intel->gen == 5) {
608       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
609       insn->bits3.dp_read_gen5.msg_control = msg_control;
610       insn->bits3.dp_read_gen5.msg_type = msg_type;
611       insn->bits3.dp_read_gen5.target_cache = target_cache;
612       insn->bits3.dp_read_gen5.header_present = 1;
613       insn->bits3.dp_read_gen5.response_length = response_length;
614       insn->bits3.dp_read_gen5.msg_length = msg_length;
615       insn->bits3.dp_read_gen5.pad1 = 0;
616       insn->bits3.dp_read_gen5.end_of_thread = 0;
617       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
618       insn->bits2.send_gen5.end_of_thread = 0;
619   } else if (intel->is_g4x) {
620       insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
621       insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
622       insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
623       insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
624       insn->bits3.dp_read_g4x.response_length = response_length;  /*16:19*/
625       insn->bits3.dp_read_g4x.msg_length = msg_length;  /*20:23*/
626       insn->bits3.dp_read_g4x.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
627       insn->bits3.dp_read_g4x.pad1 = 0;
628       insn->bits3.dp_read_g4x.end_of_thread = 0;
629   } else {
630       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
631       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
632       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
633       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
634       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
635       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
636       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
637       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
638       insn->bits3.dp_read.end_of_thread = 0;  /*31*/
639   }
640}
641
642static void brw_set_sampler_message(struct brw_compile *p,
643                                    struct brw_instruction *insn,
644                                    GLuint binding_table_index,
645                                    GLuint sampler,
646                                    GLuint msg_type,
647                                    GLuint response_length,
648                                    GLuint msg_length,
649                                    GLboolean eot,
650                                    GLuint header_present,
651                                    GLuint simd_mode)
652{
653   struct brw_context *brw = p->brw;
654   struct intel_context *intel = &brw->intel;
655   assert(eot == 0);
656   brw_set_src1(p, insn, brw_imm_d(0));
657
658   if (intel->gen >= 5) {
659      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
660      insn->bits3.sampler_gen5.sampler = sampler;
661      insn->bits3.sampler_gen5.msg_type = msg_type;
662      insn->bits3.sampler_gen5.simd_mode = simd_mode;
663      insn->bits3.sampler_gen5.header_present = header_present;
664      insn->bits3.sampler_gen5.response_length = response_length;
665      insn->bits3.sampler_gen5.msg_length = msg_length;
666      insn->bits3.sampler_gen5.end_of_thread = eot;
667      if (intel->gen >= 6)
668	  insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_SAMPLER;
669      else {
670	  insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
671	  insn->bits2.send_gen5.end_of_thread = eot;
672      }
673   } else if (intel->is_g4x) {
674      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
675      insn->bits3.sampler_g4x.sampler = sampler;
676      insn->bits3.sampler_g4x.msg_type = msg_type;
677      insn->bits3.sampler_g4x.response_length = response_length;
678      insn->bits3.sampler_g4x.msg_length = msg_length;
679      insn->bits3.sampler_g4x.end_of_thread = eot;
680      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
681   } else {
682      insn->bits3.sampler.binding_table_index = binding_table_index;
683      insn->bits3.sampler.sampler = sampler;
684      insn->bits3.sampler.msg_type = msg_type;
685      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
686      insn->bits3.sampler.response_length = response_length;
687      insn->bits3.sampler.msg_length = msg_length;
688      insn->bits3.sampler.end_of_thread = eot;
689      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
690   }
691}
692
693
694
695static struct brw_instruction *next_insn( struct brw_compile *p,
696					  GLuint opcode )
697{
698   struct brw_instruction *insn;
699
700   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
701
702   insn = &p->store[p->nr_insn++];
703   memcpy(insn, p->current, sizeof(*insn));
704
705   /* Reset this one-shot flag:
706    */
707
708   if (p->current->header.destreg__conditionalmod) {
709      p->current->header.destreg__conditionalmod = 0;
710      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
711   }
712
713   insn->header.opcode = opcode;
714   return insn;
715}
716
717
718static struct brw_instruction *brw_alu1( struct brw_compile *p,
719					 GLuint opcode,
720					 struct brw_reg dest,
721					 struct brw_reg src )
722{
723   struct brw_instruction *insn = next_insn(p, opcode);
724   brw_set_dest(p, insn, dest);
725   brw_set_src0(p, insn, src);
726   return insn;
727}
728
729static struct brw_instruction *brw_alu2(struct brw_compile *p,
730					GLuint opcode,
731					struct brw_reg dest,
732					struct brw_reg src0,
733					struct brw_reg src1 )
734{
735   struct brw_instruction *insn = next_insn(p, opcode);
736   brw_set_dest(p, insn, dest);
737   brw_set_src0(p, insn, src0);
738   brw_set_src1(p, insn, src1);
739   return insn;
740}
741
742
743/***********************************************************************
744 * Convenience routines.
745 */
746#define ALU1(OP)					\
747struct brw_instruction *brw_##OP(struct brw_compile *p,	\
748	      struct brw_reg dest,			\
749	      struct brw_reg src0)   			\
750{							\
751   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
752}
753
754#define ALU2(OP)					\
755struct brw_instruction *brw_##OP(struct brw_compile *p,	\
756	      struct brw_reg dest,			\
757	      struct brw_reg src0,			\
758	      struct brw_reg src1)   			\
759{							\
760   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
761}
762
763/* Rounding operations (other than RNDD) require two instructions - the first
764 * stores a rounded value (possibly the wrong way) in the dest register, but
765 * also sets a per-channel "increment bit" in the flag register.  A predicated
766 * add of 1.0 fixes dest to contain the desired result.
767 */
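/* For example, brw_RNDE(p, dst, src) expands to roughly (a sketch of the
 * emitted sequence):
 *
 *    rnde    dst, src          // cond-mod field = 0x7: record round increments
 *    (+f0)   add dst, dst, 1.0F
 */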
768#define ROUND(OP)							      \
769void brw_##OP(struct brw_compile *p,					      \
770	      struct brw_reg dest,					      \
771	      struct brw_reg src)					      \
772{									      \
773   struct brw_instruction *rnd, *add;					      \
774   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
775   brw_set_dest(p, rnd, dest);						      \
776   brw_set_src0(p, rnd, src);						      \
777   rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
778									      \
779   add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
780   add->header.predicate_control = BRW_PREDICATE_NORMAL;		      \
781}
782
783
784ALU1(MOV)
785ALU2(SEL)
786ALU1(NOT)
787ALU2(AND)
788ALU2(OR)
789ALU2(XOR)
790ALU2(SHR)
791ALU2(SHL)
792ALU2(RSR)
793ALU2(RSL)
794ALU2(ASR)
795ALU1(FRC)
796ALU1(RNDD)
797ALU2(MAC)
798ALU2(MACH)
799ALU1(LZD)
800ALU2(DP4)
801ALU2(DPH)
802ALU2(DP3)
803ALU2(DP2)
804ALU2(LINE)
805ALU2(PLN)
806
807
808ROUND(RNDZ)
809ROUND(RNDE)
810
811
812struct brw_instruction *brw_ADD(struct brw_compile *p,
813				struct brw_reg dest,
814				struct brw_reg src0,
815				struct brw_reg src1)
816{
817   /* 6.2.2: add */
818   if (src0.type == BRW_REGISTER_TYPE_F ||
819       (src0.file == BRW_IMMEDIATE_VALUE &&
820	src0.type == BRW_REGISTER_TYPE_VF)) {
821      assert(src1.type != BRW_REGISTER_TYPE_UD);
822      assert(src1.type != BRW_REGISTER_TYPE_D);
823   }
824
825   if (src1.type == BRW_REGISTER_TYPE_F ||
826       (src1.file == BRW_IMMEDIATE_VALUE &&
827	src1.type == BRW_REGISTER_TYPE_VF)) {
828      assert(src0.type != BRW_REGISTER_TYPE_UD);
829      assert(src0.type != BRW_REGISTER_TYPE_D);
830   }
831
832   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
833}
834
835struct brw_instruction *brw_MUL(struct brw_compile *p,
836				struct brw_reg dest,
837				struct brw_reg src0,
838				struct brw_reg src1)
839{
840   /* 6.32.38: mul */
841   if (src0.type == BRW_REGISTER_TYPE_D ||
842       src0.type == BRW_REGISTER_TYPE_UD ||
843       src1.type == BRW_REGISTER_TYPE_D ||
844       src1.type == BRW_REGISTER_TYPE_UD) {
845      assert(dest.type != BRW_REGISTER_TYPE_F);
846   }
847
848   if (src0.type == BRW_REGISTER_TYPE_F ||
849       (src0.file == BRW_IMMEDIATE_VALUE &&
850	src0.type == BRW_REGISTER_TYPE_VF)) {
851      assert(src1.type != BRW_REGISTER_TYPE_UD);
852      assert(src1.type != BRW_REGISTER_TYPE_D);
853   }
854
855   if (src1.type == BRW_REGISTER_TYPE_F ||
856       (src1.file == BRW_IMMEDIATE_VALUE &&
857	src1.type == BRW_REGISTER_TYPE_VF)) {
858      assert(src0.type != BRW_REGISTER_TYPE_UD);
859      assert(src0.type != BRW_REGISTER_TYPE_D);
860   }
861
862   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
863	  src0.nr != BRW_ARF_ACCUMULATOR);
864   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
865	  src1.nr != BRW_ARF_ACCUMULATOR);
866
867   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
868}
869
870
871void brw_NOP(struct brw_compile *p)
872{
873   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
874   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
875   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
876   brw_set_src1(p, insn, brw_imm_ud(0x0));
877}
878
879
880
881
882
883/***********************************************************************
884 * Comparisons, if/else/endif
885 */
886
887struct brw_instruction *brw_JMPI(struct brw_compile *p,
888                                 struct brw_reg dest,
889                                 struct brw_reg src0,
890                                 struct brw_reg src1)
891{
892   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
893
894   insn->header.execution_size = 1;
895   insn->header.compression_control = BRW_COMPRESSION_NONE;
896   insn->header.mask_control = BRW_MASK_DISABLE;
897
898   p->current->header.predicate_control = BRW_PREDICATE_NONE;
899
900   return insn;
901}
902
903static void
904push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
905{
906   p->if_stack[p->if_stack_depth] = inst;
907
908   p->if_stack_depth++;
909   if (p->if_stack_array_size <= p->if_stack_depth) {
910      p->if_stack_array_size *= 2;
911      p->if_stack = reralloc(p->mem_ctx, p->if_stack, struct brw_instruction *,
912			     p->if_stack_array_size);
913   }
914}
915
916/* EU takes the value from the flag register and pushes it onto some
917 * sort of a stack (presumably merging with any flag value already on
918 * the stack).  Within an if block, the flags at the top of the stack
919 * control execution on each channel of the unit, e.g. on each of the
920 * 16 pixel values in our wm programs.
921 *
922 * When the matching 'else' instruction is reached (presumably by
923 * countdown of the instruction count patched in by our ELSE/ENDIF
924 * functions), the relevant flags are inverted.
925 *
926 * When the matching 'endif' instruction is reached, the flags are
927 * popped off.  If the stack is now empty, normal execution resumes.
928 */
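/* Illustrative caller pattern (a sketch; the helpers pair up as follows):
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *       ... "then" block instructions ...
 *    brw_ELSE(p);                  // optional
 *       ... "else" block instructions ...
 *    brw_ENDIF(p);                 // patches the IF/ELSE jump targets
 */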
929struct brw_instruction *
930brw_IF(struct brw_compile *p, GLuint execute_size)
931{
932   struct intel_context *intel = &p->brw->intel;
933   struct brw_instruction *insn;
934
935   insn = next_insn(p, BRW_OPCODE_IF);
936
937   /* Override the defaults for this instruction:
938    */
939   if (intel->gen < 6) {
940      brw_set_dest(p, insn, brw_ip_reg());
941      brw_set_src0(p, insn, brw_ip_reg());
942      brw_set_src1(p, insn, brw_imm_d(0x0));
943   } else {
944      brw_set_dest(p, insn, brw_imm_w(0));
945      insn->bits1.branch_gen6.jump_count = 0;
946      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
947      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
948   }
949
950   insn->header.execution_size = execute_size;
951   insn->header.compression_control = BRW_COMPRESSION_NONE;
952   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
953   insn->header.mask_control = BRW_MASK_ENABLE;
954   if (!p->single_program_flow)
955       insn->header.thread_control = BRW_THREAD_SWITCH;
956
957   p->current->header.predicate_control = BRW_PREDICATE_NONE;
958
959   push_if_stack(p, insn);
960   return insn;
961}
962
963struct brw_instruction *
964gen6_IF(struct brw_compile *p, uint32_t conditional,
965	struct brw_reg src0, struct brw_reg src1)
966{
967   struct brw_instruction *insn;
968
969   insn = next_insn(p, BRW_OPCODE_IF);
970
971   brw_set_dest(p, insn, brw_imm_w(0));
972   insn->header.execution_size = BRW_EXECUTE_8;
973   insn->bits1.branch_gen6.jump_count = 0;
974   brw_set_src0(p, insn, src0);
975   brw_set_src1(p, insn, src1);
976
977   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
978   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
979   insn->header.destreg__conditionalmod = conditional;
980
981   if (!p->single_program_flow)
982       insn->header.thread_control = BRW_THREAD_SWITCH;
983
984   push_if_stack(p, insn);
985   return insn;
986}
987
988/**
989 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
990 */
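/* (The "* 16" factors below convert instruction counts into byte offsets,
 * since each native instruction is 16 bytes long.)
 */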
991static void
992convert_IF_ELSE_to_ADD(struct brw_compile *p,
993		       struct brw_instruction *if_inst,
994		       struct brw_instruction *else_inst)
995{
996   /* The next instruction (where the ENDIF would be, if it existed) */
997   struct brw_instruction *next_inst = &p->store[p->nr_insn];
998
999   assert(p->single_program_flow);
1000   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1001   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1002   assert(if_inst->header.execution_size == BRW_EXECUTE_1);
1003
1004   /* Convert IF to an ADD instruction that moves the instruction pointer
1005    * to the first instruction of the ELSE block.  If there is no ELSE
1006    * block, point to where ENDIF would be.  Reverse the predicate.
1007    *
1008    * There's no need to execute an ENDIF since we don't need to do any
1009    * stack operations, and if we're currently executing, we just want to
1010    * continue normally.
1011    */
1012   if_inst->header.opcode = BRW_OPCODE_ADD;
1013   if_inst->header.predicate_inverse = 1;
1014
1015   if (else_inst != NULL) {
1016      /* Convert ELSE to an ADD instruction that points where the ENDIF
1017       * would be.
1018       */
1019      else_inst->header.opcode = BRW_OPCODE_ADD;
1020
1021      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
1022      else_inst->bits3.ud = (next_inst - else_inst) * 16;
1023   } else {
1024      if_inst->bits3.ud = (next_inst - if_inst) * 16;
1025   }
1026}
1027
1028/**
1029 * Patch IF and ELSE instructions with appropriate jump targets.
1030 */
1031static void
1032patch_IF_ELSE(struct brw_compile *p,
1033	      struct brw_instruction *if_inst,
1034	      struct brw_instruction *else_inst,
1035	      struct brw_instruction *endif_inst)
1036{
1037   struct intel_context *intel = &p->brw->intel;
1038
1039   assert(!p->single_program_flow);
1040   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
1041   assert(endif_inst != NULL);
1042   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
1043
1044   unsigned br = 1;
1045   /* The jump count is in units of 64-bit chunks, so one 128-bit
1046    * instruction requires 2 chunks.
1047    */
1048   if (intel->gen >= 5)
1049      br = 2;
1050
1051   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
1052   endif_inst->header.execution_size = if_inst->header.execution_size;
1053
1054   if (else_inst == NULL) {
1055      /* Patch IF -> ENDIF */
1056      if (intel->gen < 6) {
1057	 /* Turn it into an IFF, which means no mask stack operations for
1058	  * all-false and jumping past the ENDIF.
1059	  */
1060	 if_inst->header.opcode = BRW_OPCODE_IFF;
1061	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
1062	 if_inst->bits3.if_else.pop_count = 0;
1063	 if_inst->bits3.if_else.pad0 = 0;
1064      } else {
1065	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1066	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
1067      }
1068   } else {
1069      else_inst->header.execution_size = if_inst->header.execution_size;
1070
1071      /* Patch IF -> ELSE */
1072      if (intel->gen < 6) {
1073	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
1074	 if_inst->bits3.if_else.pop_count = 0;
1075	 if_inst->bits3.if_else.pad0 = 0;
1076      } else if (intel->gen == 6) {
1077	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
1078      }
1079
1080      /* Patch ELSE -> ENDIF */
1081      if (intel->gen < 6) {
1082	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1083	  * matching ENDIF.
1084	  */
1085	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
1086	 else_inst->bits3.if_else.pop_count = 1;
1087	 else_inst->bits3.if_else.pad0 = 0;
1088      } else {
1089	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1090	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
1091      }
1092   }
1093}
1094
1095void
1096brw_ELSE(struct brw_compile *p)
1097{
1098   struct intel_context *intel = &p->brw->intel;
1099   struct brw_instruction *insn;
1100
1101   insn = next_insn(p, BRW_OPCODE_ELSE);
1102
1103   if (intel->gen < 6) {
1104      brw_set_dest(p, insn, brw_ip_reg());
1105      brw_set_src0(p, insn, brw_ip_reg());
1106      brw_set_src1(p, insn, brw_imm_d(0x0));
1107   } else {
1108      brw_set_dest(p, insn, brw_imm_w(0));
1109      insn->bits1.branch_gen6.jump_count = 0;
1110      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1111      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1112   }
1113
1114   insn->header.compression_control = BRW_COMPRESSION_NONE;
1115   insn->header.mask_control = BRW_MASK_ENABLE;
1116   if (!p->single_program_flow)
1117       insn->header.thread_control = BRW_THREAD_SWITCH;
1118
1119   push_if_stack(p, insn);
1120}
1121
1122void
1123brw_ENDIF(struct brw_compile *p)
1124{
1125   struct intel_context *intel = &p->brw->intel;
1126   struct brw_instruction *insn;
1127   struct brw_instruction *else_inst = NULL;
1128   struct brw_instruction *if_inst = NULL;
1129
1130   /* Pop the IF and (optional) ELSE instructions from the stack */
1131   p->if_stack_depth--;
1132   if (p->if_stack[p->if_stack_depth]->header.opcode == BRW_OPCODE_ELSE) {
1133      else_inst = p->if_stack[p->if_stack_depth];
1134      p->if_stack_depth--;
1135   }
1136   if_inst = p->if_stack[p->if_stack_depth];
1137
1138   if (p->single_program_flow) {
1139      /* ENDIF is useless; don't bother emitting it. */
1140      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1141      return;
1142   }
1143
1144   insn = next_insn(p, BRW_OPCODE_ENDIF);
1145
1146   if (intel->gen < 6) {
1147      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1148      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1149      brw_set_src1(p, insn, brw_imm_d(0x0));
1150   } else {
1151      brw_set_dest(p, insn, brw_imm_w(0));
1152      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1153      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1154   }
1155
1156   insn->header.compression_control = BRW_COMPRESSION_NONE;
1157   insn->header.mask_control = BRW_MASK_ENABLE;
1158   insn->header.thread_control = BRW_THREAD_SWITCH;
1159
1160   /* Also pop item off the stack in the endif instruction: */
1161   if (intel->gen < 6) {
1162      insn->bits3.if_else.jump_count = 0;
1163      insn->bits3.if_else.pop_count = 1;
1164      insn->bits3.if_else.pad0 = 0;
1165   } else {
1166      insn->bits1.branch_gen6.jump_count = 2;
1167   }
1168   patch_IF_ELSE(p, if_inst, else_inst, insn);
1169}
1170
1171struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
1172{
1173   struct intel_context *intel = &p->brw->intel;
1174   struct brw_instruction *insn;
1175
1176   insn = next_insn(p, BRW_OPCODE_BREAK);
1177   if (intel->gen >= 6) {
1178      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1179      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1180      brw_set_src1(p, insn, brw_imm_d(0x0));
1181   } else {
1182      brw_set_dest(p, insn, brw_ip_reg());
1183      brw_set_src0(p, insn, brw_ip_reg());
1184      brw_set_src1(p, insn, brw_imm_d(0x0));
1185      insn->bits3.if_else.pad0 = 0;
1186      insn->bits3.if_else.pop_count = pop_count;
1187   }
1188   insn->header.compression_control = BRW_COMPRESSION_NONE;
1189   insn->header.execution_size = BRW_EXECUTE_8;
1190
1191   return insn;
1192}
1193
1194struct brw_instruction *gen6_CONT(struct brw_compile *p,
1195				  struct brw_instruction *do_insn)
1196{
1197   struct brw_instruction *insn;
1198   int br = 2;
1199
1200   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1201   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1202   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1203   brw_set_dest(p, insn, brw_ip_reg());
1204   brw_set_src0(p, insn, brw_ip_reg());
1205   brw_set_src1(p, insn, brw_imm_d(0x0));
1206
1207   insn->bits3.break_cont.uip = br * (do_insn - insn);
1208
1209   insn->header.compression_control = BRW_COMPRESSION_NONE;
1210   insn->header.execution_size = BRW_EXECUTE_8;
1211   return insn;
1212}
1213
1214struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
1215{
1216   struct brw_instruction *insn;
1217   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1218   brw_set_dest(p, insn, brw_ip_reg());
1219   brw_set_src0(p, insn, brw_ip_reg());
1220   brw_set_src1(p, insn, brw_imm_d(0x0));
1221   insn->header.compression_control = BRW_COMPRESSION_NONE;
1222   insn->header.execution_size = BRW_EXECUTE_8;
1223   /* insn->header.mask_control = BRW_MASK_DISABLE; */
1224   insn->bits3.if_else.pad0 = 0;
1225   insn->bits3.if_else.pop_count = pop_count;
1226   return insn;
1227}
1228
1229/* DO/WHILE loop:
1230 *
1231 * The DO/WHILE is just an unterminated loop -- break or continue are
1232 * used for control within the loop.  We have a few ways they can be
1233 * done.
1234 *
1235 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1236 * jip and no DO instruction.
1237 *
1238 * For non-uniform control flow pre-gen6, there's a DO instruction to
1239 * push the mask, and a WHILE to jump back, and BREAK to get out and
1240 * pop the mask.
1241 *
1242 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1243 * just points back to the first instruction of the loop.
1244 */
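/* Illustrative caller pattern (a sketch):
 *
 *    struct brw_instruction *do_insn = brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body, with brw_BREAK()/brw_CONT() (or the gen6 variants)
 *       ... emitted as needed ...
 *    brw_WHILE(p, do_insn);
 */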
1245struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
1246{
1247   struct intel_context *intel = &p->brw->intel;
1248
1249   if (intel->gen >= 6 || p->single_program_flow) {
1250      return &p->store[p->nr_insn];
1251   } else {
1252      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
1253
1254      /* Override the defaults for this instruction:
1255       */
1256      brw_set_dest(p, insn, brw_null_reg());
1257      brw_set_src0(p, insn, brw_null_reg());
1258      brw_set_src1(p, insn, brw_null_reg());
1259
1260      insn->header.compression_control = BRW_COMPRESSION_NONE;
1261      insn->header.execution_size = execute_size;
1262      insn->header.predicate_control = BRW_PREDICATE_NONE;
1263      /* insn->header.mask_control = BRW_MASK_ENABLE; */
1264      /* insn->header.mask_control = BRW_MASK_DISABLE; */
1265
1266      return insn;
1267   }
1268}
1269
1270
1271
1272struct brw_instruction *brw_WHILE(struct brw_compile *p,
1273                                  struct brw_instruction *do_insn)
1274{
1275   struct intel_context *intel = &p->brw->intel;
1276   struct brw_instruction *insn;
1277   GLuint br = 1;
1278
1279   if (intel->gen >= 5)
1280      br = 2;
1281
1282   if (intel->gen >= 6) {
1283      insn = next_insn(p, BRW_OPCODE_WHILE);
1284
1285      brw_set_dest(p, insn, brw_imm_w(0));
1286      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
1287      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1288      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1289
1290      insn->header.execution_size = do_insn->header.execution_size;
1291      assert(insn->header.execution_size == BRW_EXECUTE_8);
1292   } else {
1293      if (p->single_program_flow) {
1294	 insn = next_insn(p, BRW_OPCODE_ADD);
1295
1296	 brw_set_dest(p, insn, brw_ip_reg());
1297	 brw_set_src0(p, insn, brw_ip_reg());
1298	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1299	 insn->header.execution_size = BRW_EXECUTE_1;
1300      } else {
1301	 insn = next_insn(p, BRW_OPCODE_WHILE);
1302
1303	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
1304
1305	 brw_set_dest(p, insn, brw_ip_reg());
1306	 brw_set_src0(p, insn, brw_ip_reg());
1307	 brw_set_src1(p, insn, brw_imm_d(0));
1308
1309	 insn->header.execution_size = do_insn->header.execution_size;
1310	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
1311	 insn->bits3.if_else.pop_count = 0;
1312	 insn->bits3.if_else.pad0 = 0;
1313      }
1314   }
1315   insn->header.compression_control = BRW_COMPRESSION_NONE;
1316   p->current->header.predicate_control = BRW_PREDICATE_NONE;
1317
1318   return insn;
1319}
1320
1321
1322/* FORWARD JUMPS:
1323 */
1324void brw_land_fwd_jump(struct brw_compile *p,
1325		       struct brw_instruction *jmp_insn)
1326{
1327   struct intel_context *intel = &p->brw->intel;
1328   struct brw_instruction *landing = &p->store[p->nr_insn];
1329   GLuint jmpi = 1;
1330
1331   if (intel->gen >= 5)
1332       jmpi = 2;
1333
1334   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1335   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1336
1337   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
1338}
1339
1340
1341
1342/* To integrate with the above, it makes sense that the comparison
1343 * instruction should populate the flag register.  It might be simpler
1344 * just to use the flag reg for most WM tasks?
1345 */
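/* For example (an explanatory note), brw_CMP(p, brw_null_reg(),
 * BRW_CONDITIONAL_L, a, b) updates only the flag register, and the
 * null-destination special case below then enables predication for the
 * instructions that follow.
 */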
1346void brw_CMP(struct brw_compile *p,
1347	     struct brw_reg dest,
1348	     GLuint conditional,
1349	     struct brw_reg src0,
1350	     struct brw_reg src1)
1351{
1352   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1353
1354   insn->header.destreg__conditionalmod = conditional;
1355   brw_set_dest(p, insn, dest);
1356   brw_set_src0(p, insn, src0);
1357   brw_set_src1(p, insn, src1);
1358
1359/*    guess_execution_size(insn, src0); */
1360
1361
1362   /* Make it so that future instructions will use the computed flag
1363    * value until brw_set_predicate_control_flag_value() is called
1364    * again.
1365    */
1366   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1367       dest.nr == 0) {
1368      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1369      p->flag_value = 0xff;
1370   }
1371}
1372
1373/* Issue a 'wait' instruction on notification register n1; the host can
1374   program MMIO to wake up the thread. */
1375void brw_WAIT (struct brw_compile *p)
1376{
1377   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1378   struct brw_reg src = brw_notification_1_reg();
1379
1380   brw_set_dest(p, insn, src);
1381   brw_set_src0(p, insn, src);
1382   brw_set_src1(p, insn, brw_null_reg());
1383   insn->header.execution_size = 0; /* must */
1384   insn->header.predicate_control = 0;
1385   insn->header.compression_control = 0;
1386}
1387
1388
1389/***********************************************************************
1390 * Helpers for the various SEND message types:
1391 */
1392
1393/** Extended math function, float[8].
1394 */
1395void brw_math( struct brw_compile *p,
1396	       struct brw_reg dest,
1397	       GLuint function,
1398	       GLuint saturate,
1399	       GLuint msg_reg_nr,
1400	       struct brw_reg src,
1401	       GLuint data_type,
1402	       GLuint precision )
1403{
1404   struct intel_context *intel = &p->brw->intel;
1405
1406   if (intel->gen >= 6) {
1407      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1408
1409      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1410      assert(src.file == BRW_GENERAL_REGISTER_FILE);
1411
1412      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1413      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
1414
1415      /* Source modifiers are ignored for extended math instructions. */
1416      assert(!src.negate);
1417      assert(!src.abs);
1418
1419      if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1420	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1421	 assert(src.type == BRW_REGISTER_TYPE_F);
1422      }
1423
1424      /* Math is the same ISA format as other opcodes, except that CondModifier
1425       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1426       */
1427      insn->header.destreg__conditionalmod = function;
1428      insn->header.saturate = saturate;
1429
1430      brw_set_dest(p, insn, dest);
1431      brw_set_src0(p, insn, src);
1432      brw_set_src1(p, insn, brw_null_reg());
1433   } else {
1434      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1435      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1436      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1437      /* Example code doesn't set predicate_control for send
1438       * instructions.
1439       */
1440      insn->header.predicate_control = 0;
1441      insn->header.destreg__conditionalmod = msg_reg_nr;
1442
1443      brw_set_dest(p, insn, dest);
1444      brw_set_src0(p, insn, src);
1445      brw_set_math_message(p,
1446			   insn,
1447			   msg_length, response_length,
1448			   function,
1449			   BRW_MATH_INTEGER_UNSIGNED,
1450			   precision,
1451			   saturate,
1452			   data_type);
1453   }
1454}
1455
1456/** Extended math function, float[8].
1457 */
1458void brw_math2(struct brw_compile *p,
1459	       struct brw_reg dest,
1460	       GLuint function,
1461	       struct brw_reg src0,
1462	       struct brw_reg src1)
1463{
1464   struct intel_context *intel = &p->brw->intel;
1465   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1466
1467   assert(intel->gen >= 6);
1468   (void) intel;
1469
1470
1471   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1472   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1473   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1474
1475   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1476   assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1477   assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1478
1479   if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
1480       function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1481      assert(src0.type == BRW_REGISTER_TYPE_F);
1482      assert(src1.type == BRW_REGISTER_TYPE_F);
1483   }
1484
1485   /* Source modifiers are ignored for extended math instructions. */
1486   assert(!src0.negate);
1487   assert(!src0.abs);
1488   assert(!src1.negate);
1489   assert(!src1.abs);
1490
1491   /* Math is the same ISA format as other opcodes, except that CondModifier
1492    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1493    */
1494   insn->header.destreg__conditionalmod = function;
1495
1496   brw_set_dest(p, insn, dest);
1497   brw_set_src0(p, insn, src0);
1498   brw_set_src1(p, insn, src1);
1499}
1500
1501/**
1502 * Extended math function, float[16].
1503 * Use 2 send instructions.
1504 */
1505void brw_math_16( struct brw_compile *p,
1506		  struct brw_reg dest,
1507		  GLuint function,
1508		  GLuint saturate,
1509		  GLuint msg_reg_nr,
1510		  struct brw_reg src,
1511		  GLuint precision )
1512{
1513   struct intel_context *intel = &p->brw->intel;
1514   struct brw_instruction *insn;
1515   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1516   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1517
1518   if (intel->gen >= 6) {
1519      insn = next_insn(p, BRW_OPCODE_MATH);
1520
1521      /* Math is the same ISA format as other opcodes, except that CondModifier
1522       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1523       */
1524      insn->header.destreg__conditionalmod = function;
1525      insn->header.saturate = saturate;
1526
1527      /* Source modifiers are ignored for extended math instructions. */
1528      assert(!src.negate);
1529      assert(!src.abs);
1530
1531      brw_set_dest(p, insn, dest);
1532      brw_set_src0(p, insn, src);
1533      brw_set_src1(p, insn, brw_null_reg());
1534      return;
1535   }
1536
1537   /* First instruction:
1538    */
1539   brw_push_insn_state(p);
1540   brw_set_predicate_control_flag_value(p, 0xff);
1541   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1542
1543   insn = next_insn(p, BRW_OPCODE_SEND);
1544   insn->header.destreg__conditionalmod = msg_reg_nr;
1545
1546   brw_set_dest(p, insn, dest);
1547   brw_set_src0(p, insn, src);
1548   brw_set_math_message(p,
1549			insn,
1550			msg_length, response_length,
1551			function,
1552			BRW_MATH_INTEGER_UNSIGNED,
1553			precision,
1554			saturate,
1555			BRW_MATH_DATA_VECTOR);
1556
1557   /* Second instruction:
1558    */
1559   insn = next_insn(p, BRW_OPCODE_SEND);
1560   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1561   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1562
1563   brw_set_dest(p, insn, offset(dest,1));
1564   brw_set_src0(p, insn, src);
1565   brw_set_math_message(p,
1566			insn,
1567			msg_length, response_length,
1568			function,
1569			BRW_MATH_INTEGER_UNSIGNED,
1570			precision,
1571			saturate,
1572			BRW_MATH_DATA_VECTOR);
1573
1574   brw_pop_insn_state(p);
1575}
1576
1577
1578/**
 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
 * using a constant offset stored in the message header.
1581 *
1582 * The offset must be aligned to oword size (16 bytes).  Used for
1583 * register spilling.
1584 */
1585void brw_oword_block_write_scratch(struct brw_compile *p,
1586				   struct brw_reg mrf,
1587				   int num_regs,
1588				   GLuint offset)
1589{
1590   struct intel_context *intel = &p->brw->intel;
1591   uint32_t msg_control, msg_type;
1592   int mlen;
1593
1594   if (intel->gen >= 6)
1595      offset /= 16;
1596
1597   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1598
1599   if (num_regs == 1) {
1600      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1601      mlen = 2;
1602   } else {
1603      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1604      mlen = 3;
1605   }
1606
1607   /* Set up the message header.  This is g0, with g0.2 filled with
1608    * the offset.  We don't want to leave our offset around in g0 or
1609    * it'll screw up texture samples, so set it up inside the message
1610    * reg.
1611    */
1612   {
1613      brw_push_insn_state(p);
1614      brw_set_mask_control(p, BRW_MASK_DISABLE);
1615      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1616
1617      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1618
1619      /* set message header global offset field (reg 0, element 2) */
1620      brw_MOV(p,
1621	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1622				  mrf.nr,
1623				  2), BRW_REGISTER_TYPE_UD),
1624	      brw_imm_ud(offset));
1625
1626      brw_pop_insn_state(p);
1627   }
1628
1629   {
1630      struct brw_reg dest;
1631      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1632      int send_commit_msg;
1633      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
1634					 BRW_REGISTER_TYPE_UW);
1635
1636      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
1637	 insn->header.compression_control = BRW_COMPRESSION_NONE;
1638	 src_header = vec16(src_header);
1639      }
1640      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1641      insn->header.destreg__conditionalmod = mrf.nr;
1642
1643      /* Until gen6, writes followed by reads from the same location
1644       * are not guaranteed to be ordered unless write_commit is set.
1645       * If set, then a no-op write is issued to the destination
1646       * register to set a dependency, and a read from the destination
1647       * can be used to ensure the ordering.
1648       *
1649       * For gen6, only writes between different threads need ordering
1650       * protection.  Our use of DP writes is all about register
1651       * spilling within a thread.
1652       */
1653      if (intel->gen >= 6) {
1654	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1655	 send_commit_msg = 0;
1656      } else {
1657	 dest = src_header;
1658	 send_commit_msg = 1;
1659      }
1660
1661      brw_set_dest(p, insn, dest);
1662      if (intel->gen >= 6) {
1663	 brw_set_src0(p, insn, mrf);
1664      } else {
1665	 brw_set_src0(p, insn, brw_null_reg());
1666      }
1667
1668      if (intel->gen >= 6)
1669	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1670      else
1671	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
1672
1673      brw_set_dp_write_message(p,
1674			       insn,
1675			       255, /* binding table index (255=stateless) */
1676			       msg_control,
1677			       msg_type,
1678			       mlen,
1679			       GL_TRUE, /* header_present */
1680			       0, /* pixel scoreboard */
1681			       send_commit_msg, /* response_length */
1682			       0, /* eot */
1683			       send_commit_msg);
1684   }
1685}
1686
1687
1688/**
 * Read a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset stored in the message header.
1691 *
1692 * Offset must be aligned to oword size (16 bytes).  Used for register
1693 * spilling.
1694 */
1695void
1696brw_oword_block_read_scratch(struct brw_compile *p,
1697			     struct brw_reg dest,
1698			     struct brw_reg mrf,
1699			     int num_regs,
1700			     GLuint offset)
1701{
1702   struct intel_context *intel = &p->brw->intel;
1703   uint32_t msg_control;
1704   int rlen;
1705
1706   if (intel->gen >= 6)
1707      offset /= 16;
1708
1709   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1710   dest = retype(dest, BRW_REGISTER_TYPE_UW);
1711
1712   if (num_regs == 1) {
1713      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
1714      rlen = 1;
1715   } else {
1716      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
1717      rlen = 2;
1718   }
1719
1720   {
1721      brw_push_insn_state(p);
1722      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1723      brw_set_mask_control(p, BRW_MASK_DISABLE);
1724
1725      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1726
1727      /* set message header global offset field (reg 0, element 2) */
1728      brw_MOV(p,
1729	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1730				  mrf.nr,
1731				  2), BRW_REGISTER_TYPE_UD),
1732	      brw_imm_ud(offset));
1733
1734      brw_pop_insn_state(p);
1735   }
1736
1737   {
1738      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1739
1740      assert(insn->header.predicate_control == 0);
1741      insn->header.compression_control = BRW_COMPRESSION_NONE;
1742      insn->header.destreg__conditionalmod = mrf.nr;
1743
1744      brw_set_dest(p, insn, dest);	/* UW? */
1745      if (intel->gen >= 6) {
1746	 brw_set_src0(p, insn, mrf);
1747      } else {
1748	 brw_set_src0(p, insn, brw_null_reg());
1749      }
1750
1751      brw_set_dp_read_message(p,
1752			      insn,
1753			      255, /* binding table index (255=stateless) */
1754			      msg_control,
1755			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1756			      BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
1757			      1, /* msg_length */
1758			      rlen);
1759   }
1760}
1761
1762/**
1763 * Read a float[4] vector from the data port Data Cache (const buffer).
1764 * Location (in buffer) should be a multiple of 16.
1765 * Used for fetching shader constants.
1766 */
1767void brw_oword_block_read(struct brw_compile *p,
1768			  struct brw_reg dest,
1769			  struct brw_reg mrf,
1770			  uint32_t offset,
1771			  uint32_t bind_table_index)
1772{
1773   struct intel_context *intel = &p->brw->intel;
1774
   /* On gen6+, the offset is expressed in owords (16 bytes), not bytes. */
1776   if (intel->gen >= 6)
1777      offset /= 16;
1778
1779   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1780
1781   brw_push_insn_state(p);
1782   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1783   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1784   brw_set_mask_control(p, BRW_MASK_DISABLE);
1785
1786   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1787
1788   /* set message header global offset field (reg 0, element 2) */
1789   brw_MOV(p,
1790	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
1791			       mrf.nr,
1792			       2), BRW_REGISTER_TYPE_UD),
1793	   brw_imm_ud(offset));
1794
1795   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1796   insn->header.destreg__conditionalmod = mrf.nr;
1797
1798   /* cast dest to a uword[8] vector */
1799   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1800
1801   brw_set_dest(p, insn, dest);
1802   if (intel->gen >= 6) {
1803      brw_set_src0(p, insn, mrf);
1804   } else {
1805      brw_set_src0(p, insn, brw_null_reg());
1806   }
1807
1808   brw_set_dp_read_message(p,
1809			   insn,
1810			   bind_table_index,
1811			   BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
1812			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
1813			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1814			   1, /* msg_length */
1815			   1); /* response_length (1 reg, 2 owords!) */
1816
1817   brw_pop_insn_state(p);
1818}
1819
1820/**
1821 * Read a set of dwords from the data port Data Cache (const buffer).
1822 *
1823 * Location (in buffer) appears as UD offsets in the register after
1824 * the provided mrf header reg.
1825 */
1826void brw_dword_scattered_read(struct brw_compile *p,
1827			      struct brw_reg dest,
1828			      struct brw_reg mrf,
1829			      uint32_t bind_table_index)
1830{
1831   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
1832
1833   brw_push_insn_state(p);
1834   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1835   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1836   brw_set_mask_control(p, BRW_MASK_DISABLE);
1837   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1838   brw_pop_insn_state(p);
1839
1840   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1841   insn->header.destreg__conditionalmod = mrf.nr;
1842
1843   /* cast dest to a uword[8] vector */
1844   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1845
1846   brw_set_dest(p, insn, dest);
1847   brw_set_src0(p, insn, brw_null_reg());
1848
1849   brw_set_dp_read_message(p,
1850			   insn,
1851			   bind_table_index,
1852			   BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS,
1853			   BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ,
1854			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1855			   2, /* msg_length */
1856			   1); /* response_length */
1857}
1858
1859
1860
1861/**
 * Read a float[4] constant from the VS constant buffer into the lower half
 * of 'dest'.  (Relative addressing is handled by brw_dp_READ_4_vs_relative
 * below.)
1865 */
1866void brw_dp_READ_4_vs(struct brw_compile *p,
1867                      struct brw_reg dest,
1868                      GLuint location,
1869                      GLuint bind_table_index)
1870{
1871   struct intel_context *intel = &p->brw->intel;
1872   struct brw_instruction *insn;
1873   GLuint msg_reg_nr = 1;
1874
1875   if (intel->gen >= 6)
1876      location /= 16;
1877
1878   /* Setup MRF[1] with location/offset into const buffer */
1879   brw_push_insn_state(p);
1880   brw_set_access_mode(p, BRW_ALIGN_1);
1881   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1882   brw_set_mask_control(p, BRW_MASK_DISABLE);
1883   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1884   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
1885		     BRW_REGISTER_TYPE_UD),
1886	   brw_imm_ud(location));
1887   brw_pop_insn_state(p);
1888
1889   insn = next_insn(p, BRW_OPCODE_SEND);
1890
1891   insn->header.predicate_control = BRW_PREDICATE_NONE;
1892   insn->header.compression_control = BRW_COMPRESSION_NONE;
1893   insn->header.destreg__conditionalmod = msg_reg_nr;
1894   insn->header.mask_control = BRW_MASK_DISABLE;
1895
1896   brw_set_dest(p, insn, dest);
1897   if (intel->gen >= 6) {
1898      brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
1899   } else {
1900      brw_set_src0(p, insn, brw_null_reg());
1901   }
1902
1903   brw_set_dp_read_message(p,
1904			   insn,
1905			   bind_table_index,
1906			   0,
1907			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1908			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1909			   1, /* msg_length */
1910			   1); /* response_length (1 Oword) */
1911}
1912
1913/**
1914 * Read a float[4] constant per vertex from VS constant buffer, with
1915 * relative addressing.
1916 */
1917void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1918			       struct brw_reg dest,
1919			       struct brw_reg addr_reg,
1920			       GLuint offset,
1921			       GLuint bind_table_index)
1922{
1923   struct intel_context *intel = &p->brw->intel;
1924   struct brw_reg src = brw_vec8_grf(0, 0);
1925   int msg_type;
1926
1927   /* Setup MRF[1] with offset into const buffer */
1928   brw_push_insn_state(p);
1929   brw_set_access_mode(p, BRW_ALIGN_1);
1930   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1931   brw_set_mask_control(p, BRW_MASK_DISABLE);
1932   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1933
1934   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1935    * fields ignored.
1936    */
1937   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
1938	   addr_reg, brw_imm_d(offset));
1939   brw_pop_insn_state(p);
1940
1941   gen6_resolve_implied_move(p, &src, 0);
1942   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1943
1944   insn->header.predicate_control = BRW_PREDICATE_NONE;
1945   insn->header.compression_control = BRW_COMPRESSION_NONE;
1946   insn->header.destreg__conditionalmod = 0;
1947   insn->header.mask_control = BRW_MASK_DISABLE;
1948
1949   brw_set_dest(p, insn, dest);
1950   brw_set_src0(p, insn, src);
1951
1952   if (intel->gen == 6)
1953      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1954   else if (intel->gen == 5 || intel->is_g4x)
1955      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1956   else
1957      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1958
1959   brw_set_dp_read_message(p,
1960			   insn,
1961			   bind_table_index,
1962			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1963			   msg_type,
1964			   BRW_DATAPORT_READ_TARGET_DATA_CACHE,
1965			   2, /* msg_length */
1966			   1); /* response_length */
1967}
1968
1969
1970
/**
 * Render target (framebuffer) write message.
 *
 * On gen6+ the color payload is taken directly from MRF 'msg_reg_nr';
 * earlier generations rely on the SEND's implicit move of 'src0' into that
 * MRF.  SENDC is used instead of SEND on gen6+ when writing to binding
 * table index 0.
 */
void brw_fb_WRITE(struct brw_compile *p,
1972		  int dispatch_width,
1973                  GLuint msg_reg_nr,
1974                  struct brw_reg src0,
1975                  GLuint binding_table_index,
1976                  GLuint msg_length,
1977                  GLuint response_length,
1978                  GLboolean eot,
1979                  GLboolean header_present)
1980{
1981   struct intel_context *intel = &p->brw->intel;
1982   struct brw_instruction *insn;
1983   GLuint msg_control, msg_type;
1984   struct brw_reg dest;
1985
1986   if (dispatch_width == 16)
1987      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1988   else
1989      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1990
1991   if (intel->gen >= 6 && binding_table_index == 0) {
1992      insn = next_insn(p, BRW_OPCODE_SENDC);
1993   } else {
1994      insn = next_insn(p, BRW_OPCODE_SEND);
1995   }
1996   /* The execution mask is ignored for render target writes. */
1997   insn->header.predicate_control = 0;
1998   insn->header.compression_control = BRW_COMPRESSION_NONE;
1999
2000   if (intel->gen >= 6) {
2001       /* headerless version, just submit color payload */
2002       src0 = brw_message_reg(msg_reg_nr);
2003
2004       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2005   } else {
2006      insn->header.destreg__conditionalmod = msg_reg_nr;
2007
2008      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2009   }
2010
2011   if (dispatch_width == 16)
2012      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2013   else
2014      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2015
2016   brw_set_dest(p, insn, dest);
2017   brw_set_src0(p, insn, src0);
2018   brw_set_dp_write_message(p,
2019			    insn,
2020			    binding_table_index,
2021			    msg_control,
2022			    msg_type,
2023			    msg_length,
2024			    header_present,
2025			    1,	/* pixel scoreboard */
2026			    response_length,
2027			    eot,
2028			    0 /* send_commit_msg */);
2029}
2030
2031
2032/**
2033 * Texture sample instruction.
2034 * Note: the msg_type plus msg_length values determine exactly what kind
2035 * of sampling operation is performed.  See volume 4, page 161 of docs.
2036 */
2037void brw_SAMPLE(struct brw_compile *p,
2038		struct brw_reg dest,
2039		GLuint msg_reg_nr,
2040		struct brw_reg src0,
2041		GLuint binding_table_index,
2042		GLuint sampler,
2043		GLuint writemask,
2044		GLuint msg_type,
2045		GLuint response_length,
2046		GLuint msg_length,
2047		GLboolean eot,
2048		GLuint header_present,
2049		GLuint simd_mode)
2050{
2051   struct intel_context *intel = &p->brw->intel;
2052   GLboolean need_stall = 0;
2053
2054   if (writemask == 0) {
2055      /*printf("%s: zero writemask??\n", __FUNCTION__); */
2056      return;
2057   }
2058
2059   /* Hardware doesn't do destination dependency checking on send
2060    * instructions properly.  Add a workaround which generates the
2061    * dependency by other means.  In practice it seems like this bug
2062    * only crops up for texture samples, and only where registers are
2063    * written by the send and then written again later without being
2064    * read in between.  Luckily for us, we already track that
2065    * information and use it to modify the writemask for the
2066    * instruction, so that is a guide for whether a workaround is
2067    * needed.
2068    */
2069   if (writemask != WRITEMASK_XYZW) {
2070      GLuint dst_offset = 0;
2071      GLuint i, newmask = 0, len = 0;
2072
2073      for (i = 0; i < 4; i++) {
2074	 if (writemask & (1<<i))
2075	    break;
2076	 dst_offset += 2;
2077      }
2078      for (; i < 4; i++) {
2079	 if (!(writemask & (1<<i)))
2080	    break;
2081	 newmask |= 1<<i;
2082	 len++;
2083      }
2084
2085      if (newmask != writemask) {
2086	 need_stall = 1;
2087         /* printf("need stall %x %x\n", newmask , writemask); */
2088      }
2089      else {
2090	 GLboolean dispatch_16 = GL_FALSE;
2091
2092	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2093
2094	 guess_execution_size(p, p->current, dest);
2095	 if (p->current->header.execution_size == BRW_EXECUTE_16)
2096	    dispatch_16 = GL_TRUE;
2097
2098	 newmask = ~newmask & WRITEMASK_XYZW;
2099
2100	 brw_push_insn_state(p);
2101
2102	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2103	 brw_set_mask_control(p, BRW_MASK_DISABLE);
2104
2105	 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2106		 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2107  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2108
2109	 brw_pop_insn_state(p);
2110
2111  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2112	 dest = offset(dest, dst_offset);
2113
2114	 /* For 16-wide dispatch, masked channels are skipped in the
2115	  * response.  For 8-wide, masked channels still take up slots,
2116	  * and are just not written to.
2117	  */
2118	 if (dispatch_16)
2119	    response_length = len * 2;
2120      }
2121   }
2122
2123   {
2124      struct brw_instruction *insn;
2125
2126      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2127
2128      insn = next_insn(p, BRW_OPCODE_SEND);
2129      insn->header.predicate_control = 0; /* XXX */
2130      insn->header.compression_control = BRW_COMPRESSION_NONE;
2131      if (intel->gen < 6)
2132	  insn->header.destreg__conditionalmod = msg_reg_nr;
2133
2134      brw_set_dest(p, insn, dest);
2135      brw_set_src0(p, insn, src0);
2136      brw_set_sampler_message(p, insn,
2137			      binding_table_index,
2138			      sampler,
2139			      msg_type,
2140			      response_length,
2141			      msg_length,
2142			      eot,
2143			      header_present,
2144			      simd_mode);
2145   }
2146
2147   if (need_stall) {
2148      struct brw_reg reg = vec8(offset(dest, response_length-1));
2149
2150      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
2151       */
2152      brw_push_insn_state(p);
2153      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2154      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2155	      retype(reg, BRW_REGISTER_TYPE_UD));
2156      brw_pop_insn_state(p);
2157   }
2158
2159}
2160
2161/* All these variables are pretty confusing - we might be better off
2162 * using bitmasks and macros for this, in the old style.  Or perhaps
2163 * just having the caller instantiate the fields in dword3 itself.
2164 */
2165void brw_urb_WRITE(struct brw_compile *p,
2166		   struct brw_reg dest,
2167		   GLuint msg_reg_nr,
2168		   struct brw_reg src0,
2169		   GLboolean allocate,
2170		   GLboolean used,
2171		   GLuint msg_length,
2172		   GLuint response_length,
2173		   GLboolean eot,
2174		   GLboolean writes_complete,
2175		   GLuint offset,
2176		   GLuint swizzle)
2177{
2178   struct intel_context *intel = &p->brw->intel;
2179   struct brw_instruction *insn;
2180
2181   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2182
2183   if (intel->gen == 7) {
2184      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2185      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2186		       BRW_REGISTER_TYPE_UD),
2187	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2188		brw_imm_ud(0xff00));
2189   }
2190
2191   insn = next_insn(p, BRW_OPCODE_SEND);
2192
2193   assert(msg_length < BRW_MAX_MRF);
2194
2195   brw_set_dest(p, insn, dest);
2196   brw_set_src0(p, insn, src0);
2197   brw_set_src1(p, insn, brw_imm_d(0));
2198
2199   if (intel->gen < 6)
2200      insn->header.destreg__conditionalmod = msg_reg_nr;
2201
2202   brw_set_urb_message(p,
2203		       insn,
2204		       allocate,
2205		       used,
2206		       msg_length,
2207		       response_length,
2208		       eot,
2209		       writes_complete,
2210		       offset,
2211		       swizzle);
2212}
2213
/* Find the end of the current block: the index of the next ENDIF, ELSE,
 * or WHILE instruction after 'start'.
 */
static int
brw_find_next_block_end(struct brw_compile *p, int start)
2216{
2217   int ip;
2218
2219   for (ip = start + 1; ip < p->nr_insn; ip++) {
2220      struct brw_instruction *insn = &p->store[ip];
2221
2222      switch (insn->header.opcode) {
2223      case BRW_OPCODE_ENDIF:
2224      case BRW_OPCODE_ELSE:
2225      case BRW_OPCODE_WHILE:
2226	 return ip;
2227      }
2228   }
2229   assert(!"not reached");
2230   return start + 1;
2231}
2232
2233/* There is no DO instruction on gen6, so to find the end of the loop
2234 * we have to see if the loop is jumping back before our start
2235 * instruction.
2236 */
2237static int
2238brw_find_loop_end(struct brw_compile *p, int start)
2239{
2240   int ip;
2241   int br = 2;
2242
2243   for (ip = start + 1; ip < p->nr_insn; ip++) {
2244      struct brw_instruction *insn = &p->store[ip];
2245
2246      if (insn->header.opcode == BRW_OPCODE_WHILE) {
2247	 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
2248	    return ip;
2249      }
2250   }
2251   assert(!"not reached");
2252   return start + 1;
2253}
2254
2255/* After program generation, go back and update the UIP and JIP of
2256 * BREAK and CONT instructions to their correct locations.
2257 */
2258void
2259brw_set_uip_jip(struct brw_compile *p)
2260{
2261   struct intel_context *intel = &p->brw->intel;
2262   int ip;
2263   int br = 2;
2264
2265   if (intel->gen < 6)
2266      return;
2267
2268   for (ip = 0; ip < p->nr_insn; ip++) {
2269      struct brw_instruction *insn = &p->store[ip];
2270
2271      switch (insn->header.opcode) {
2272      case BRW_OPCODE_BREAK:
2273	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2274	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
2275	 break;
2276      case BRW_OPCODE_CONTINUE:
2277	 /* JIP is set at CONTINUE emit time, since that's when we
2278	  * know where the start of the loop is.
2279	  */
2280	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
2281	 assert(insn->bits3.break_cont.uip != 0);
2282	 assert(insn->bits3.break_cont.jip != 0);
2283	 break;
2284      }
2285   }
2286}
2287
/**
 * Emit a SEND carrying an FF_SYNC URB message (see
 * brw_set_ff_sync_message), used to synchronize fixed-function thread
 * output to the URB and, if 'allocate' is set, to allocate a URB handle
 * for it.
 */
void brw_ff_sync(struct brw_compile *p,
2289		   struct brw_reg dest,
2290		   GLuint msg_reg_nr,
2291		   struct brw_reg src0,
2292		   GLboolean allocate,
2293		   GLuint response_length,
2294		   GLboolean eot)
2295{
2296   struct intel_context *intel = &p->brw->intel;
2297   struct brw_instruction *insn;
2298
2299   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2300
2301   insn = next_insn(p, BRW_OPCODE_SEND);
2302   brw_set_dest(p, insn, dest);
2303   brw_set_src0(p, insn, src0);
2304   brw_set_src1(p, insn, brw_imm_d(0));
2305
2306   if (intel->gen < 6)
2307       insn->header.destreg__conditionalmod = msg_reg_nr;
2308
2309   brw_set_ff_sync_message(p,
2310			   insn,
2311			   allocate,
2312			   response_length,
2313			   eot);
2314}
2315