brw_eu_emit.c revision 352dff62f8005add9e71e6b5ba3b3321cb953d73
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keith@tungstengraphics.com>
30  */
31
32
33#include "brw_context.h"
34#include "brw_defines.h"
35#include "brw_eu.h"
36
37
38
39
40/***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
44static void guess_execution_size( struct brw_instruction *insn,
45				  struct brw_reg reg )
46{
47   if (reg.width == BRW_WIDTH_8 &&
48       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED)
49      insn->header.execution_size = BRW_EXECUTE_16;
50   else
51      insn->header.execution_size = reg.width;	/* note - definitions are compatible */
52}
53
54
55static void brw_set_dest( struct brw_instruction *insn,
56			  struct brw_reg dest )
57{
58   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
59       dest.file != BRW_MESSAGE_REGISTER_FILE)
60      assert(dest.nr < 128);
61
62   insn->bits1.da1.dest_reg_file = dest.file;
63   insn->bits1.da1.dest_reg_type = dest.type;
64   insn->bits1.da1.dest_address_mode = dest.address_mode;
65
66   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
67      insn->bits1.da1.dest_reg_nr = dest.nr;
68
69      if (insn->header.access_mode == BRW_ALIGN_1) {
70	 insn->bits1.da1.dest_subreg_nr = dest.subnr;
71	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
72	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
73	 insn->bits1.da1.dest_horiz_stride = dest.hstride;
74      }
75      else {
76	 insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
77	 insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
78	 /* even ignored in da16, still need to set as '01' */
79	 insn->bits1.da16.dest_horiz_stride = 1;
80      }
81   }
82   else {
83      insn->bits1.ia1.dest_subreg_nr = dest.subnr;
84
85      /* These are different sizes in align1 vs align16:
86       */
87      if (insn->header.access_mode == BRW_ALIGN_1) {
88	 insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
89	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
90	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
91	 insn->bits1.ia1.dest_horiz_stride = dest.hstride;
92      }
93      else {
94	 insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
95	 /* even ignored in da16, still need to set as '01' */
96	 insn->bits1.ia16.dest_horiz_stride = 1;
97      }
98   }
99
100   /* NEW: Set the execution size based on dest.width and
101    * insn->compression_control:
102    */
103   guess_execution_size(insn, dest);
104}
105
106static void brw_set_src0( struct brw_instruction *insn,
107                          struct brw_reg reg )
108{
109   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
110      assert(reg.nr < 128);
111
112   insn->bits1.da1.src0_reg_file = reg.file;
113   insn->bits1.da1.src0_reg_type = reg.type;
114   insn->bits2.da1.src0_abs = reg.abs;
115   insn->bits2.da1.src0_negate = reg.negate;
116   insn->bits2.da1.src0_address_mode = reg.address_mode;
117
118   if (reg.file == BRW_IMMEDIATE_VALUE) {
119      insn->bits3.ud = reg.dw1.ud;
120
121      /* Required to set some fields in src1 as well:
122       */
123      insn->bits1.da1.src1_reg_file = 0; /* arf */
124      insn->bits1.da1.src1_reg_type = reg.type;
125   }
126   else
127   {
128      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
129	 if (insn->header.access_mode == BRW_ALIGN_1) {
130	    insn->bits2.da1.src0_subreg_nr = reg.subnr;
131	    insn->bits2.da1.src0_reg_nr = reg.nr;
132	 }
133	 else {
134	    insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
135	    insn->bits2.da16.src0_reg_nr = reg.nr;
136	 }
137      }
138      else {
139	 insn->bits2.ia1.src0_subreg_nr = reg.subnr;
140
141	 if (insn->header.access_mode == BRW_ALIGN_1) {
142	    insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
143	 }
144	 else {
145	    insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
146	 }
147      }
148
149      if (insn->header.access_mode == BRW_ALIGN_1) {
150	 if (reg.width == BRW_WIDTH_1 &&
151	     insn->header.execution_size == BRW_EXECUTE_1) {
152	    insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
153	    insn->bits2.da1.src0_width = BRW_WIDTH_1;
154	    insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
155	 }
156	 else {
157	    insn->bits2.da1.src0_horiz_stride = reg.hstride;
158	    insn->bits2.da1.src0_width = reg.width;
159	    insn->bits2.da1.src0_vert_stride = reg.vstride;
160	 }
161      }
162      else {
163	 insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
164	 insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
165	 insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
166	 insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
167
168	 /* This is an oddity of the fact we're using the same
169	  * descriptions for registers in align_16 as align_1:
170	  */
171	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
172	    insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
173	 else
174	    insn->bits2.da16.src0_vert_stride = reg.vstride;
175      }
176   }
177}
178
179
180void brw_set_src1( struct brw_instruction *insn,
181                   struct brw_reg reg )
182{
183   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
184
185   assert(reg.nr < 128);
186
187   insn->bits1.da1.src1_reg_file = reg.file;
188   insn->bits1.da1.src1_reg_type = reg.type;
189   insn->bits3.da1.src1_abs = reg.abs;
190   insn->bits3.da1.src1_negate = reg.negate;
191
192   /* Only src1 can be immediate in two-argument instructions.
193    */
194   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);
195
196   if (reg.file == BRW_IMMEDIATE_VALUE) {
197      insn->bits3.ud = reg.dw1.ud;
198   }
199   else {
200      /* This is a hardware restriction, which may or may not be lifted
201       * in the future:
202       */
203      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
204      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
205
206      if (insn->header.access_mode == BRW_ALIGN_1) {
207	 insn->bits3.da1.src1_subreg_nr = reg.subnr;
208	 insn->bits3.da1.src1_reg_nr = reg.nr;
209      }
210      else {
211	 insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
212	 insn->bits3.da16.src1_reg_nr = reg.nr;
213      }
214
215      if (insn->header.access_mode == BRW_ALIGN_1) {
216	 if (reg.width == BRW_WIDTH_1 &&
217	     insn->header.execution_size == BRW_EXECUTE_1) {
218	    insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
219	    insn->bits3.da1.src1_width = BRW_WIDTH_1;
220	    insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
221	 }
222	 else {
223	    insn->bits3.da1.src1_horiz_stride = reg.hstride;
224	    insn->bits3.da1.src1_width = reg.width;
225	    insn->bits3.da1.src1_vert_stride = reg.vstride;
226	 }
227      }
228      else {
229	 insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
230	 insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
231	 insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
232	 insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);
233
234	 /* This is an oddity of the fact we're using the same
235	  * descriptions for registers in align_16 as align_1:
236	  */
237	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
238	    insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
239	 else
240	    insn->bits3.da16.src1_vert_stride = reg.vstride;
241      }
242   }
243}
244
245
246
247static void brw_set_math_message( struct brw_context *brw,
248				  struct brw_instruction *insn,
249				  GLuint msg_length,
250				  GLuint response_length,
251				  GLuint function,
252				  GLuint integer_type,
253				  GLboolean low_precision,
254				  GLboolean saturate,
255				  GLuint dataType )
256{
257   struct intel_context *intel = &brw->intel;
258   brw_set_src1(insn, brw_imm_d(0));
259
260   if (intel->gen == 5) {
261       insn->bits3.math_gen5.function = function;
262       insn->bits3.math_gen5.int_type = integer_type;
263       insn->bits3.math_gen5.precision = low_precision;
264       insn->bits3.math_gen5.saturate = saturate;
265       insn->bits3.math_gen5.data_type = dataType;
266       insn->bits3.math_gen5.snapshot = 0;
267       insn->bits3.math_gen5.header_present = 0;
268       insn->bits3.math_gen5.response_length = response_length;
269       insn->bits3.math_gen5.msg_length = msg_length;
270       insn->bits3.math_gen5.end_of_thread = 0;
271       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_MATH;
272       insn->bits2.send_gen5.end_of_thread = 0;
273   } else {
274       insn->bits3.math.function = function;
275       insn->bits3.math.int_type = integer_type;
276       insn->bits3.math.precision = low_precision;
277       insn->bits3.math.saturate = saturate;
278       insn->bits3.math.data_type = dataType;
279       insn->bits3.math.response_length = response_length;
280       insn->bits3.math.msg_length = msg_length;
281       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
282       insn->bits3.math.end_of_thread = 0;
283   }
284}
285
286
287static void brw_set_ff_sync_message(struct brw_context *brw,
288				    struct brw_instruction *insn,
289				    GLboolean allocate,
290				    GLuint response_length,
291				    GLboolean end_of_thread)
292{
293	struct intel_context *intel = &brw->intel;
294	brw_set_src1(insn, brw_imm_d(0));
295
296	insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
297	insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
298	insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
299	insn->bits3.urb_gen5.allocate = allocate;
300	insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
301	insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
302	insn->bits3.urb_gen5.header_present = 1;
303	insn->bits3.urb_gen5.response_length = response_length; /* may be 1 or 0 */
304	insn->bits3.urb_gen5.msg_length = 1;
305	insn->bits3.urb_gen5.end_of_thread = end_of_thread;
306	if (intel->gen >= 6) {
307	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
308	} else {
309	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
310	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
311	}
312}
313
314static void brw_set_urb_message( struct brw_context *brw,
315				 struct brw_instruction *insn,
316				 GLboolean allocate,
317				 GLboolean used,
318				 GLuint msg_length,
319				 GLuint response_length,
320				 GLboolean end_of_thread,
321				 GLboolean complete,
322				 GLuint offset,
323				 GLuint swizzle_control )
324{
325    struct intel_context *intel = &brw->intel;
326    brw_set_src1(insn, brw_imm_d(0));
327
328    if (intel->gen >= 5) {
329        insn->bits3.urb_gen5.opcode = 0;	/* ? */
330        insn->bits3.urb_gen5.offset = offset;
331        insn->bits3.urb_gen5.swizzle_control = swizzle_control;
332        insn->bits3.urb_gen5.allocate = allocate;
333        insn->bits3.urb_gen5.used = used;	/* ? */
334        insn->bits3.urb_gen5.complete = complete;
335        insn->bits3.urb_gen5.header_present = 1;
336        insn->bits3.urb_gen5.response_length = response_length;
337        insn->bits3.urb_gen5.msg_length = msg_length;
338        insn->bits3.urb_gen5.end_of_thread = end_of_thread;
339	if (intel->gen >= 6) {
340	   /* For SNB, the SFID bits moved to the condmod bits, and
341	    * EOT stayed in bits3 above.  Does the EOT bit setting
342	    * below on Ironlake even do anything?
343	    */
344	   insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_URB;
345	} else {
346	   insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_URB;
347	   insn->bits2.send_gen5.end_of_thread = end_of_thread;
348	}
349    } else {
350        insn->bits3.urb.opcode = 0;	/* ? */
351        insn->bits3.urb.offset = offset;
352        insn->bits3.urb.swizzle_control = swizzle_control;
353        insn->bits3.urb.allocate = allocate;
354        insn->bits3.urb.used = used;	/* ? */
355        insn->bits3.urb.complete = complete;
356        insn->bits3.urb.response_length = response_length;
357        insn->bits3.urb.msg_length = msg_length;
358        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
359        insn->bits3.urb.end_of_thread = end_of_thread;
360    }
361}
362
363static void brw_set_dp_write_message( struct brw_context *brw,
364				      struct brw_instruction *insn,
365				      GLuint binding_table_index,
366				      GLuint msg_control,
367				      GLuint msg_type,
368				      GLuint msg_length,
369				      GLuint pixel_scoreboard_clear,
370				      GLuint response_length,
371				      GLuint end_of_thread,
372				      GLuint send_commit_msg)
373{
374   struct intel_context *intel = &brw->intel;
375   brw_set_src1(insn, brw_imm_ud(0));
376
377   if (intel->gen >= 6) {
378       insn->bits3.dp_render_cache.binding_table_index = binding_table_index;
379       insn->bits3.dp_render_cache.msg_control = msg_control;
380       insn->bits3.dp_render_cache.pixel_scoreboard_clear = pixel_scoreboard_clear;
381       insn->bits3.dp_render_cache.msg_type = msg_type;
382       insn->bits3.dp_render_cache.send_commit_msg = send_commit_msg;
383       insn->bits3.dp_render_cache.header_present = 0; /* XXX */
384       insn->bits3.dp_render_cache.response_length = response_length;
385       insn->bits3.dp_render_cache.msg_length = msg_length;
386       insn->bits3.dp_render_cache.end_of_thread = end_of_thread;
387       insn->header.destreg__conditionalmod = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
388	/* XXX really need below? */
389       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
390       insn->bits2.send_gen5.end_of_thread = end_of_thread;
391   } else if (intel->gen == 5) {
392       insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
393       insn->bits3.dp_write_gen5.msg_control = msg_control;
394       insn->bits3.dp_write_gen5.pixel_scoreboard_clear = pixel_scoreboard_clear;
395       insn->bits3.dp_write_gen5.msg_type = msg_type;
396       insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
397       insn->bits3.dp_write_gen5.header_present = 1;
398       insn->bits3.dp_write_gen5.response_length = response_length;
399       insn->bits3.dp_write_gen5.msg_length = msg_length;
400       insn->bits3.dp_write_gen5.end_of_thread = end_of_thread;
401       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
402       insn->bits2.send_gen5.end_of_thread = end_of_thread;
403   } else {
404       insn->bits3.dp_write.binding_table_index = binding_table_index;
405       insn->bits3.dp_write.msg_control = msg_control;
406       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
407       insn->bits3.dp_write.msg_type = msg_type;
408       insn->bits3.dp_write.send_commit_msg = send_commit_msg;
409       insn->bits3.dp_write.response_length = response_length;
410       insn->bits3.dp_write.msg_length = msg_length;
411       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
412       insn->bits3.dp_write.end_of_thread = end_of_thread;
413   }
414}
415
416static void brw_set_dp_read_message( struct brw_context *brw,
417				      struct brw_instruction *insn,
418				      GLuint binding_table_index,
419				      GLuint msg_control,
420				      GLuint msg_type,
421				      GLuint target_cache,
422				      GLuint msg_length,
423				      GLuint response_length,
424				      GLuint end_of_thread )
425{
426   struct intel_context *intel = &brw->intel;
427   brw_set_src1(insn, brw_imm_d(0));
428
429   if (intel->gen == 5) {
430       insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
431       insn->bits3.dp_read_gen5.msg_control = msg_control;
432       insn->bits3.dp_read_gen5.msg_type = msg_type;
433       insn->bits3.dp_read_gen5.target_cache = target_cache;
434       insn->bits3.dp_read_gen5.header_present = 1;
435       insn->bits3.dp_read_gen5.response_length = response_length;
436       insn->bits3.dp_read_gen5.msg_length = msg_length;
437       insn->bits3.dp_read_gen5.pad1 = 0;
438       insn->bits3.dp_read_gen5.end_of_thread = end_of_thread;
439       insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
440       insn->bits2.send_gen5.end_of_thread = end_of_thread;
441   } else {
442       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
443       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
444       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
445       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
446       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
447       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
448       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
449       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
450       insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
451   }
452}
453
454static void brw_set_sampler_message(struct brw_context *brw,
455                                    struct brw_instruction *insn,
456                                    GLuint binding_table_index,
457                                    GLuint sampler,
458                                    GLuint msg_type,
459                                    GLuint response_length,
460                                    GLuint msg_length,
461                                    GLboolean eot,
462                                    GLuint header_present,
463                                    GLuint simd_mode)
464{
465   struct intel_context *intel = &brw->intel;
466   assert(eot == 0);
467   brw_set_src1(insn, brw_imm_d(0));
468
469   if (intel->gen == 5) {
470      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
471      insn->bits3.sampler_gen5.sampler = sampler;
472      insn->bits3.sampler_gen5.msg_type = msg_type;
473      insn->bits3.sampler_gen5.simd_mode = simd_mode;
474      insn->bits3.sampler_gen5.header_present = header_present;
475      insn->bits3.sampler_gen5.response_length = response_length;
476      insn->bits3.sampler_gen5.msg_length = msg_length;
477      insn->bits3.sampler_gen5.end_of_thread = eot;
478      insn->bits2.send_gen5.sfid = BRW_MESSAGE_TARGET_SAMPLER;
479      insn->bits2.send_gen5.end_of_thread = eot;
480   } else if (intel->is_g4x) {
481      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
482      insn->bits3.sampler_g4x.sampler = sampler;
483      insn->bits3.sampler_g4x.msg_type = msg_type;
484      insn->bits3.sampler_g4x.response_length = response_length;
485      insn->bits3.sampler_g4x.msg_length = msg_length;
486      insn->bits3.sampler_g4x.end_of_thread = eot;
487      insn->bits3.sampler_g4x.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
488   } else {
489      insn->bits3.sampler.binding_table_index = binding_table_index;
490      insn->bits3.sampler.sampler = sampler;
491      insn->bits3.sampler.msg_type = msg_type;
492      insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
493      insn->bits3.sampler.response_length = response_length;
494      insn->bits3.sampler.msg_length = msg_length;
495      insn->bits3.sampler.end_of_thread = eot;
496      insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER;
497   }
498}
499
500
501
502static struct brw_instruction *next_insn( struct brw_compile *p,
503					  GLuint opcode )
504{
505   struct brw_instruction *insn;
506
507   assert(p->nr_insn + 1 < BRW_EU_MAX_INSN);
508
509   insn = &p->store[p->nr_insn++];
510   memcpy(insn, p->current, sizeof(*insn));
511
512   /* Reset this one-shot flag:
513    */
514
515   if (p->current->header.destreg__conditionalmod) {
516      p->current->header.destreg__conditionalmod = 0;
517      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
518   }
519
520   insn->header.opcode = opcode;
521   return insn;
522}
523
524
525static struct brw_instruction *brw_alu1( struct brw_compile *p,
526					 GLuint opcode,
527					 struct brw_reg dest,
528					 struct brw_reg src )
529{
530   struct brw_instruction *insn = next_insn(p, opcode);
531   brw_set_dest(insn, dest);
532   brw_set_src0(insn, src);
533   return insn;
534}
535
536static struct brw_instruction *brw_alu2(struct brw_compile *p,
537					GLuint opcode,
538					struct brw_reg dest,
539					struct brw_reg src0,
540					struct brw_reg src1 )
541{
542   struct brw_instruction *insn = next_insn(p, opcode);
543   brw_set_dest(insn, dest);
544   brw_set_src0(insn, src0);
545   brw_set_src1(insn, src1);
546   return insn;
547}
548
549
550/***********************************************************************
551 * Convenience routines.
552 */
/* ALU1(OP) defines the public emitter brw_<OP>(p, dest, src0), which
 * forwards to brw_alu1() with opcode BRW_OPCODE_<OP>.
 */
#define ALU1(OP)                                                \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0)           \
{                                                               \
   struct brw_instruction *insn =                               \
      brw_alu1(p, BRW_OPCODE_##OP, dest, src0);                 \
   return insn;                                                 \
}
560
/* ALU2(OP) defines the public emitter brw_<OP>(p, dest, src0, src1),
 * which forwards to brw_alu2() with opcode BRW_OPCODE_<OP>.
 */
#define ALU2(OP)                                                \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1)           \
{                                                               \
   struct brw_instruction *insn =                               \
      brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);           \
   return insn;                                                 \
}
569
570
571ALU1(MOV)
572ALU2(SEL)
573ALU1(NOT)
574ALU2(AND)
575ALU2(OR)
576ALU2(XOR)
577ALU2(SHR)
578ALU2(SHL)
579ALU2(RSR)
580ALU2(RSL)
581ALU2(ASR)
582ALU2(ADD)
583ALU2(MUL)
584ALU1(FRC)
585ALU1(RNDD)
586ALU1(RNDZ)
587ALU2(MAC)
588ALU2(MACH)
589ALU1(LZD)
590ALU2(DP4)
591ALU2(DPH)
592ALU2(DP3)
593ALU2(DP2)
594ALU2(LINE)
595ALU2(PLN)
596
597
598
599void brw_NOP(struct brw_compile *p)
600{
601   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
602   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
603   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
604   brw_set_src1(insn, brw_imm_ud(0x0));
605}
606
607
608
609
610
611/***********************************************************************
612 * Comparisons, if/else/endif
613 */
614
615struct brw_instruction *brw_JMPI(struct brw_compile *p,
616                                 struct brw_reg dest,
617                                 struct brw_reg src0,
618                                 struct brw_reg src1)
619{
620   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
621
622   insn->header.execution_size = 1;
623   insn->header.compression_control = BRW_COMPRESSION_NONE;
624   insn->header.mask_control = BRW_MASK_DISABLE;
625
626   p->current->header.predicate_control = BRW_PREDICATE_NONE;
627
628   return insn;
629}
630
/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 *
 * No attempt is made to deal with stack overflow (14 elements?).
 */
646struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
647{
648   struct brw_instruction *insn;
649
650   if (p->single_program_flow) {
651      assert(execute_size == BRW_EXECUTE_1);
652
653      insn = next_insn(p, BRW_OPCODE_ADD);
654      insn->header.predicate_inverse = 1;
655   } else {
656      insn = next_insn(p, BRW_OPCODE_IF);
657   }
658
659   /* Override the defaults for this instruction:
660    */
661   brw_set_dest(insn, brw_ip_reg());
662   brw_set_src0(insn, brw_ip_reg());
663   brw_set_src1(insn, brw_imm_d(0x0));
664
665   insn->header.execution_size = execute_size;
666   insn->header.compression_control = BRW_COMPRESSION_NONE;
667   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
668   insn->header.mask_control = BRW_MASK_ENABLE;
669   if (!p->single_program_flow)
670       insn->header.thread_control = BRW_THREAD_SWITCH;
671
672   p->current->header.predicate_control = BRW_PREDICATE_NONE;
673
674   return insn;
675}
676
677
678struct brw_instruction *brw_ELSE(struct brw_compile *p,
679				 struct brw_instruction *if_insn)
680{
681   struct intel_context *intel = &p->brw->intel;
682   struct brw_instruction *insn;
683   GLuint br = 1;
684
685   if (intel->gen == 5)
686      br = 2;
687
688   if (p->single_program_flow) {
689      insn = next_insn(p, BRW_OPCODE_ADD);
690   } else {
691      insn = next_insn(p, BRW_OPCODE_ELSE);
692   }
693
694   brw_set_dest(insn, brw_ip_reg());
695   brw_set_src0(insn, brw_ip_reg());
696   brw_set_src1(insn, brw_imm_d(0x0));
697
698   insn->header.compression_control = BRW_COMPRESSION_NONE;
699   insn->header.execution_size = if_insn->header.execution_size;
700   insn->header.mask_control = BRW_MASK_ENABLE;
701   if (!p->single_program_flow)
702       insn->header.thread_control = BRW_THREAD_SWITCH;
703
704   /* Patch the if instruction to point at this instruction.
705    */
706   if (p->single_program_flow) {
707      assert(if_insn->header.opcode == BRW_OPCODE_ADD);
708
709      if_insn->bits3.ud = (insn - if_insn + 1) * 16;
710   } else {
711      assert(if_insn->header.opcode == BRW_OPCODE_IF);
712
713      if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
714      if_insn->bits3.if_else.pop_count = 0;
715      if_insn->bits3.if_else.pad0 = 0;
716   }
717
718   return insn;
719}
720
721void brw_ENDIF(struct brw_compile *p,
722	       struct brw_instruction *patch_insn)
723{
724   struct intel_context *intel = &p->brw->intel;
725   GLuint br = 1;
726
727   if (intel->gen == 5)
728      br = 2;
729
730   if (p->single_program_flow) {
731      /* In single program flow mode, there's no need to execute an ENDIF,
732       * since we don't need to do any stack operations, and if we're executing
733       * currently, we want to just continue executing.
734       */
735      struct brw_instruction *next = &p->store[p->nr_insn];
736
737      assert(patch_insn->header.opcode == BRW_OPCODE_ADD);
738
739      patch_insn->bits3.ud = (next - patch_insn) * 16;
740   } else {
741      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
742
743      brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
744      brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
745      brw_set_src1(insn, brw_imm_d(0x0));
746
747      insn->header.compression_control = BRW_COMPRESSION_NONE;
748      insn->header.execution_size = patch_insn->header.execution_size;
749      insn->header.mask_control = BRW_MASK_ENABLE;
750      insn->header.thread_control = BRW_THREAD_SWITCH;
751
752      assert(patch_insn->bits3.if_else.jump_count == 0);
753
754      /* Patch the if or else instructions to point at this or the next
755       * instruction respectively.
756       */
757      if (patch_insn->header.opcode == BRW_OPCODE_IF) {
758	 /* Automagically turn it into an IFF:
759	  */
760	 patch_insn->header.opcode = BRW_OPCODE_IFF;
761	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
762	 patch_insn->bits3.if_else.pop_count = 0;
763	 patch_insn->bits3.if_else.pad0 = 0;
764      } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
765	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
766	 patch_insn->bits3.if_else.pop_count = 1;
767	 patch_insn->bits3.if_else.pad0 = 0;
768      } else {
769	 assert(0);
770      }
771
772      /* Also pop item off the stack in the endif instruction:
773       */
774      insn->bits3.if_else.jump_count = 0;
775      insn->bits3.if_else.pop_count = 1;
776      insn->bits3.if_else.pad0 = 0;
777   }
778}
779
780struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
781{
782   struct brw_instruction *insn;
783   insn = next_insn(p, BRW_OPCODE_BREAK);
784   brw_set_dest(insn, brw_ip_reg());
785   brw_set_src0(insn, brw_ip_reg());
786   brw_set_src1(insn, brw_imm_d(0x0));
787   insn->header.compression_control = BRW_COMPRESSION_NONE;
788   insn->header.execution_size = BRW_EXECUTE_8;
789   /* insn->header.mask_control = BRW_MASK_DISABLE; */
790   insn->bits3.if_else.pad0 = 0;
791   insn->bits3.if_else.pop_count = pop_count;
792   return insn;
793}
794
795struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
796{
797   struct brw_instruction *insn;
798   insn = next_insn(p, BRW_OPCODE_CONTINUE);
799   brw_set_dest(insn, brw_ip_reg());
800   brw_set_src0(insn, brw_ip_reg());
801   brw_set_src1(insn, brw_imm_d(0x0));
802   insn->header.compression_control = BRW_COMPRESSION_NONE;
803   insn->header.execution_size = BRW_EXECUTE_8;
804   /* insn->header.mask_control = BRW_MASK_DISABLE; */
805   insn->bits3.if_else.pad0 = 0;
806   insn->bits3.if_else.pop_count = pop_count;
807   return insn;
808}
809
810/* DO/WHILE loop:
811 */
812struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
813{
814   if (p->single_program_flow) {
815      return &p->store[p->nr_insn];
816   } else {
817      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
818
819      /* Override the defaults for this instruction:
820       */
821      brw_set_dest(insn, brw_null_reg());
822      brw_set_src0(insn, brw_null_reg());
823      brw_set_src1(insn, brw_null_reg());
824
825      insn->header.compression_control = BRW_COMPRESSION_NONE;
826      insn->header.execution_size = execute_size;
827      insn->header.predicate_control = BRW_PREDICATE_NONE;
828      /* insn->header.mask_control = BRW_MASK_ENABLE; */
829      /* insn->header.mask_control = BRW_MASK_DISABLE; */
830
831      return insn;
832   }
833}
834
835
836
837struct brw_instruction *brw_WHILE(struct brw_compile *p,
838                                  struct brw_instruction *do_insn)
839{
840   struct intel_context *intel = &p->brw->intel;
841   struct brw_instruction *insn;
842   GLuint br = 1;
843
844   if (intel->gen == 5)
845      br = 2;
846
847   if (p->single_program_flow)
848      insn = next_insn(p, BRW_OPCODE_ADD);
849   else
850      insn = next_insn(p, BRW_OPCODE_WHILE);
851
852   brw_set_dest(insn, brw_ip_reg());
853   brw_set_src0(insn, brw_ip_reg());
854   brw_set_src1(insn, brw_imm_d(0x0));
855
856   insn->header.compression_control = BRW_COMPRESSION_NONE;
857
858   if (p->single_program_flow) {
859      insn->header.execution_size = BRW_EXECUTE_1;
860
861      insn->bits3.d = (do_insn - insn) * 16;
862   } else {
863      insn->header.execution_size = do_insn->header.execution_size;
864
865      assert(do_insn->header.opcode == BRW_OPCODE_DO);
866      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
867      insn->bits3.if_else.pop_count = 0;
868      insn->bits3.if_else.pad0 = 0;
869   }
870
871/*    insn->header.mask_control = BRW_MASK_ENABLE; */
872
873   /* insn->header.mask_control = BRW_MASK_DISABLE; */
874   p->current->header.predicate_control = BRW_PREDICATE_NONE;
875   return insn;
876}
877
878
879/* FORWARD JUMPS:
880 */
881void brw_land_fwd_jump(struct brw_compile *p,
882		       struct brw_instruction *jmp_insn)
883{
884   struct intel_context *intel = &p->brw->intel;
885   struct brw_instruction *landing = &p->store[p->nr_insn];
886   GLuint jmpi = 1;
887
888   if (intel->gen == 5)
889       jmpi = 2;
890
891   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
892   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
893
894   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
895}
896
897
898
899/* To integrate with the above, it makes sense that the comparison
900 * instruction should populate the flag register.  It might be simpler
901 * just to use the flag reg for most WM tasks?
902 */
903void brw_CMP(struct brw_compile *p,
904	     struct brw_reg dest,
905	     GLuint conditional,
906	     struct brw_reg src0,
907	     struct brw_reg src1)
908{
909   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
910
911   insn->header.destreg__conditionalmod = conditional;
912   brw_set_dest(insn, dest);
913   brw_set_src0(insn, src0);
914   brw_set_src1(insn, src1);
915
916/*    guess_execution_size(insn, src0); */
917
918
919   /* Make it so that future instructions will use the computed flag
920    * value until brw_set_predicate_control_flag_value() is called
921    * again.
922    */
923   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
924       dest.nr == 0) {
925      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
926      p->flag_value = 0xff;
927   }
928}
929
930/* Issue 'wait' instruction for n1, host could program MMIO
931   to wake up thread. */
932void brw_WAIT (struct brw_compile *p)
933{
934   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
935   struct brw_reg src = brw_notification_1_reg();
936
937   brw_set_dest(insn, src);
938   brw_set_src0(insn, src);
939   brw_set_src1(insn, brw_null_reg());
940   insn->header.execution_size = 0; /* must */
941   insn->header.predicate_control = 0;
942   insn->header.compression_control = 0;
943}
944
945
946/***********************************************************************
947 * Helpers for the various SEND message types:
948 */
949
950/** Extended math function, float[8].
951 */
952void brw_math( struct brw_compile *p,
953	       struct brw_reg dest,
954	       GLuint function,
955	       GLuint saturate,
956	       GLuint msg_reg_nr,
957	       struct brw_reg src,
958	       GLuint data_type,
959	       GLuint precision )
960{
961   struct intel_context *intel = &p->brw->intel;
962
963   if (intel->gen >= 6) {
964      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
965
966      /* Math is the same ISA format as other opcodes, except that CondModifier
967       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
968       */
969      insn->header.destreg__conditionalmod = function;
970
971      brw_set_dest(insn, dest);
972      brw_set_src0(insn, src);
973      brw_set_src1(insn, brw_null_reg());
974   } else {
975      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
976      GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
977      GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
978      /* Example code doesn't set predicate_control for send
979       * instructions.
980       */
981      insn->header.predicate_control = 0;
982      insn->header.destreg__conditionalmod = msg_reg_nr;
983
984      brw_set_dest(insn, dest);
985      brw_set_src0(insn, src);
986      brw_set_math_message(p->brw,
987			   insn,
988			   msg_length, response_length,
989			   function,
990			   BRW_MATH_INTEGER_UNSIGNED,
991			   precision,
992			   saturate,
993			   data_type);
994   }
995}
996
997/**
998 * Extended math function, float[16].
999 * Use 2 send instructions.
1000 */
1001void brw_math_16( struct brw_compile *p,
1002		  struct brw_reg dest,
1003		  GLuint function,
1004		  GLuint saturate,
1005		  GLuint msg_reg_nr,
1006		  struct brw_reg src,
1007		  GLuint precision )
1008{
1009   struct intel_context *intel = &p->brw->intel;
1010   struct brw_instruction *insn;
1011   GLuint msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
1012   GLuint response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
1013
1014   if (intel->gen >= 6) {
1015      insn = next_insn(p, BRW_OPCODE_MATH);
1016
1017      /* Math is the same ISA format as other opcodes, except that CondModifier
1018       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1019       */
1020      insn->header.destreg__conditionalmod = function;
1021
1022      brw_set_dest(insn, dest);
1023      brw_set_src0(insn, src);
1024      brw_set_src1(insn, brw_null_reg());
1025      return;
1026   }
1027
1028   /* First instruction:
1029    */
1030   brw_push_insn_state(p);
1031   brw_set_predicate_control_flag_value(p, 0xff);
1032   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1033
1034   insn = next_insn(p, BRW_OPCODE_SEND);
1035   insn->header.destreg__conditionalmod = msg_reg_nr;
1036
1037   brw_set_dest(insn, dest);
1038   brw_set_src0(insn, src);
1039   brw_set_math_message(p->brw,
1040			insn,
1041			msg_length, response_length,
1042			function,
1043			BRW_MATH_INTEGER_UNSIGNED,
1044			precision,
1045			saturate,
1046			BRW_MATH_DATA_VECTOR);
1047
1048   /* Second instruction:
1049    */
1050   insn = next_insn(p, BRW_OPCODE_SEND);
1051   insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
1052   insn->header.destreg__conditionalmod = msg_reg_nr+1;
1053
1054   brw_set_dest(insn, offset(dest,1));
1055   brw_set_src0(insn, src);
1056   brw_set_math_message(p->brw,
1057			insn,
1058			msg_length, response_length,
1059			function,
1060			BRW_MATH_INTEGER_UNSIGNED,
1061			precision,
1062			saturate,
1063			BRW_MATH_DATA_VECTOR);
1064
1065   brw_pop_insn_state(p);
1066}
1067
1068
1069/**
1070 * Write block of 16 dwords/floats to the data port Render Cache scratch buffer.
1071 * Scratch offset should be a multiple of 64.
1072 * Used for register spilling.
1073 */
1074void brw_dp_WRITE_16( struct brw_compile *p,
1075		      struct brw_reg src,
1076		      GLuint scratch_offset )
1077{
1078   struct intel_context *intel = &p->brw->intel;
1079   GLuint msg_reg_nr = 1;
1080   {
1081      brw_push_insn_state(p);
1082      brw_set_mask_control(p, BRW_MASK_DISABLE);
1083      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1084
1085      /* set message header global offset field (reg 0, element 2) */
1086      brw_MOV(p,
1087	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
1088	      brw_imm_d(scratch_offset));
1089
1090      brw_pop_insn_state(p);
1091   }
1092
1093   {
1094      GLuint msg_length = 3;
1095      struct brw_reg dest;
1096      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1097      int send_commit_msg;
1098
1099      insn->header.predicate_control = 0; /* XXX */
1100      insn->header.compression_control = BRW_COMPRESSION_NONE;
1101      insn->header.destreg__conditionalmod = msg_reg_nr;
1102
1103      /* Until gen6, writes followed by reads from the same location
1104       * are not guaranteed to be ordered unless write_commit is set.
1105       * If set, then a no-op write is issued to the destination
1106       * register to set a dependency, and a read from the destination
1107       * can be used to ensure the ordering.
1108       *
1109       * For gen6, only writes between different threads need ordering
1110       * protection.  Our use of DP writes is all about register
1111       * spilling within a thread.
1112       */
1113      if (intel->gen >= 6) {
1114	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
1115	 send_commit_msg = 0;
1116      } else {
1117	 dest = brw_uw16_grf(0, 0);
1118	 send_commit_msg = 1;
1119      }
1120
1121      brw_set_dest(insn, dest);
1122      brw_set_src0(insn, src);
1123
1124      brw_set_dp_write_message(p->brw,
1125			       insn,
1126			       255, /* binding table index (255=stateless) */
1127			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
1128			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
1129			       msg_length,
1130			       0, /* pixel scoreboard */
1131			       send_commit_msg, /* response_length */
1132			       0, /* eot */
1133			       send_commit_msg);
1134   }
1135}
1136
1137
1138/**
1139 * Read block of 16 dwords/floats from the data port Render Cache scratch buffer.
1140 * Scratch offset should be a multiple of 64.
1141 * Used for register spilling.
1142 */
1143void brw_dp_READ_16( struct brw_compile *p,
1144		      struct brw_reg dest,
1145		      GLuint scratch_offset )
1146{
1147   GLuint msg_reg_nr = 1;
1148   {
1149      brw_push_insn_state(p);
1150      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1151      brw_set_mask_control(p, BRW_MASK_DISABLE);
1152
1153      /* set message header global offset field (reg 0, element 2) */
1154      brw_MOV(p,
1155	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
1156	      brw_imm_d(scratch_offset));
1157
1158      brw_pop_insn_state(p);
1159   }
1160
1161   {
1162      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1163
1164      insn->header.predicate_control = 0; /* XXX */
1165      insn->header.compression_control = BRW_COMPRESSION_NONE;
1166      insn->header.destreg__conditionalmod = msg_reg_nr;
1167
1168      brw_set_dest(insn, dest);	/* UW? */
1169      brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
1170
1171      brw_set_dp_read_message(p->brw,
1172			      insn,
1173			      255, /* binding table index (255=stateless) */
1174			      BRW_DATAPORT_OWORD_BLOCK_4_OWORDS,
1175			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1176			      1, /* target cache (render/scratch) */
1177			      1, /* msg_length */
1178			      2, /* response_length */
1179			      0); /* eot */
1180   }
1181}
1182
1183
1184/**
1185 * Read a float[4] vector from the data port Data Cache (const buffer).
1186 * Location (in buffer) should be a multiple of 16.
1187 * Used for fetching shader constants.
1188 * If relAddr is true, we'll do an indirect fetch using the address register.
1189 */
1190void brw_dp_READ_4( struct brw_compile *p,
1191                    struct brw_reg dest,
1192                    GLboolean relAddr,
1193                    GLuint location,
1194                    GLuint bind_table_index )
1195{
1196   /* XXX: relAddr not implemented */
1197   GLuint msg_reg_nr = 1;
1198   {
1199      struct brw_reg b;
1200      brw_push_insn_state(p);
1201      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1202      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1203      brw_set_mask_control(p, BRW_MASK_DISABLE);
1204
1205   /* Setup MRF[1] with location/offset into const buffer */
1206      b = brw_message_reg(msg_reg_nr);
1207      b = retype(b, BRW_REGISTER_TYPE_UD);
1208      /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
1209       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
1210       */
1211      brw_MOV(p, b, brw_imm_ud(location));
1212      brw_pop_insn_state(p);
1213   }
1214
1215   {
1216      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1217
1218      insn->header.predicate_control = BRW_PREDICATE_NONE;
1219      insn->header.compression_control = BRW_COMPRESSION_NONE;
1220      insn->header.destreg__conditionalmod = msg_reg_nr;
1221      insn->header.mask_control = BRW_MASK_DISABLE;
1222
1223      /* cast dest to a uword[8] vector */
1224      dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
1225
1226      brw_set_dest(insn, dest);
1227      brw_set_src0(insn, brw_null_reg());
1228
1229      brw_set_dp_read_message(p->brw,
1230			      insn,
1231			      bind_table_index,
1232			      0,  /* msg_control (0 means 1 Oword) */
1233			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1234			      0, /* source cache = data cache */
1235			      1, /* msg_length */
1236			      1, /* response_length (1 Oword) */
1237			      0); /* eot */
1238   }
1239}
1240
1241
1242/**
1243 * Read float[4] constant(s) from VS constant buffer.
1244 * For relative addressing, two float[4] constants will be read into 'dest'.
1245 * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
1246 */
1247void brw_dp_READ_4_vs(struct brw_compile *p,
1248                      struct brw_reg dest,
1249                      GLuint location,
1250                      GLuint bind_table_index)
1251{
1252   struct brw_instruction *insn;
1253   GLuint msg_reg_nr = 1;
1254   struct brw_reg b;
1255
1256   /*
1257   printf("vs const read msg, location %u, msg_reg_nr %d\n",
1258          location, msg_reg_nr);
1259   */
1260
1261   /* Setup MRF[1] with location/offset into const buffer */
1262   brw_push_insn_state(p);
1263   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1264   brw_set_mask_control(p, BRW_MASK_DISABLE);
1265   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1266
1267   /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
1268    * when the docs say only dword[2] should be set.  Hmmm.  But it works.
1269    */
1270   b = brw_message_reg(msg_reg_nr);
1271   b = retype(b, BRW_REGISTER_TYPE_UD);
1272   /*b = get_element_ud(b, 2);*/
1273   brw_MOV(p, b, brw_imm_ud(location));
1274
1275   brw_pop_insn_state(p);
1276
1277   insn = next_insn(p, BRW_OPCODE_SEND);
1278
1279   insn->header.predicate_control = BRW_PREDICATE_NONE;
1280   insn->header.compression_control = BRW_COMPRESSION_NONE;
1281   insn->header.destreg__conditionalmod = msg_reg_nr;
1282   insn->header.mask_control = BRW_MASK_DISABLE;
1283
1284   brw_set_dest(insn, dest);
1285   brw_set_src0(insn, brw_null_reg());
1286
1287   brw_set_dp_read_message(p->brw,
1288			   insn,
1289			   bind_table_index,
1290			   0,
1291			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
1292			   0, /* source cache = data cache */
1293			   1, /* msg_length */
1294			   1, /* response_length (1 Oword) */
1295			   0); /* eot */
1296}
1297
1298/**
1299 * Read a float[4] constant per vertex from VS constant buffer, with
1300 * relative addressing.
1301 */
1302void brw_dp_READ_4_vs_relative(struct brw_compile *p,
1303			       struct brw_reg dest,
1304			       struct brw_reg addr_reg,
1305			       GLuint offset,
1306			       GLuint bind_table_index)
1307{
1308   struct intel_context *intel = &p->brw->intel;
1309   int msg_type;
1310
1311   /* Setup MRF[1] with offset into const buffer */
1312   brw_push_insn_state(p);
1313   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1314   brw_set_mask_control(p, BRW_MASK_DISABLE);
1315   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1316
1317   /* M1.0 is block offset 0, M1.4 is block offset 1, all other
1318    * fields ignored.
1319    */
1320   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
1321	   addr_reg, brw_imm_d(offset));
1322   brw_pop_insn_state(p);
1323
1324   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1325
1326   insn->header.predicate_control = BRW_PREDICATE_NONE;
1327   insn->header.compression_control = BRW_COMPRESSION_NONE;
1328   insn->header.destreg__conditionalmod = 0;
1329   insn->header.mask_control = BRW_MASK_DISABLE;
1330
1331   brw_set_dest(insn, dest);
1332   brw_set_src0(insn, brw_vec8_grf(0, 0));
1333
1334   if (intel->gen == 6)
1335      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1336   else if (intel->gen == 5 || intel->is_g4x)
1337      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1338   else
1339      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
1340
1341   brw_set_dp_read_message(p->brw,
1342			   insn,
1343			   bind_table_index,
1344			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
1345			   msg_type,
1346			   0, /* source cache = data cache */
1347			   2, /* msg_length */
1348			   1, /* response_length */
1349			   0); /* eot */
1350}
1351
1352
1353
1354void brw_fb_WRITE(struct brw_compile *p,
1355		  int dispatch_width,
1356                  struct brw_reg dest,
1357                  GLuint msg_reg_nr,
1358                  struct brw_reg src0,
1359                  GLuint binding_table_index,
1360                  GLuint msg_length,
1361                  GLuint response_length,
1362                  GLboolean eot)
1363{
1364   struct intel_context *intel = &p->brw->intel;
1365   struct brw_instruction *insn;
1366   GLuint msg_control, msg_type;
1367
1368   insn = next_insn(p, BRW_OPCODE_SEND);
1369   insn->header.predicate_control = 0; /* XXX */
1370   insn->header.compression_control = BRW_COMPRESSION_NONE;
1371
1372   if (intel->gen >= 6) {
1373       /* headerless version, just submit color payload */
1374       src0 = brw_message_reg(msg_reg_nr);
1375
1376       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE_GEN6;
1377   } else {
1378      insn->header.destreg__conditionalmod = msg_reg_nr;
1379
1380      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
1381   }
1382
1383   if (dispatch_width == 16)
1384      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1385   else
1386      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1387
1388   brw_set_dest(insn, dest);
1389   brw_set_src0(insn, src0);
1390   brw_set_dp_write_message(p->brw,
1391			    insn,
1392			    binding_table_index,
1393			    msg_control,
1394			    msg_type,
1395			    msg_length,
1396			    1,	/* pixel scoreboard */
1397			    response_length,
1398			    eot,
1399			    0 /* send_commit_msg */);
1400}
1401
1402
1403/**
1404 * Texture sample instruction.
1405 * Note: the msg_type plus msg_length values determine exactly what kind
1406 * of sampling operation is performed.  See volume 4, page 161 of docs.
1407 */
1408void brw_SAMPLE(struct brw_compile *p,
1409		struct brw_reg dest,
1410		GLuint msg_reg_nr,
1411		struct brw_reg src0,
1412		GLuint binding_table_index,
1413		GLuint sampler,
1414		GLuint writemask,
1415		GLuint msg_type,
1416		GLuint response_length,
1417		GLuint msg_length,
1418		GLboolean eot,
1419		GLuint header_present,
1420		GLuint simd_mode)
1421{
1422   GLboolean need_stall = 0;
1423
1424   if (writemask == 0) {
1425      /*printf("%s: zero writemask??\n", __FUNCTION__); */
1426      return;
1427   }
1428
1429   /* Hardware doesn't do destination dependency checking on send
1430    * instructions properly.  Add a workaround which generates the
1431    * dependency by other means.  In practice it seems like this bug
1432    * only crops up for texture samples, and only where registers are
1433    * written by the send and then written again later without being
1434    * read in between.  Luckily for us, we already track that
1435    * information and use it to modify the writemask for the
1436    * instruction, so that is a guide for whether a workaround is
1437    * needed.
1438    */
1439   if (writemask != WRITEMASK_XYZW) {
1440      GLuint dst_offset = 0;
1441      GLuint i, newmask = 0, len = 0;
1442
1443      for (i = 0; i < 4; i++) {
1444	 if (writemask & (1<<i))
1445	    break;
1446	 dst_offset += 2;
1447      }
1448      for (; i < 4; i++) {
1449	 if (!(writemask & (1<<i)))
1450	    break;
1451	 newmask |= 1<<i;
1452	 len++;
1453      }
1454
1455      if (newmask != writemask) {
1456	 need_stall = 1;
1457         /* printf("need stall %x %x\n", newmask , writemask); */
1458      }
1459      else {
1460	 GLboolean dispatch_16 = GL_FALSE;
1461
1462	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
1463
1464	 guess_execution_size(p->current, dest);
1465	 if (p->current->header.execution_size == BRW_EXECUTE_16)
1466	    dispatch_16 = GL_TRUE;
1467
1468	 newmask = ~newmask & WRITEMASK_XYZW;
1469
1470	 brw_push_insn_state(p);
1471
1472	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1473	 brw_set_mask_control(p, BRW_MASK_DISABLE);
1474
1475	 brw_MOV(p, m1, brw_vec8_grf(0,0));
1476  	 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
1477
1478	 brw_pop_insn_state(p);
1479
1480  	 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
1481	 dest = offset(dest, dst_offset);
1482
1483	 /* For 16-wide dispatch, masked channels are skipped in the
1484	  * response.  For 8-wide, masked channels still take up slots,
1485	  * and are just not written to.
1486	  */
1487	 if (dispatch_16)
1488	    response_length = len * 2;
1489      }
1490   }
1491
1492   {
1493      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1494
1495      insn->header.predicate_control = 0; /* XXX */
1496      insn->header.compression_control = BRW_COMPRESSION_NONE;
1497      insn->header.destreg__conditionalmod = msg_reg_nr;
1498
1499      brw_set_dest(insn, dest);
1500      brw_set_src0(insn, src0);
1501      brw_set_sampler_message(p->brw, insn,
1502			      binding_table_index,
1503			      sampler,
1504			      msg_type,
1505			      response_length,
1506			      msg_length,
1507			      eot,
1508			      header_present,
1509			      simd_mode);
1510   }
1511
1512   if (need_stall) {
1513      struct brw_reg reg = vec8(offset(dest, response_length-1));
1514
1515      /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
1516       */
1517      brw_push_insn_state(p);
1518      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1519      brw_MOV(p, reg, reg);
1520      brw_pop_insn_state(p);
1521   }
1522
1523}
1524
1525/* All these variables are pretty confusing - we might be better off
1526 * using bitmasks and macros for this, in the old style.  Or perhaps
1527 * just having the caller instantiate the fields in dword3 itself.
1528 */
1529void brw_urb_WRITE(struct brw_compile *p,
1530		   struct brw_reg dest,
1531		   GLuint msg_reg_nr,
1532		   struct brw_reg src0,
1533		   GLboolean allocate,
1534		   GLboolean used,
1535		   GLuint msg_length,
1536		   GLuint response_length,
1537		   GLboolean eot,
1538		   GLboolean writes_complete,
1539		   GLuint offset,
1540		   GLuint swizzle)
1541{
1542   struct intel_context *intel = &p->brw->intel;
1543   struct brw_instruction *insn;
1544
1545   /* Sandybridge doesn't have the implied move for SENDs,
1546    * and the first message register index comes from src0.
1547    */
1548   if (intel->gen >= 6) {
1549      brw_push_insn_state(p);
1550      brw_set_mask_control( p, BRW_MASK_DISABLE );
1551      brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
1552      brw_pop_insn_state(p);
1553      src0 = brw_message_reg(msg_reg_nr);
1554   }
1555
1556   insn = next_insn(p, BRW_OPCODE_SEND);
1557
1558   assert(msg_length < BRW_MAX_MRF);
1559
1560   brw_set_dest(insn, dest);
1561   brw_set_src0(insn, src0);
1562   brw_set_src1(insn, brw_imm_d(0));
1563
1564   if (intel->gen < 6)
1565      insn->header.destreg__conditionalmod = msg_reg_nr;
1566
1567   brw_set_urb_message(p->brw,
1568		       insn,
1569		       allocate,
1570		       used,
1571		       msg_length,
1572		       response_length,
1573		       eot,
1574		       writes_complete,
1575		       offset,
1576		       swizzle);
1577}
1578
1579void brw_ff_sync(struct brw_compile *p,
1580		   struct brw_reg dest,
1581		   GLuint msg_reg_nr,
1582		   struct brw_reg src0,
1583		   GLboolean allocate,
1584		   GLuint response_length,
1585		   GLboolean eot)
1586{
1587   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
1588
1589   brw_set_dest(insn, dest);
1590   brw_set_src0(insn, src0);
1591   brw_set_src1(insn, brw_imm_d(0));
1592
1593   insn->header.destreg__conditionalmod = msg_reg_nr;
1594
1595   brw_set_ff_sync_message(p->brw,
1596			   insn,
1597			   allocate,
1598			   response_length,
1599			   eot);
1600}
1601