brw_fs_visitor.cpp revision e592f7df0361eb8b5c75944f0151c4e6b3f839dd
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_visitor.cpp
 *
 * This file supports generating the FS LIR from the GLSL IR.  The LIR
 * makes it easier to do backend-specific optimizations than doing so
 * in the GLSL IR or in the native code.
 */
extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_shader.h"
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"
#include "glsl/ir_print_visitor.h"

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   } else if (ir->mode == ir_var_out) {
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

      if (ir->index > 0) {
         assert(ir->location == FRAG_RESULT_DATA0);
         assert(ir->index == 1);
         this->dual_src_output = *reg;
      } else if (ir->location == FRAG_RESULT_COLOR) {
         /* Writing gl_FragColor outputs to all color regions. */
         for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
            this->outputs[i] = *reg;
            this->output_components[i] = 4;
         }
      } else if (ir->location == FRAG_RESULT_DEPTH) {
         this->frag_depth = ir;
      } else {
         /* gl_FragData or a user-defined FS output */
         assert(ir->location >= FRAG_RESULT_DATA0 &&
                ir->location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);

         int vector_elements =
            ir->type->is_array() ? ir->type->fields.array->vector_elements
                                 : ir->type->vector_elements;

         /* General color output. */
         for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
            int output = ir->location - FRAG_RESULT_DATA0 + i;
            this->outputs[output] = *reg;
            this->outputs[output].reg_offset += vector_elements * i;
            this->output_components[output] = vector_elements;
         }
      }
   } else if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       */
      if (ir->uniform_block != -1)
         return;

      if (c->dispatch_width == 16) {
         if (!variable_storage(ir)) {
            fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
         }
         return;
      }

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM || this->result.file == GRF);
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();

   sat_val->accept(this);
   fs_reg src = this->result;

   fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();

   /* If the last instruction from our accept() didn't generate our
    * src, generate a saturated MOV.
    */
   fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
   if (!modify || modify->regs_written() != 1) {
      fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
      inst->saturate = true;
   } else {
      modify->saturate = true;
      this->result = src;
   }

   return true;
}
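
/* Illustrative example (not part of this file): in GLSL,
 *
 *    color.x = clamp(a * b, 0.0, 1.0);
 *
 * lowers to a min(max(...)) tree, and rather than emitting the MUL followed
 * by separate MIN/MAX instructions, the fold above just sets the saturate
 * bit on the instruction that produced the value:
 *
 *    mul.sat dst, a, b
 *
 * The mnemonics are a sketch of the idea, not generated output.
 */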

bool
fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
{
   /* 3-src instructions were introduced in gen6. */
   if (intel->gen < 6)
      return false;

   /* MAD can only handle floating-point data. */
   if (ir->type != glsl_type::float_type)
      return false;

   ir_rvalue *nonmul = ir->operands[1 - mul_arg];
   ir_expression *mul = ir->operands[mul_arg]->as_expression();

   if (!mul || mul->operation != ir_binop_mul)
      return false;

   if (nonmul->as_constant() ||
       mul->operands[0]->as_constant() ||
       mul->operands[1]->as_constant())
      return false;

   nonmul->accept(this);
   fs_reg src0 = this->result;

   mul->operands[0]->accept(this);
   fs_reg src1 = this->result;

   mul->operands[1]->accept(this);
   fs_reg src2 = this->result;

   this->result = fs_reg(this, ir->type);
   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);

   return true;
}
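
/* Illustrative example (not from this file): for a float expression
 * a + b * c, visit(ir_expression) sees an ir_binop_add whose operand is an
 * ir_binop_mul, so instead of
 *
 *    mul tmp, b, c
 *    add dst, a, tmp
 *
 * a single three-source instruction is emitted:
 *
 *    mad dst, a, b, c
 *
 * Constant operands are rejected above because the 3-src encoding can't
 * take immediates.
 */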

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;
   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
         return;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         ir_print_visitor v;
         fail("Failed to get tree for expression operand:\n");
         ir->operands[operand]->accept(&v);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * the one's complement of the whole register, not just bit 0.
       */
      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
      inst->predicated = true;

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      if (ir->type->is_integer()) {
         /* For integer multiplication, the MUL uses the low 16 bits
          * of one of the operands (src0 on gen6, src1 on gen7).  The
          * MACH accumulates in the contribution of the upper 16 bits
          * of that operand.
          *
          * FINISHME: Emit just the MUL if we know an operand is small
          * enough.
          */
         if (intel->gen >= 7 && c->dispatch_width == 16)
            fail("16-wide explicit accumulator operands unsupported\n");

         struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);

         emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
         emit(BRW_OPCODE_MACH, reg_null_d, op[0], op[1]);
         emit(BRW_OPCODE_MOV, this->result, fs_reg(acc));
      } else {
         emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
      }
      break;
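
   /* Illustrative sketch (not generated output): a 32-bit integer multiply
    * on gen6+ becomes the three-instruction sequence
    *
    *    mul  acc0, a, b     // partial product from the low 16 bits
    *    mach null, a, b     // folds in that operand's high 16 bits
    *    mov  dst, acc0      // full 32-bit product lands in the accumulator
    */
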
   case ir_binop_div:
      if (intel->gen >= 7 && c->dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");

      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
      break;
   case ir_binop_mod:
      if (intel->gen >= 7 && c->dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");

      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* The original gen4 does an implicit conversion before comparison. */
      if (intel->gen < 5)
         temp.type = op[0].type;

      resolve_ud_negate(&op[0]);
      resolve_ud_negate(&op[1]);

      resolve_bool_comparison(ir->operands[0], &op[0]);
      resolve_bool_comparison(ir->operands[1], &op[1]);

      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      break;

   case ir_binop_logic_xor:
      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_or:
      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_and:
      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      op[0].type = BRW_REGISTER_TYPE_F;
      this->result = op[0];
      break;
   case ir_unop_i2u:
   case ir_unop_bitcast_f2u:
      op[0].type = BRW_REGISTER_TYPE_UD;
      this->result = op[0];
      break;
   case ir_unop_u2i:
   case ir_unop_bitcast_f2i:
      op[0].type = BRW_REGISTER_TYPE_D;
      this->result = op[0];
      break;
   case ir_unop_i2f:
   case ir_unop_u2f:
   case ir_unop_f2i:
   case ir_unop_f2u:
      emit(BRW_OPCODE_MOV, this->result, op[0]);
      break;

   case ir_unop_b2i:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_b2f:
      temp = fs_reg(this, glsl_type::int_type);
      emit(BRW_OPCODE_AND, temp, op[0], fs_reg(1));
      emit(BRW_OPCODE_MOV, this->result, temp);
      break;

   case ir_unop_f2b:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;
   case ir_unop_i2b:
      assert(op[0].type == BRW_REGISTER_TYPE_D);

      inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;

   case ir_unop_trunc:
      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      break;
   case ir_unop_fract:
      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
      break;
   case ir_unop_round_even:
      emit(BRW_OPCODE_RNDE, this->result, op[0]);
      break;

   case ir_binop_min:
      resolve_ud_negate(&op[0]);
      resolve_ud_negate(&op[1]);

      if (intel->gen >= 6) {
         inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_L;
      } else {
         /* Unalias the destination */
         this->result = fs_reg(this, ir->type);

         inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_L;

         inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
         inst->predicated = true;
      }
      break;
   case ir_binop_max:
      resolve_ud_negate(&op[0]);
      resolve_ud_negate(&op[1]);

      if (intel->gen >= 6) {
         inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_GE;
      } else {
         /* Unalias the destination */
         this->result = fs_reg(this, ir->type);

         inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_G;

         inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
         inst->predicated = true;
      }
      break;

   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
      break;
   case ir_binop_bit_and:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_xor:
      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_or:
      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_lshift:
      inst = emit(BRW_OPCODE_SHL, this->result, op[0], op[1]);
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
         inst = emit(BRW_OPCODE_ASR, this->result, op[0], op[1]);
      else
         inst = emit(BRW_OPCODE_SHR, this->result, op[0], op[1]);
      break;

   case ir_binop_ubo_load:
      ir_constant *uniform_block = ir->operands[0]->as_constant();
      ir_constant *offset = ir->operands[1]->as_constant();

      fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
      packed_consts.type = result.type;
      fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_WM_UBO(uniform_block->value.u[0]));
      fs_inst *pull = emit(fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                   packed_consts,
                                   surf_index,
                                   fs_reg(offset->value.u[0])));
      pull->base_mrf = 14;
      pull->mlen = 1;

      packed_consts.smear = offset->value.u[0] % 16 / 4;
      for (int i = 0; i < ir->type->vector_elements; i++) {
         /* UBO bools are any nonzero value.  We consider bools to be
          * values with the low bit set to 1.  Convert them using CMP.
          */
         if (ir->type->base_type == GLSL_TYPE_BOOL) {
            fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, result,
                                         packed_consts, fs_reg(0u)));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         } else {
            emit(fs_inst(BRW_OPCODE_MOV, result, packed_consts));
         }

         packed_consts.smear++;
         result.reg_offset++;

         /* The std140 packing rules don't allow vectors to cross 16-byte
          * boundaries, and a reg is 32 bytes.
          */
         assert(packed_consts.smear < 8);
      }
      result.reg_offset = 0;
      break;
   }
}
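
/* A worked example of the ir_binop_ubo_load path above (illustrative, not
 * from this file): loading a vec2 at std140 byte offset 24 pulls the
 * 16-byte-aligned block containing it into packed_consts, then
 *
 *    smear = 24 % 16 / 4 = 2
 *
 * so the two components are copied out of dwords 2 and 3 of the pulled
 * register, bumping result.reg_offset once per component.
 */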

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         if (predicated || !l.equals(r)) {
            fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
            inst->predicated = predicated;
         }

         l.reg_offset++;
         r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
bool
fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
                                   fs_reg dst,
                                   fs_reg src,
                                   fs_inst *pre_rhs_inst,
                                   fs_inst *last_rhs_inst)
{
   /* Only attempt if we're doing a direct assignment. */
   if (ir->condition ||
       !(ir->lhs->type->is_scalar() ||
        (ir->lhs->type->is_vector() &&
         ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
      return false;

   /* Make sure the last instruction generated our source reg. */
   fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
                                                    last_rhs_inst,
                                                    src);
   if (!modify)
      return false;

   /* If last_rhs_inst wrote a different number of components than our LHS,
    * we can't safely rewrite it.
    */
   if (ir->lhs->type->vector_elements != modify->regs_written())
      return false;

   /* Success!  Rewrite the instruction. */
   modify->dst = dst;

   return true;
}
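
/* Illustrative example (not from this file): for "x = a + b;" the RHS visit
 * emits
 *
 *    add tmp, a, b
 *
 * and instead of following it with "mov x, tmp", the ADD's destination is
 * rewritten in place to
 *
 *    add x, a, b
 */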

void
fs_visitor::visit(ir_assignment *ir)
{
   fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();

   ir->rhs->accept(this);
   r = this->result;

   fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
      return;

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(BRW_OPCODE_MOV, l, r);
            if (ir->condition)
               inst->predicated = true;
            r.reg_offset++;
         }
         l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg dPdy,
                              int sampler)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
         mlen++;
      } else if (ir->op == ir_txb || ir->op == ir_txl) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
         mlen++;
      } else {
         assert(!"Should not get here.");
      }

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      fs_reg &dPdx = lod;

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* the slots for u and v are always present, but r is optional */
      mlen += MAX2(ir->coordinate->type->vector_elements, 2);

      /*  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdx);
         dPdx.reg_offset++;
      }
      mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);

      for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdy);
         dPdy.reg_offset++;
      }
      mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
   } else if (ir->op == ir_txs) {
      /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
      simd16 = true;
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
      mlen += 2;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      simd16 = true;
      assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
              coordinate);
         coordinate.reg_offset++;
      }

      /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
       * be necessary for TXF (ld), but seems wise to do for all messages.
       */
      for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, lod.type), lod);
      mlen++;

      /* The unused upper half. */
      mlen++;
   }

   if (simd16) {
      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      orig_dst = dst;
      const glsl_type *vec_type =
         glsl_type::get_instance(ir->type->base_type, 4, 1);
      dst = fs_reg(this, glsl_type::get_array_instance(vec_type, 2));
      dst.type = intel->is_g4x ? brw_type_for_base_type(ir->type)
                               : BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(SHADER_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(SHADER_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(SHADER_OPCODE_TXD, dst);
      break;
   case ir_txs:
      inst = emit(SHADER_OPCODE_TXS, dst);
      break;
   case ir_txf:
      inst = emit(SHADER_OPCODE_TXF, dst);
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;
   inst->header_present = true;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
         emit(BRW_OPCODE_MOV, orig_dst, dst);
         orig_dst.reg_offset++;
         dst.reg_offset += 2;
      }
   }

   return inst;
}
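
/* An illustrative reading of the SIMD16 workaround above (drawn from the
 * code, not from dumped hardware state): the SIMD16 return occupies eight
 * registers -- x0, x1, y0, y1, z0, z1, w0, w1 -- where the odd ones hold
 * the upper eight channels that were never dispatched, so the final loop
 * copies registers 0, 2, 4, 6 into the real destination.
 */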

/* gen5's sampler has slots for u, v, r, array index, then optional
 * parameters like shadow comparitor or LOD bias.  If the optional
 * parameters aren't present, the corresponding base slots can also be
 * omitted from the message.
 *
 * We don't fill in the unnecessary slots regardless, which may look
 * surprising in the disassembly.
 */
fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg lod2,
                              int sampler)
{
   int mlen = 0;
   int base_mrf = 2;
   int reg_width = c->dispatch_width / 8;
   bool header_present = false;
   const int vector_elements =
      ir->coordinate ? ir->coordinate->type->vector_elements : 0;

   if (ir->offset != NULL && ir->op == ir_txf) {
      /* It appears that the ld instruction used for txf does its
       * address bounds check before adding in the offset.  To work
       * around this, just add the integer offset to the integer texel
       * coordinate, and don't put the offset in the header.
       */
      ir_constant *offset = ir->offset->as_constant();
      for (int i = 0; i < vector_elements; i++) {
         emit(BRW_OPCODE_ADD,
              fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
              coordinate,
              offset->value.i[i]);
         coordinate.reg_offset++;
      }
   } else {
      if (ir->offset) {
         /* The offsets set up by the ir_texture visitor are in the
          * m1 header, so we can't go headerless.
          */
         header_present = true;
         mlen++;
         base_mrf--;
      }

      for (int i = 0; i < vector_elements; i++) {
         emit(BRW_OPCODE_MOV,
              fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
              coordinate);
         coordinate.reg_offset++;
      }
   }
   mlen += vector_elements * reg_width;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, header_present + 4 * reg_width);

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
      mlen += reg_width;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(SHADER_OPCODE_TEX, dst);
      break;
   case ir_txb:
      mlen = MAX2(mlen, header_present + 4 * reg_width);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
      mlen += reg_width;

      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      mlen = MAX2(mlen, header_present + 4 * reg_width);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
      mlen += reg_width;

      inst = emit(SHADER_OPCODE_TXL, dst);
      break;
   case ir_txd: {
      mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */

      /*  P   =  u,    v,    r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
         lod.reg_offset++;
         mlen += reg_width;

         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
         lod2.reg_offset++;
         mlen += reg_width;
      }

      inst = emit(SHADER_OPCODE_TXD, dst);
      break;
   }
   case ir_txs:
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
      mlen += reg_width;
      inst = emit(SHADER_OPCODE_TXS, dst);
      break;
   case ir_txf:
      mlen = header_present + 4 * reg_width;

      emit(BRW_OPCODE_MOV,
           fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD),
           lod);
      inst = emit(SHADER_OPCODE_TXF, dst);
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;
   inst->header_present = header_present;

   if (mlen > 11) {
      fail("Message length >11 disallowed by hardware\n");
   }

   return inst;
}
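
/* A sketch of the resulting SIMD8 message for a texture2D() lookup with
 * bias, showing how mlen is built up (illustrative, not dumped output):
 *
 *    m2: u    m3: v    m4: (unfilled r)    m5: (unfilled ai)    m6: bias
 *
 * mlen ends up at 5 payload registers (plus the header when an offset is
 * present); the MAX2() calls are what skip the unused r/ai slots forward
 * to the bias position.
 */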

fs_inst *
fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg lod2,
                              int sampler)
{
   int mlen = 0;
   int base_mrf = 2;
   int reg_width = c->dispatch_width / 8;
   bool header_present = false;
   int offsets[3];

   if (ir->offset && ir->op != ir_txf) {
      /* The offsets set up by the ir_texture visitor are in the
       * m1 header, so we can't go headerless.
       */
      header_present = true;
      mlen++;
      base_mrf--;
   }

   if (ir->shadow_comparitor) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
      mlen += reg_width;
   }

   /* Set up the LOD info */
   switch (ir->op) {
   case ir_tex:
      break;
   case ir_txb:
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
      mlen += reg_width;
      break;
   case ir_txl:
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
      mlen += reg_width;
      break;
   case ir_txd: {
      if (c->dispatch_width == 16)
         fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
         coordinate.reg_offset++;
         mlen += reg_width;

         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
         lod.reg_offset++;
         mlen += reg_width;

         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
         lod2.reg_offset++;
         mlen += reg_width;
      }
      break;
   }
   case ir_txs:
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
      mlen += reg_width;
      break;
   case ir_txf:
      /* It appears that the ld instruction used for txf does its
       * address bounds check before adding in the offset.  To work
       * around this, just add the integer offset to the integer texel
       * coordinate, and don't put the offset in the header.
       */
      if (ir->offset) {
         ir_constant *offset = ir->offset->as_constant();
         offsets[0] = offset->value.i[0];
         offsets[1] = offset->value.i[1];
         offsets[2] = offset->value.i[2];
      } else {
         memset(offsets, 0, sizeof(offsets));
      }

      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
      emit(BRW_OPCODE_ADD,
           fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[0]);
      coordinate.reg_offset++;
      mlen += reg_width;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), lod);
      mlen += reg_width;

      for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_ADD,
              fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[i]);
         coordinate.reg_offset++;
         mlen += reg_width;
      }
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (ir->op != ir_txd && ir->op != ir_txs && ir->op != ir_txf) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
         coordinate.reg_offset++;
         mlen += reg_width;
      }
   }

   /* Generate the SEND */
   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
   case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
   case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
   case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
   case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
   case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;
   inst->header_present = header_present;

   if (mlen > 11) {
      fail("Message length >11 disallowed by hardware\n");
   }

   return inst;
}

/**
 * Emit code to produce the coordinates for a texture lookup.
 *
 * Returns the fs_reg containing the texture coordinate (as opposed to
 * setting this->result).
 */
fs_reg
fs_visitor::emit_texcoord(ir_texture *ir, int sampler)
{
   fs_inst *inst = NULL;

   if (!ir->coordinate)
      return fs_reg(); /* Return the default BAD_FILE register. */

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   bool needs_gl_clamp = true;

   fs_reg scale_x, scale_y;

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT &&
       (intel->gen < 6 ||
        (intel->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
                             c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
         STATE_INTERNAL,
         STATE_TEXRECT_SCALE,
         sampler,
         0,
         0
      };

      if (c->dispatch_width == 16) {
         fail("rectangle scale uniform setup not supported on 16-wide\n");
         return fs_reg(this, ir->type);
      }

      scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);

      GLuint index = _mesa_add_state_reference(params,
                                               (gl_state_index *)tokens);

      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 0;
      c->prog_data.nr_params++;
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 1;
      c->prog_data.nr_params++;
   }

   /* Apply the rectangle scaling set up above; on gen4-5 the EU has to do
    * the coordinate normalization itself.
    */
   if (intel->gen < 6 &&
       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(BRW_OPCODE_MUL, dst, src, scale_x);
      dst.reg_offset++;
      src.reg_offset++;
      emit(BRW_OPCODE_MUL, dst, src, scale_y);
   } else if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      /* On gen6+, the sampler handles the rectangle coordinates
       * natively, without needing rescaling.  But that means we have
       * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
       * not [0, 1] like the default case below.
       */
      needs_gl_clamp = false;

      for (int i = 0; i < 2; i++) {
         if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
            fs_reg chan = coordinate;
            chan.reg_offset += i;

            inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
            inst->conditional_mod = BRW_CONDITIONAL_G;

            /* Our parameter comes in as 1.0/width or 1.0/height,
             * because that's what people normally want for doing
             * texture rectangle handling.  We need width or height
             * for clamping, but we don't care enough to make a new
             * parameter type, so just invert back.
             */
            fs_reg limit = fs_reg(this, glsl_type::float_type);
            emit(BRW_OPCODE_MOV, limit, i == 0 ? scale_x : scale_y);
            emit(SHADER_OPCODE_RCP, limit, limit);

            inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
            inst->conditional_mod = BRW_CONDITIONAL_L;
         }
      }
   }

   if (ir->coordinate && needs_gl_clamp) {
      for (unsigned int i = 0;
           i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
         if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
            fs_reg chan = coordinate;
            chan.reg_offset += i;

            fs_inst *inst = emit(BRW_OPCODE_MOV, chan, chan);
            inst->saturate = true;
         }
      }
   }
   return coordinate;
}
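
/* Worked example of the gen4-5 rectangle path above (illustrative): for a
 * 640x480 sampler2DRect, the state tracker supplies 1/640 and 1/480 as the
 * two TEXRECT_SCALE uniforms, so a texel coordinate like (320, 240) is
 * multiplied down to the (0.5, 0.5) normalized form the sampler expects.
 */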

void
fs_visitor::visit(ir_texture *ir)
{
   fs_inst *inst = NULL;

   int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &fp->Base);
   sampler = fp->Base.SamplerUnits[sampler];

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Generate code to compute all the subexpression trees.  This has to be
    * done before loading any values into MRFs for the sampler message since
    * generating these values may involve SEND messages that need the MRFs.
    */
   fs_reg coordinate = emit_texcoord(ir, sampler);

   fs_reg shadow_comparitor;
   if (ir->shadow_comparitor) {
      ir->shadow_comparitor->accept(this);
      shadow_comparitor = this->result;
   }

   fs_reg lod, lod2;
   switch (ir->op) {
   case ir_tex:
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      lod = this->result;
      break;
   case ir_txd:
      ir->lod_info.grad.dPdx->accept(this);
      lod = this->result;

      ir->lod_info.grad.dPdy->accept(this);
      lod2 = this->result;
      break;
   case ir_txf:
   case ir_txl:
   case ir_txs:
      ir->lod_info.lod->accept(this);
      lod = this->result;
      break;
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));

   if (intel->gen >= 7) {
      inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
                               lod, lod2, sampler);
   } else if (intel->gen >= 5) {
      inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
                               lod, lod2, sampler);
   } else {
      inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
                               lod, lod2, sampler);
   }

   /* The header is set up by generate_tex() when necessary. */
   inst->src[0] = reg_undef;

   if (ir->offset != NULL && ir->op != ir_txf)
      inst->texture_offset = brw_texture_offset(ir->offset->as_constant());

   inst->sampler = sampler;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   swizzle_result(ir, dst, sampler);
}

/**
 * Swizzle the result of a texture lookup.  This is necessary for
 * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
 */
void
fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
{
   this->result = orig_val;

   if (ir->op == ir_txs)
      return;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
      fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
         fs_reg l = swizzled_result;
         l.reg_offset += i;

         if (swiz == SWIZZLE_ZERO) {
            emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
         } else if (swiz == SWIZZLE_ONE) {
            emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
         } else {
            fs_reg r = orig_val;
            r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
            emit(BRW_OPCODE_MOV, l, r);
         }
      }
      this->result = swizzled_result;
   }
}
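
/* Illustrative example (not from this file): with a GL_LUMINANCE_ALPHA-style
 * swizzle of (R, R, R, G), the loop above emits
 *
 *    mov swizzled.x, orig.x
 *    mov swizzled.y, orig.x
 *    mov swizzled.z, orig.x
 *    mov swizzled.w, orig.y
 *
 * while SWIZZLE_ZERO/SWIZZLE_ONE components become immediate MOVs instead.
 */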

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
         swiz = ir->mask.x;
         break;
      case 1:
         swiz = ir->mask.y;
         break;
      case 2:
         swiz = ir->mask.z;
         break;
      case 3:
         swiz = ir->mask.w;
         break;
      }

      channel.reg_offset += swiz;
      emit(BRW_OPCODE_MOV, result, channel);
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   assert(ir->condition == NULL); /* FINISHME */

   emit(FS_OPCODE_DISCARD);
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
         ir->array_elements[i]->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(BRW_OPCODE_MOV, dst_reg, src_reg);
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
         ir_constant *const field = (ir_constant *) node;
         const unsigned size = type_size(field->type);

         field->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(BRW_OPCODE_MOV, dst_reg, src_reg);
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
         switch (ir->type->base_type) {
         case GLSL_TYPE_FLOAT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
            break;
         case GLSL_TYPE_UINT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
            break;
         case GLSL_TYPE_INT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
            break;
         case GLSL_TYPE_BOOL:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
            break;
         default:
            assert(!"Non-float/uint/int/bool constant");
         }
         dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;

         resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
      case ir_binop_logic_or:
      case ir_binop_logic_and:
         goto out;

      case ir_unop_f2b:
         if (intel->gen >= 6) {
            inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
         } else {
            inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
         }
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_i2b:
         if (intel->gen >= 6) {
            inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
         } else {
            inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
         }
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         resolve_bool_comparison(expr->operands[0], &op[0]);
         resolve_bool_comparison(expr->operands[1], &op[1]);

         inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
         inst->conditional_mod =
            brw_conditional_for_comparison(expr->operation);
         break;

      default:
         assert(!"not reached");
         fail("bad cond code\n");
         break;
      }
      return;
   }

out:
   ir->accept(this);

   fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}
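
/* Illustrative example (not from this file): for "if (a < b)" on gen4-5,
 * the condition is evaluated straight into the flag register with
 *
 *    cmp.l.f0 null, a, b
 *
 * and the IF that follows is simply predicated on f0, so no boolean value
 * ever lands in a GRF.
 */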

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         return;

      case ir_binop_logic_xor:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_logic_or:
         temp = fs_reg(this, glsl_type::bool_type);
         emit(BRW_OPCODE_OR, temp, op[0], op[1]);
         inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_logic_and:
         temp = fs_reg(this, glsl_type::bool_type);
         emit(BRW_OPCODE_AND, temp, op[0], op[1]);
         inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_f2b:
         inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_i2b:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
         inst->conditional_mod =
            brw_conditional_for_comparison(expr->operation);
         return;
      default:
         assert(!"not reached");
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         fail("bad condition\n");
         return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}
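
/* Illustrative contrast with the gen4-5 path (a sketch, not dumped output):
 * on gen6, "if (a < b)" folds the comparison into the flow-control
 * instruction itself,
 *
 *    if.l null, a, b
 *
 * instead of a separate CMP followed by a predicated IF.
 */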

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   if (intel->gen < 6 && c->dispatch_width == 16) {
      fail("Can't support (non-uniform) control flow on 16-wide\n");
   }

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen == 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(BRW_OPCODE_IF);
      inst->predicated = true;
   }

   foreach_list(node, &ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)node;
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(BRW_OPCODE_ELSE);

      foreach_list(node, &ir->else_instructions) {
         ir_instruction *ir = (ir_instruction *)node;
         this->base_ir = ir;

         ir->accept(this);
      }
   }

   emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (intel->gen < 6 && c->dispatch_width == 16) {
      fail("Can't support (non-uniform) control flow on 16-wide\n");
   }

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(BRW_OPCODE_MOV, counter, this->result);
      }
   }

   this->base_ir = NULL;
   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(BRW_OPCODE_BREAK);
      inst->predicated = true;
   }

   foreach_list(node, &ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)node;

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(BRW_OPCODE_ADD, counter, counter, this->result);
   }

   this->base_ir = NULL;
   emit(BRW_OPCODE_WHILE);
}
1772
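/* Loop jumps map directly onto the hardware's structured BREAK and
 * CONTINUE instructions; they're only meaningful inside the DO/WHILE
 * block emitted by visit(ir_loop) above.
 */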
1773void
1774fs_visitor::visit(ir_loop_jump *ir)
1775{
1776   switch (ir->mode) {
1777   case ir_loop_jump::jump_break:
1778      emit(BRW_OPCODE_BREAK);
1779      break;
1780   case ir_loop_jump::jump_continue:
1781      emit(BRW_OPCODE_CONTINUE);
1782      break;
1783   }
1784}
1785
1786void
1787fs_visitor::visit(ir_call *ir)
1788{
1789   assert(!"FINISHME");
1790}
1791
1792void
1793fs_visitor::visit(ir_return *ir)
1794{
1795   assert(!"FINISHME");
1796}
1797
1798void
1799fs_visitor::visit(ir_function *ir)
1800{
1801   /* Ignore function bodies other than main() -- we shouldn't see calls to
1802    * them since they should all be inlined by the time we get here.
1803    */
1804   if (strcmp(ir->name, "main") == 0) {
1805      const ir_function_signature *sig;
1806      exec_list empty;
1807
1808      sig = ir->matching_signature(&empty);
1809
1810      assert(sig);
1811
1812      foreach_list(node, &sig->body) {
1813	 ir_instruction *ir = (ir_instruction *)node;
1814	 this->base_ir = ir;
1815
1816	 ir->accept(this);
1817      }
1818   }
1819}
1820
1821void
1822fs_visitor::visit(ir_function_signature *ir)
1823{
1824   assert(!"not reached");
1825   (void)ir;
1826}
1827
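/* All instruction emission funnels through this overload: the fs_inst is
 * copied into ralloc'd storage, tagged with the current annotation and
 * source IR for debug dumps, and appended to the instruction stream.  The
 * force_uncompressed/force_sechalf stacks let callers (such as
 * emit_color_write() below) temporarily mark emitted instructions as
 * operating on only the first or second half of a 16-wide dispatch.
 */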
1828fs_inst *
1829fs_visitor::emit(fs_inst inst)
1830{
1831   fs_inst *list_inst = new(mem_ctx) fs_inst;
1832   *list_inst = inst;
1833
1834   if (force_uncompressed_stack > 0)
1835      list_inst->force_uncompressed = true;
1836   else if (force_sechalf_stack > 0)
1837      list_inst->force_sechalf = true;
1838
1839   list_inst->annotation = this->current_annotation;
1840   list_inst->ir = this->base_ir;
1841
1842   this->instructions.push_tail(list_inst);
1843
1844   return list_inst;
1845}
1846
1847/** Emits a dummy fragment shader that outputs solid magenta, for bringup purposes. */
1848void
1849fs_visitor::emit_dummy_fs()
1850{
1851   int reg_width = c->dispatch_width / 8;
1852
1853   /* Everyone's favorite color. */
1854   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f));
1855   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f));
1856   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f));
1857   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f));
1858
1859   fs_inst *write;
1860   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1861   write->base_mrf = 2;
1862   write->mlen = 4 * reg_width;
1863   write->eot = true;
1864}
1865
1866/* The register location here is relative to the start of the URB
1867 * data.  It will get adjusted to be a real location before
1868 * generate_code() time.
1869 */
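/* Worked example with a hypothetical slot assignment: if
 * urb_setup[location] == 3 and channel == 2, this returns
 * brw_vec1_grf(3 * 2 + 2 / 2, (2 & 1) * 4) == brw_vec1_grf(7, 0).
 */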
1870struct brw_reg
1871fs_visitor::interp_reg(int location, int channel)
1872{
1873   int regnr = urb_setup[location] * 2 + channel / 2;
1874   int stride = (channel & 1) * 4;
1875
1876   assert(urb_setup[location] != -1);
1877
1878   return brw_vec1_grf(regnr, stride);
1879}
1880
1881/** Emits the interpolation for the varying inputs. */
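/* Note on the PLN path below: PLN consumes the pixel deltas as an
 * adjacent two-register source, so when the hardware has it we allocate
 * delta_x as a vec2 and alias delta_y to its second register.  Without
 * PLN, LINTERP is later lowered to a LINE+MAC pair and the two deltas
 * may live in unrelated registers.
 */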
1882void
1883fs_visitor::emit_interpolation_setup_gen4()
1884{
1885   this->current_annotation = "compute pixel centers";
1886   this->pixel_x = fs_reg(this, glsl_type::uint_type);
1887   this->pixel_y = fs_reg(this, glsl_type::uint_type);
1888   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1889   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1890
1891   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
1892   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
1893
1894   this->current_annotation = "compute pixel deltas from v0";
1895   if (brw->has_pln) {
1896      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1897         fs_reg(this, glsl_type::vec2_type);
1898      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1899         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
1900      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
1901   } else {
1902      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1903         fs_reg(this, glsl_type::float_type);
1904      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1905         fs_reg(this, glsl_type::float_type);
1906   }
1907   emit(BRW_OPCODE_ADD, this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1908	this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
1909   emit(BRW_OPCODE_ADD, this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1910	this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
1911
1912   this->current_annotation = "compute pos.w and 1/pos.w";
1913   /* Compute wpos.w.  It's always in our setup, since it's needed to
1914    * interpolate the other attributes.
1915    */
1916   this->wpos_w = fs_reg(this, glsl_type::float_type);
1917   emit(FS_OPCODE_LINTERP, wpos_w,
1918        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1919        this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1920	interp_reg(FRAG_ATTRIB_WPOS, 3));
1921   /* Compute the pixel 1/W value from wpos.w. */
1922   this->pixel_w = fs_reg(this, glsl_type::float_type);
1923   emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
1924   this->current_annotation = NULL;
1925}
1926
1927/** Emits the interpolation for the varying inputs. */
1928void
1929fs_visitor::emit_interpolation_setup_gen6()
1930{
1931   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1932
1933   /* If the pixel centers end up used, the setup is the same as for gen4. */
1934   this->current_annotation = "compute pixel centers";
1935   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1936   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1937   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1938   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
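   /* g1 holds the screen X/Y of the upper-left pixel of each 2x2 subspan;
    * the <2;4,0> regions below broadcast each subspan's origin across its
    * four pixels.  brw_imm_v() packs eight 4-bit values, lowest nibble
    * first, so 0x10101010 adds per-pixel X offsets {0,1,0,1,...} and
    * 0x11001100 adds Y offsets {0,0,1,1,...}.
    */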
1939   emit(BRW_OPCODE_ADD,
1940	int_pixel_x,
1941	fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1942	fs_reg(brw_imm_v(0x10101010)));
1943   emit(BRW_OPCODE_ADD,
1944	int_pixel_y,
1945	fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1946	fs_reg(brw_imm_v(0x11001100)));
1947
1948   /* As of gen6, we can no longer mix float and int sources.  We have
1949    * to turn the integer pixel centers into floats for their actual
1950    * use.
1951    */
1952   this->pixel_x = fs_reg(this, glsl_type::float_type);
1953   this->pixel_y = fs_reg(this, glsl_type::float_type);
1954   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
1955   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
1956
1957   this->current_annotation = "compute pos.w";
1958   this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
1959   this->wpos_w = fs_reg(this, glsl_type::float_type);
1960   emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
1961
1962   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
1963      uint8_t reg = c->barycentric_coord_reg[i];
1964      this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
1965      this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
1966   }
1967
1968   this->current_annotation = NULL;
1969}
1970
1971void
1972fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
1973{
1974   int reg_width = c->dispatch_width / 8;
1975   fs_inst *inst;
1976   fs_reg color = outputs[target];
1977   fs_reg mrf;
1978
1979   /* If there's no color data to be written, skip it. */
1980   if (color.file == BAD_FILE)
1981      return;
1982
1983   color.reg_offset += index;
1984
1985   if (c->dispatch_width == 8 || intel->gen >= 6) {
1986      /* SIMD8 write looks like:
1987       * m + 0: r
1988       * m + 1: g
1989       * m + 2: b
1990       * m + 3: a
1991       *
1992       * gen6 SIMD16 DP write looks like:
1993       * m + 0: r0
1994       * m + 1: r1
1995       * m + 2: g0
1996       * m + 3: g1
1997       * m + 4: b0
1998       * m + 5: b1
1999       * m + 6: a0
2000       * m + 7: a1
2001       */
2002      inst = emit(BRW_OPCODE_MOV,
2003		  fs_reg(MRF, first_color_mrf + index * reg_width, color.type),
2004		  color);
2005      inst->saturate = c->key.clamp_fragment_color;
2006   } else {
2007      /* pre-gen6 SIMD16 single source DP write looks like:
2008       * m + 0: r0
2009       * m + 1: g0
2010       * m + 2: b0
2011       * m + 3: a0
2012       * m + 4: r1
2013       * m + 5: g1
2014       * m + 6: b1
2015       * m + 7: a1
2016       */
2017      if (brw->has_compr4) {
2018	 /* By setting the high bit of the MRF register number, we
2019	  * indicate that we want COMPR4 mode - instead of doing the
2020	  * usual destination + 1 for the second half we get
2021	  * destination + 4.
2022	  */
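	 /* Concretely: for channel index i, the compressed MOV's two
	  * halves land in MRFs (first_color_mrf + i) and
	  * (first_color_mrf + i + 4), yielding the m+0..m+7 layout shown
	  * above with half as many MOVs as the fallback below.
	  */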
2023	 inst = emit(BRW_OPCODE_MOV,
2024		     fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
2025			    color.type),
2026		     color);
2027	 inst->saturate = c->key.clamp_fragment_color;
2028      } else {
2029	 push_force_uncompressed();
2030	 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index,
2031					    color.type),
2032		     color);
2033	 inst->saturate = c->key.clamp_fragment_color;
2034	 pop_force_uncompressed();
2035
2036	 push_force_sechalf();
2037	 color.sechalf = true;
2038	 inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4,
2039					    color.type),
2040		     color);
2041	 inst->saturate = c->key.clamp_fragment_color;
2042	 pop_force_sechalf();
2043	 color.sechalf = false;
2044      }
2045   }
2046}
2047
2048void
2049fs_visitor::emit_fb_writes()
2050{
2051   this->current_annotation = "FB write header";
2052   bool header_present = true;
2053   /* We can potentially have a message length of up to 15, so we have to set
2054    * base_mrf to either 0 or 1 in order to fit in m0..m15.
2055    */
2056   int base_mrf = 1;
2057   int nr = base_mrf;
2058   int reg_width = c->dispatch_width / 8;
2059   bool do_dual_src = this->dual_src_output.file != BAD_FILE;
2060   bool src0_alpha_to_render_target = false;
2061
2062   if (c->dispatch_width == 16 && do_dual_src) {
2063      fail("GL_ARB_blend_func_extended not yet supported in 16-wide.");
2064      do_dual_src = false;
2065   }
2066
2067   /* From the Sandy Bridge PRM, volume 4, page 198:
2068    *
2069    *     "Dispatched Pixel Enables. One bit per pixel indicating
2070    *      which pixels were originally enabled when the thread was
2071    *      dispatched. This field is only required for the end-of-
2072    *      thread message and on all dual-source messages."
2073    */
2074   if (intel->gen >= 6 &&
2075       !this->fp->UsesKill &&
2076       !do_dual_src &&
2077       c->key.nr_color_regions == 1) {
2078      header_present = false;
2079   }
2080
2081   if (header_present) {
2082      src0_alpha_to_render_target = intel->gen >= 6 &&
2083				    !do_dual_src &&
2084				    c->key.nr_color_regions > 1 &&
2085				    c->key.sample_alpha_to_coverage;
2086      /* Two message regs of header (m1, m2, given base_mrf == 1). */
2087      nr += 2;
2088   }
2089
2090   if (c->aa_dest_stencil_reg) {
2091      push_force_uncompressed();
2092      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2093	   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2094      pop_force_uncompressed();
2095   }
2096
2097   /* Reserve space for color. It'll be filled in per MRT below. */
2098   int color_mrf = nr;
2099   nr += 4 * reg_width;
2100   if (do_dual_src)
2101      nr += 4;
2102   if (src0_alpha_to_render_target)
2103      nr += reg_width;
2104
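   /* At this point the message being assembled looks, schematically, like:
    *
    *    m(base_mrf)+0,1:  header (if header_present)
    *    next reg:         AA dest stencil (if aa_dest_stencil_reg)
    *    color_mrf:        4 channels * reg_width regs of color, plus 4
    *                      regs for the second source color when
    *                      dual-source blending, or reg_width regs when
    *                      src0 alpha is replicated to later targets
    *    then:             source depth and/or dest depth, reg_width each
    */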
2105   if (c->source_depth_to_render_target) {
2106      if (intel->gen == 6 && c->dispatch_width == 16) {
2107	 /* For outputting oDepth on gen6, SIMD8 writes have to be
2108	  * used.  This would require 8-wide moves of each half to
2109	  * message regs, kind of like pre-gen5 SIMD16 FB writes.
2110	  * Just bail on doing so for now.
2111	  */
2112	 fail("Missing support for simd16 depth writes on gen6\n");
2113      }
2114
2115      if (c->computes_depth) {
2116	 /* Hand over gl_FragDepth. */
2117	 assert(this->frag_depth);
2118	 fs_reg depth = *(variable_storage(this->frag_depth));
2119
2120	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2121      } else {
2122	 /* Pass through the payload depth. */
2123	 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2124	      fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2125      }
2126      nr += reg_width;
2127   }
2128
2129   if (c->dest_depth_reg) {
2130      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2131	   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2132      nr += reg_width;
2133   }
2134
2135   if (do_dual_src) {
2136      fs_reg src0 = this->outputs[0];
2137      fs_reg src1 = this->dual_src_output;
2138
2139      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2140						 "FB write src0");
2141      for (int i = 0; i < 4; i++) {
2142	 fs_inst *inst = emit(BRW_OPCODE_MOV,
2143			      fs_reg(MRF, color_mrf + i, src0.type),
2144			      src0);
2145	 src0.reg_offset++;
2146	 inst->saturate = c->key.clamp_fragment_color;
2147      }
2148
2149      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2150						 "FB write src1");
2151      for (int i = 0; i < 4; i++) {
2152	 fs_inst *inst = emit(BRW_OPCODE_MOV,
2153			      fs_reg(MRF, color_mrf + 4 + i, src1.type),
2154			      src1);
2155	 src1.reg_offset++;
2156	 inst->saturate = c->key.clamp_fragment_color;
2157      }
2158
2159      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2160      inst->target = 0;
2161      inst->base_mrf = base_mrf;
2162      inst->mlen = nr - base_mrf;
2163      inst->eot = true;
2164      inst->header_present = header_present;
2165
2166      c->prog_data.dual_src_blend = true;
2167      this->current_annotation = NULL;
2168      return;
2169   }
2170
2171   for (int target = 0; target < c->key.nr_color_regions; target++) {
2172      this->current_annotation = ralloc_asprintf(this->mem_ctx,
2173						 "FB write target %d",
2174						 target);
2175      /* If src0_alpha_to_render_target is true, include source zero alpha
2176       * data in RenderTargetWrite message for targets > 0.
2177       */
2178      int write_color_mrf = color_mrf;
2179      if (src0_alpha_to_render_target && target != 0) {
2180         fs_inst *inst;
2181         fs_reg color = outputs[0];
2182         color.reg_offset += 3;
2183
2184         inst = emit(BRW_OPCODE_MOV,
2185		     fs_reg(MRF, write_color_mrf, color.type),
2186		     color);
2187         inst->saturate = c->key.clamp_fragment_color;
2188         write_color_mrf = color_mrf + reg_width;
2189      }
2190
2191      for (unsigned i = 0; i < this->output_components[target]; i++)
2192         emit_color_write(target, i, write_color_mrf);
2193
2194      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2195      inst->target = target;
2196      inst->base_mrf = base_mrf;
2197      if (src0_alpha_to_render_target && target == 0)
2198         inst->mlen = nr - base_mrf - reg_width;
2199      else
2200         inst->mlen = nr - base_mrf;
2201      if (target == c->key.nr_color_regions - 1)
2202	 inst->eot = true;
2203      inst->header_present = header_present;
2204   }
2205
2206   if (c->key.nr_color_regions == 0) {
2207      /* Even if there are no color buffers enabled, we still need to send
2208       * alpha out the pipeline to our null renderbuffer to support
2209       * alpha-testing, alpha-to-coverage, and so on.
2210       */
2211      emit_color_write(0, 3, color_mrf);
2212
2213      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2214      inst->base_mrf = base_mrf;
2215      inst->mlen = nr - base_mrf;
2216      inst->eot = true;
2217      inst->header_present = header_present;
2218   }
2219
2220   this->current_annotation = NULL;
2221}
2222
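/* A negate source modifier on a UD-typed operand isn't reliably
 * interpreted the way the IR expects (e.g. by later unsigned
 * comparisons), so flush the negation through an explicit MOV and hand
 * the caller a plain temporary instead.
 */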
2223void
2224fs_visitor::resolve_ud_negate(fs_reg *reg)
2225{
2226   if (reg->type != BRW_REGISTER_TYPE_UD ||
2227       !reg->negate)
2228      return;
2229
2230   fs_reg temp = fs_reg(this, glsl_type::uint_type);
2231   emit(BRW_OPCODE_MOV, temp, *reg);
2232   *reg = temp;
2233}
2234
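/* Force a boolean rvalue into a canonical 0/1 encoding before it is
 * compared against another bool: a bool value isn't guaranteed to arrive
 * as a clean 0/1 (a CMP result, for instance, sets all 32 destination
 * bits), so mask off everything but the low bit.
 */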
2235void
2236fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
2237{
2238   if (rvalue->type != glsl_type::bool_type)
2239      return;
2240
2241   fs_reg temp = fs_reg(this, glsl_type::bool_type);
2242   emit(BRW_OPCODE_AND, temp, *reg, fs_reg(1));
2243   *reg = temp;
2244}
2245
2246fs_visitor::fs_visitor(struct brw_wm_compile *c, struct gl_shader_program *prog,
2247                       struct brw_shader *shader)
2248{
2249   this->c = c;
2250   this->p = &c->func;
2251   this->brw = p->brw;
2252   this->fp = (struct gl_fragment_program *)
2253      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2254   this->prog = prog;
2255   this->intel = &brw->intel;
2256   this->ctx = &intel->ctx;
2257   this->mem_ctx = ralloc_context(NULL);
2258   this->shader = shader;
2259   this->failed = false;
2260   this->variable_ht = hash_table_ctor(0,
2261                                       hash_table_pointer_hash,
2262                                       hash_table_pointer_compare);
2263
2264   /* There's a question that appears to be left open in the spec:
2265    * How do implicit dst conversions interact with the CMP
2266    * instruction or conditional mods?  On gen6, the instruction:
2267    *
2268    * CMP null<d> src0<f> src1<f>
2269    *
2270    * will do src1 - src0 and compare that result as if it was an
2271    * integer.  On gen4, it will do src1 - src0 as float, convert
2272    * the result to int, and compare as int.  In between, it
2273    * appears that it does src1 - src0 and does the compare in the
2274    * execution type so dst type doesn't matter.
2275    */
2276   if (this->intel->gen > 4)
2277      this->reg_null_cmp = reg_null_d;
2278   else
2279      this->reg_null_cmp = reg_null_f;
2280
2281   this->frag_depth = NULL;
2282   memset(this->outputs, 0, sizeof(this->outputs));
2283   this->first_non_payload_grf = 0;
2284   this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2285
2286   this->current_annotation = NULL;
2287   this->base_ir = NULL;
2288
2289   this->virtual_grf_sizes = NULL;
2290   this->virtual_grf_count = 0;
2291   this->virtual_grf_array_size = 0;
2292   this->virtual_grf_def = NULL;
2293   this->virtual_grf_use = NULL;
2294   this->live_intervals_valid = false;
2295
2296   this->force_uncompressed_stack = 0;
2297   this->force_sechalf_stack = 0;
2298}
2299
2300fs_visitor::~fs_visitor()
2301{
2302   ralloc_free(this->mem_ctx);
2303   hash_table_dtor(this->variable_ht);
2304}
2305