brw_vec4_visitor.cpp revision 81a0b2166991a3015f8336e184c34cf6a92adfe0
1/*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_vec4.h"
25extern "C" {
26#include "main/macros.h"
27#include "program/prog_parameter.h"
28}
29
30namespace brw {
31
32src_reg::src_reg(dst_reg reg)
33{
34   init();
35
36   this->file = reg.file;
37   this->reg = reg.reg;
38   this->reg_offset = reg.reg_offset;
39   this->type = reg.type;
40   this->reladdr = reg.reladdr;
41   this->fixed_hw_reg = reg.fixed_hw_reg;
42
43   int swizzles[4];
44   int next_chan = 0;
45   int last = 0;
46
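   /* Collapse the writemask into a swizzle: pick up the enabled channels in
    * order, then replicate the last enabled channel into any remaining
    * swizzle slots (e.g. a writemask of .xz yields swizzle XZZZ).
    */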
47   for (int i = 0; i < 4; i++) {
48      if (!(reg.writemask & (1 << i)))
49	 continue;
50
51      swizzles[next_chan++] = last = i;
52   }
53
54   for (; next_chan < 4; next_chan++) {
55      swizzles[next_chan] = last;
56   }
57
58   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59				swizzles[2], swizzles[3]);
60}
61
62dst_reg::dst_reg(src_reg reg)
63{
64   init();
65
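   /* Note that the source's swizzle is dropped here; the destination gets a
    * full XYZW writemask regardless of which channels the source selected.
    */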
66   this->file = reg.file;
67   this->reg = reg.reg;
68   this->reg_offset = reg.reg_offset;
69   this->type = reg.type;
70   this->writemask = WRITEMASK_XYZW;
71   this->reladdr = reg.reladdr;
72   this->fixed_hw_reg = reg.fixed_hw_reg;
73}
74
75vec4_instruction::vec4_instruction(vec4_visitor *v,
76				   enum opcode opcode, dst_reg dst,
77				   src_reg src0, src_reg src1, src_reg src2)
78{
79   this->opcode = opcode;
80   this->dst = dst;
81   this->src[0] = src0;
82   this->src[1] = src1;
83   this->src[2] = src2;
84   this->ir = v->base_ir;
85   this->annotation = v->current_annotation;
86}
87
88vec4_instruction *
89vec4_visitor::emit(vec4_instruction *inst)
90{
91   this->instructions.push_tail(inst);
92
93   return inst;
94}
95
96vec4_instruction *
97vec4_visitor::emit(enum opcode opcode, dst_reg dst,
98		   src_reg src0, src_reg src1, src_reg src2)
99{
100   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
101					     src0, src1, src2));
102}
103
104
105vec4_instruction *
106vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
107{
108   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
109}
110
111vec4_instruction *
112vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
113{
114   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
115}
116
117vec4_instruction *
118vec4_visitor::emit(enum opcode opcode)
119{
120   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
121}
122
123void
124vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
125{
126   static enum opcode dot_opcodes[] = {
127      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
128   };
129
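   /* elements must be 2, 3, or 4; anything else would index outside
    * dot_opcodes.
    */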
130   emit(dot_opcodes[elements - 2], dst, src0, src1);
131}
132
133void
134vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
135{
136   /* The gen6 math instruction ignores the source modifiers --
137    * swizzle, abs, negate, and at least some parts of the register
138    * region description.
139    *
140    * While it would seem that this MOV could be avoided at this point
141    * in the case that the swizzle is matched up with the destination
142    * writemask, note that uniform packing and register allocation
143    * could rearrange our swizzle, so let's leave this matter up to
144    * copy propagation later.
145    */
146   src_reg temp_src = src_reg(this, glsl_type::vec4_type);
147   emit(BRW_OPCODE_MOV, dst_reg(temp_src), src);
148
149   if (dst.writemask != WRITEMASK_XYZW) {
150      /* The gen6 math instruction must be align1, so we can't do
151       * writemasks.
152       */
153      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
154
155      emit(opcode, temp_dst, temp_src);
156
157      emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
158   } else {
159      emit(opcode, dst, temp_src);
160   }
161}
162
163void
164vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
165{
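   /* Pre-gen6, math is a message to the shared math unit rather than a
    * regular ALU instruction, so reserve MRF 1 for the operand payload and
    * use a message length of one register.
    */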
166   vec4_instruction *inst = emit(opcode, dst, src);
167   inst->base_mrf = 1;
168   inst->mlen = 1;
169}
170
171void
172vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
173{
174   switch (opcode) {
175   case SHADER_OPCODE_RCP:
176   case SHADER_OPCODE_RSQ:
177   case SHADER_OPCODE_SQRT:
178   case SHADER_OPCODE_EXP2:
179   case SHADER_OPCODE_LOG2:
180   case SHADER_OPCODE_SIN:
181   case SHADER_OPCODE_COS:
182      break;
183   default:
184      assert(!"not reached: bad math opcode");
185      return;
186   }
187
188   if (intel->gen >= 6) {
189      return emit_math1_gen6(opcode, dst, src);
190   } else {
191      return emit_math1_gen4(opcode, dst, src);
192   }
193}
194
195void
196vec4_visitor::emit_math2_gen6(enum opcode opcode,
197			      dst_reg dst, src_reg src0, src_reg src1)
198{
199   src_reg expanded;
200
201   /* The gen6 math instruction ignores the source modifiers --
202    * swizzle, abs, negate, and at least some parts of the register
203    * region description.  Move the sources to temporaries to make it
204    * generally work.
205    */
206
207   expanded = src_reg(this, glsl_type::vec4_type);
208   emit(BRW_OPCODE_MOV, dst_reg(expanded), src0);
209   src0 = expanded;
210
211   expanded = src_reg(this, glsl_type::vec4_type);
212   emit(BRW_OPCODE_MOV, dst_reg(expanded), src1);
213   src1 = expanded;
214
215   if (dst.writemask != WRITEMASK_XYZW) {
216      /* The gen6 math instruction must be align1, so we can't do
217       * writemasks.
218       */
219      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
220
221      emit(opcode, temp_dst, src0, src1);
222
223      emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
224   } else {
225      emit(opcode, dst, src0, src1);
226   }
227}
228
229void
230vec4_visitor::emit_math2_gen4(enum opcode opcode,
231			      dst_reg dst, src_reg src0, src_reg src1)
232{
233   vec4_instruction *inst = emit(opcode, dst, src0, src1);
234   inst->base_mrf = 1;
235   inst->mlen = 2;
236}
237
238void
239vec4_visitor::emit_math(enum opcode opcode,
240			dst_reg dst, src_reg src0, src_reg src1)
241{
242   assert(opcode == SHADER_OPCODE_POW);
243
244   if (intel->gen >= 6) {
245      return emit_math2_gen6(opcode, dst, src0, src1);
246   } else {
247      return emit_math2_gen4(opcode, dst, src0, src1);
248   }
249}
250
251void
252vec4_visitor::visit_instructions(const exec_list *list)
253{
254   foreach_list(node, list) {
255      ir_instruction *ir = (ir_instruction *)node;
256
257      base_ir = ir;
258      ir->accept(this);
259   }
260}
261
262
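/* Returns how many vec4 slots a variable of the given type occupies. */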
263static int
264type_size(const struct glsl_type *type)
265{
266   unsigned int i;
267   int size;
268
269   switch (type->base_type) {
270   case GLSL_TYPE_UINT:
271   case GLSL_TYPE_INT:
272   case GLSL_TYPE_FLOAT:
273   case GLSL_TYPE_BOOL:
274      if (type->is_matrix()) {
275	 return type->matrix_columns;
276      } else {
277	 /* Regardless of size of vector, it gets a vec4. This is bad
278	  * packing for things like floats, but otherwise arrays become a
279	  * mess.  Hopefully a later pass over the code can pack scalars
280	  * down if appropriate.
281	  */
282	 return 1;
283      }
284   case GLSL_TYPE_ARRAY:
285      assert(type->length > 0);
286      return type_size(type->fields.array) * type->length;
287   case GLSL_TYPE_STRUCT:
288      size = 0;
289      for (i = 0; i < type->length; i++) {
290	 size += type_size(type->fields.structure[i].type);
291      }
292      return size;
293   case GLSL_TYPE_SAMPLER:
294      /* Samplers take up one slot in UNIFORMS[], but they're baked in
295       * at link time.
296       */
297      return 1;
298   default:
299      assert(0);
300      return 0;
301   }
302}
303
304int
305vec4_visitor::virtual_grf_alloc(int size)
306{
307   if (virtual_grf_array_size <= virtual_grf_count) {
308      if (virtual_grf_array_size == 0)
309	 virtual_grf_array_size = 16;
310      else
311	 virtual_grf_array_size *= 2;
312      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
313				   virtual_grf_array_size);
314   }
315   virtual_grf_sizes[virtual_grf_count] = size;
316   return virtual_grf_count++;
317}
318
319src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
320{
321   init();
322
323   this->file = GRF;
324   this->reg = v->virtual_grf_alloc(type_size(type));
325
326   if (type->is_array() || type->is_record()) {
327      this->swizzle = BRW_SWIZZLE_NOOP;
328   } else {
329      this->swizzle = swizzle_for_size(type->vector_elements);
330   }
331
332   this->type = brw_type_for_base_type(type);
333}
334
335dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
336{
337   init();
338
339   this->file = GRF;
340   this->reg = v->virtual_grf_alloc(type_size(type));
341
342   if (type->is_array() || type->is_record()) {
343      this->writemask = WRITEMASK_XYZW;
344   } else {
345      this->writemask = (1 << type->vector_elements) - 1;
346   }
347
348   this->type = brw_type_for_base_type(type);
349}
350
351/* Our support for uniforms is piggy-backed on the struct
352 * gl_vertex_program, because that's where the values actually
353 * get stored, rather than in some global gl_shader_program uniform
354 * store.
355 */
356int
357vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
358{
359   unsigned int offset = 0;
360   float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
361
362   if (type->is_matrix()) {
363      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
364							type->vector_elements,
365							1);
366
367      for (unsigned int i = 0; i < type->matrix_columns; i++) {
368	 offset += setup_uniform_values(loc + offset, column);
369      }
370
371      return offset;
372   }
373
374   switch (type->base_type) {
375   case GLSL_TYPE_FLOAT:
376   case GLSL_TYPE_UINT:
377   case GLSL_TYPE_INT:
378   case GLSL_TYPE_BOOL:
379      for (unsigned int i = 0; i < type->vector_elements; i++) {
380	 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
381      }
382
383      /* Set up pad elements to get things aligned to a vec4 boundary. */
384      for (unsigned int i = type->vector_elements; i < 4; i++) {
385	 static float zero = 0;
386
387	 c->prog_data.param[this->uniforms * 4 + i] = &zero;
388      }
389
390      /* Track the size of this uniform vector, for future packing of
391       * uniforms.
392       */
393      this->uniform_vector_size[this->uniforms] = type->vector_elements;
394      this->uniforms++;
395
396      return 1;
397
398   case GLSL_TYPE_STRUCT:
399      for (unsigned int i = 0; i < type->length; i++) {
400	 offset += setup_uniform_values(loc + offset,
401					type->fields.structure[i].type);
402      }
403      return offset;
404
405   case GLSL_TYPE_ARRAY:
406      for (unsigned int i = 0; i < type->length; i++) {
407	 offset += setup_uniform_values(loc + offset, type->fields.array);
408      }
409      return offset;
410
411   case GLSL_TYPE_SAMPLER:
412      /* The sampler takes up a slot, but we don't use any values from it. */
413      return 1;
414
415   default:
416      assert(!"not reached");
417      return 0;
418   }
419}
420
421/* Our support for builtin uniforms is even scarier than non-builtin.
422 * It sits on top of the PROG_STATE_VAR parameters that are
423 * automatically updated from GL context state.
424 */
425void
426vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
427{
428   const ir_state_slot *const slots = ir->state_slots;
429   assert(ir->state_slots != NULL);
430
431   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
432      /* This state reference has already been setup by ir_to_mesa,
433       * but we'll get the same index back here.  We can reference
434       * ParameterValues directly, since unlike brw_fs.cpp, we never
435       * add new state references during compile.
436       */
437      int index = _mesa_add_state_reference(this->vp->Base.Parameters,
438					    (gl_state_index *)slots[i].tokens);
439      float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
440
441      this->uniform_vector_size[this->uniforms] = 0;
442      /* Add each of the unique swizzled channels of the element.
443       * This will end up matching the size of the glsl_type of this field.
444       */
445      int last_swiz = -1;
446      for (unsigned int j = 0; j < 4; j++) {
447	 int swiz = GET_SWZ(slots[i].swizzle, j);
448
449	 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
450	 if (swiz > last_swiz)
451	    this->uniform_vector_size[this->uniforms]++;
452	 last_swiz = swiz;
453      }
454      this->uniforms++;
455   }
456}
457
458dst_reg *
459vec4_visitor::variable_storage(ir_variable *var)
460{
461   return (dst_reg *)hash_table_find(this->variable_ht, var);
462}
463
464void
465vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
466{
467   ir_expression *expr = ir->as_expression();
468
469   if (expr) {
470      src_reg op[2];
471      vec4_instruction *inst;
472
473      assert(expr->get_num_operands() <= 2);
474      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
475	 assert(expr->operands[i]->type->is_scalar());
476
477	 expr->operands[i]->accept(this);
478	 op[i] = this->result;
479      }
480
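      /* Fold the boolean expression directly into a condition-code write:
       * each case below emits its operation to a null register with a
       * conditional mod, so only the flag register is updated.
       */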
481      switch (expr->operation) {
482      case ir_unop_logic_not:
483	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], src_reg(1));
484	 inst->conditional_mod = BRW_CONDITIONAL_Z;
485	 break;
486
487      case ir_binop_logic_xor:
488	 inst = emit(BRW_OPCODE_XOR, dst_null_d(), op[0], op[1]);
489	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
490	 break;
491
492      case ir_binop_logic_or:
493	 inst = emit(BRW_OPCODE_OR, dst_null_d(), op[0], op[1]);
494	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
495	 break;
496
497      case ir_binop_logic_and:
498	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], op[1]);
499	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
500	 break;
501
502      case ir_unop_f2b:
503	 if (intel->gen >= 6) {
504	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0.0f));
505	 } else {
506	    inst = emit(BRW_OPCODE_MOV, dst_null_f(), op[0]);
507	 }
508	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
509	 break;
510
511      case ir_unop_i2b:
512	 if (intel->gen >= 6) {
513	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
514	 } else {
515	    inst = emit(BRW_OPCODE_MOV, dst_null_d(), op[0]);
516	 }
517	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
518	 break;
519
520      case ir_binop_greater:
521      case ir_binop_gequal:
522      case ir_binop_less:
523      case ir_binop_lequal:
524      case ir_binop_equal:
525      case ir_binop_all_equal:
526      case ir_binop_nequal:
527      case ir_binop_any_nequal:
528	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
529	 inst->conditional_mod =
530	    brw_conditional_for_comparison(expr->operation);
531	 break;
532
533      default:
534	 assert(!"not reached");
535	 break;
536      }
537      return;
538   }
539
540   ir->accept(this);
541
542   if (intel->gen >= 6) {
543      vec4_instruction *inst = emit(BRW_OPCODE_AND, dst_null_d(),
544			       this->result, src_reg(1));
545      inst->conditional_mod = BRW_CONDITIONAL_NZ;
546   } else {
547      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst_null_d(), this->result);
548      inst->conditional_mod = BRW_CONDITIONAL_NZ;
549   }
550}
551
552/**
553 * Emit a gen6 IF statement with the comparison folded into the IF
554 * instruction.
555 */
556void
557vec4_visitor::emit_if_gen6(ir_if *ir)
558{
559   ir_expression *expr = ir->condition->as_expression();
560
561   if (expr) {
562      src_reg op[2];
563      vec4_instruction *inst;
564      dst_reg temp;
565
566      assert(expr->get_num_operands() <= 2);
567      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
568	 expr->operands[i]->accept(this);
569	 op[i] = this->result;
570      }
571
572      switch (expr->operation) {
573      case ir_unop_logic_not:
574	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
575	 inst->conditional_mod = BRW_CONDITIONAL_Z;
576	 return;
577
578      case ir_binop_logic_xor:
579	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
580	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
581	 return;
582
583      case ir_binop_logic_or:
584	 temp = dst_reg(this, glsl_type::bool_type);
585	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
586	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
587	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
588	 return;
589
590      case ir_binop_logic_and:
591	 temp = dst_reg(this, glsl_type::bool_type);
592	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
593	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
594	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
595	 return;
596
597      case ir_unop_f2b:
598	 inst = emit(BRW_OPCODE_IF, dst_null_f(), op[0], src_reg(0));
599	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
600	 return;
601
602      case ir_unop_i2b:
603	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
604	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
605	 return;
606
607      case ir_binop_greater:
608      case ir_binop_gequal:
609      case ir_binop_less:
610      case ir_binop_lequal:
611      case ir_binop_equal:
612      case ir_binop_nequal:
613	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
614	 inst->conditional_mod =
615	    brw_conditional_for_comparison(expr->operation);
616	 return;
617
618      case ir_binop_all_equal:
619	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
620	 inst->conditional_mod = BRW_CONDITIONAL_Z;
621
622	 inst = emit(BRW_OPCODE_IF);
623	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
624	 return;
625
626      case ir_binop_any_nequal:
627	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
628	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
629
630	 inst = emit(BRW_OPCODE_IF);
631	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
632	 return;
633
634      case ir_unop_any:
635	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
636	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
637
638	 inst = emit(BRW_OPCODE_IF);
639	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
640	 return;
641
642      default:
643	 assert(!"not reached");
644	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
645	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
646	 return;
647      }
648      return;
649   }
650
651   ir->condition->accept(this);
652
653   vec4_instruction *inst = emit(BRW_OPCODE_IF, dst_null_d(),
654			    this->result, src_reg(0));
655   inst->conditional_mod = BRW_CONDITIONAL_NZ;
656}
657
658void
659vec4_visitor::visit(ir_variable *ir)
660{
661   dst_reg *reg = NULL;
662
663   if (variable_storage(ir))
664      return;
665
666   switch (ir->mode) {
667   case ir_var_in:
668      reg = new(mem_ctx) dst_reg(ATTR, ir->location);
669
670      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
671       * come in as floating point conversions of the integer values.
672       */
673      for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
674	 if (!c->key.gl_fixed_input_size[i])
675	    continue;
676
677	 dst_reg dst = *reg;
678	 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
679	 emit(BRW_OPCODE_MUL, dst, src_reg(dst), src_reg(1.0f / 65536.0f));
680      }
681      break;
682
683   case ir_var_out:
684      reg = new(mem_ctx) dst_reg(this, ir->type);
685
686      for (int i = 0; i < type_size(ir->type); i++) {
687	 output_reg[ir->location + i] = *reg;
688	 output_reg[ir->location + i].reg_offset = i;
689	 output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
690      }
691      break;
692
693   case ir_var_auto:
694   case ir_var_temporary:
695      reg = new(mem_ctx) dst_reg(this, ir->type);
696      break;
697
698   case ir_var_uniform:
699      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
700
701      /* Track how big the whole uniform variable is, in case we need to put a
702       * copy of its data into pull constants for array access.
703       */
704      this->uniform_size[this->uniforms] = type_size(ir->type);
705
706      if (!strncmp(ir->name, "gl_", 3)) {
707	 setup_builtin_uniform_values(ir);
708      } else {
709	 setup_uniform_values(ir->location, ir->type);
710      }
711      break;
712
713   default:
714      assert(!"not reached");
715   }
716
717   reg->type = brw_type_for_base_type(ir->type);
718   hash_table_insert(this->variable_ht, reg, ir);
719}
720
721void
722vec4_visitor::visit(ir_loop *ir)
723{
724   dst_reg counter;
725
726   /* We don't want debugging output to print the whole body of the
727    * loop as the annotation.
728    */
729   this->base_ir = NULL;
730
731   if (ir->counter != NULL) {
732      this->base_ir = ir->counter;
733      ir->counter->accept(this);
734      counter = *(variable_storage(ir->counter));
735
736      if (ir->from != NULL) {
737	 this->base_ir = ir->from;
738	 ir->from->accept(this);
739
740	 emit(BRW_OPCODE_MOV, counter, this->result);
741      }
742   }
743
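   /* The loop lowers to DO ... WHILE: any bound test is emitted at the top
    * of the body as a CMP plus a predicated BREAK, and the counter increment
    * is emitted at the bottom, just before the WHILE.
    */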
744   emit(BRW_OPCODE_DO);
745
746   if (ir->to) {
747      this->base_ir = ir->to;
748      ir->to->accept(this);
749
750      vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst_null_d(),
751				    src_reg(counter), this->result);
752      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
753
754      inst = emit(BRW_OPCODE_BREAK);
755      inst->predicate = BRW_PREDICATE_NORMAL;
756   }
757
758   visit_instructions(&ir->body_instructions);
759
760
761   if (ir->increment) {
762      this->base_ir = ir->increment;
763      ir->increment->accept(this);
764      emit(BRW_OPCODE_ADD, counter, src_reg(counter), this->result);
765   }
766
767   emit(BRW_OPCODE_WHILE);
768}
769
770void
771vec4_visitor::visit(ir_loop_jump *ir)
772{
773   switch (ir->mode) {
774   case ir_loop_jump::jump_break:
775      emit(BRW_OPCODE_BREAK);
776      break;
777   case ir_loop_jump::jump_continue:
778      emit(BRW_OPCODE_CONTINUE);
779      break;
780   }
781}
782
783
784void
785vec4_visitor::visit(ir_function_signature *ir)
786{
787   assert(0);
788   (void)ir;
789}
790
791void
792vec4_visitor::visit(ir_function *ir)
793{
794   /* Ignore function bodies other than main() -- we shouldn't see calls to
795    * them since they should all be inlined.
796    */
797   if (strcmp(ir->name, "main") == 0) {
798      const ir_function_signature *sig;
799      exec_list empty;
800
801      sig = ir->matching_signature(&empty);
802
803      assert(sig);
804
805      visit_instructions(&sig->body);
806   }
807}
808
809GLboolean
810vec4_visitor::try_emit_sat(ir_expression *ir)
811{
812   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
813   if (!sat_src)
814      return false;
815
816   sat_src->accept(this);
817   src_reg src = this->result;
818
819   this->result = src_reg(this, ir->type);
820   vec4_instruction *inst;
821   inst = emit(BRW_OPCODE_MOV, dst_reg(this->result), src);
822   inst->saturate = true;
823
824   return true;
825}
826
827void
828vec4_visitor::emit_bool_comparison(unsigned int op,
829				 dst_reg dst, src_reg src0, src_reg src1)
830{
831   /* original gen4 does destination conversion before comparison. */
832   if (intel->gen < 5)
833      dst.type = src0.type;
834
835   vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst, src0, src1);
836   inst->conditional_mod = brw_conditional_for_comparison(op);
837
838   dst.type = BRW_REGISTER_TYPE_D;
839   emit(BRW_OPCODE_AND, dst, src_reg(dst), src_reg(0x1));
840}
841
842void
843vec4_visitor::visit(ir_expression *ir)
844{
845   unsigned int operand;
846   src_reg op[Elements(ir->operands)];
847   src_reg result_src;
848   dst_reg result_dst;
849   vec4_instruction *inst;
850
851   if (try_emit_sat(ir))
852      return;
853
854   for (operand = 0; operand < ir->get_num_operands(); operand++) {
855      this->result.file = BAD_FILE;
856      ir->operands[operand]->accept(this);
857      if (this->result.file == BAD_FILE) {
858	 printf("Failed to get tree for expression operand:\n");
859	 ir->operands[operand]->print();
860	 exit(1);
861      }
862      op[operand] = this->result;
863
864      /* Matrix expression operands should have been broken down to vector
865       * operations already.
866       */
867      assert(!ir->operands[operand]->type->is_matrix());
868   }
869
870   int vector_elements = ir->operands[0]->type->vector_elements;
871   if (ir->operands[1]) {
872      vector_elements = MAX2(vector_elements,
873			     ir->operands[1]->type->vector_elements);
874   }
875
876   this->result.file = BAD_FILE;
877
878   /* Storage for our result.  Ideally for an assignment we'd be using
879    * the actual storage for the result here, instead.
880    */
881   result_src = src_reg(this, ir->type);
882   /* convenience for the emit functions below. */
883   result_dst = dst_reg(result_src);
884   /* If nothing special happens, this is the result. */
885   this->result = result_src;
886   /* Limit writes to the channels that will be used by result_src later.
887    * This does limit this temp's use as a temporary for multi-instruction
888    * sequences.
889    */
890   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
891
892   switch (ir->operation) {
893   case ir_unop_logic_not:
894      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
895       * the one's complement of the whole register, not just bit 0.
896       */
897      emit(BRW_OPCODE_XOR, result_dst, op[0], src_reg(1));
898      break;
899   case ir_unop_neg:
900      op[0].negate = !op[0].negate;
901      this->result = op[0];
902      break;
903   case ir_unop_abs:
904      op[0].abs = true;
905      op[0].negate = false;
906      this->result = op[0];
907      break;
908
909   case ir_unop_sign:
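      /* sign(x): start with 0.0, then use predicated moves to overwrite with
       * 1.0 where x > 0 and with -1.0 where x < 0.
       */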
910      emit(BRW_OPCODE_MOV, result_dst, src_reg(0.0f));
911
912      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
913      inst->conditional_mod = BRW_CONDITIONAL_G;
914      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1.0f));
915      inst->predicate = BRW_PREDICATE_NORMAL;
916
917      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
918      inst->conditional_mod = BRW_CONDITIONAL_L;
919      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(-1.0f));
920      inst->predicate = BRW_PREDICATE_NORMAL;
921
922      break;
923
924   case ir_unop_rcp:
925      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
926      break;
927
928   case ir_unop_exp2:
929      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
930      break;
931   case ir_unop_log2:
932      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
933      break;
934   case ir_unop_exp:
935   case ir_unop_log:
936      assert(!"not reached: should be handled by ir_explog_to_explog2");
937      break;
938   case ir_unop_sin:
939   case ir_unop_sin_reduced:
940      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
941      break;
942   case ir_unop_cos:
943   case ir_unop_cos_reduced:
944      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
945      break;
946
947   case ir_unop_dFdx:
948   case ir_unop_dFdy:
949      assert(!"derivatives not valid in vertex shader");
950      break;
951
952   case ir_unop_noise:
953      assert(!"not reached: should be handled by lower_noise");
954      break;
955
956   case ir_binop_add:
957      emit(BRW_OPCODE_ADD, result_dst, op[0], op[1]);
958      break;
959   case ir_binop_sub:
960      assert(!"not reached: should be handled by ir_sub_to_add_neg");
961      break;
962
963   case ir_binop_mul:
964      if (ir->type->is_integer()) {
965	 /* For integer multiplication, the MUL uses the low 16 bits
966	  * of one of the operands (src0 on gen6, src1 on gen7).  The
967	  * MACH accumulates the contribution of the upper 16 bits
968	  * of that operand.
969	  *
970	  * FINISHME: Emit just the MUL if we know an operand is small
971	  * enough.
972	  */
973	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
974
975	 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
976	 emit(BRW_OPCODE_MACH, dst_null_d(), op[0], op[1]);
977	 emit(BRW_OPCODE_MOV, result_dst, src_reg(acc));
978      } else {
979	 emit(BRW_OPCODE_MUL, result_dst, op[0], op[1]);
980      }
981      break;
982   case ir_binop_div:
983      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
984   case ir_binop_mod:
985      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
986      break;
987
988   case ir_binop_less:
989   case ir_binop_greater:
990   case ir_binop_lequal:
991   case ir_binop_gequal:
992   case ir_binop_equal:
993   case ir_binop_nequal: {
994      dst_reg temp = result_dst;
995      /* original gen4 does implicit conversion before comparison. */
996      if (intel->gen < 5)
997	 temp.type = op[0].type;
998
999      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1000      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
1001      emit(BRW_OPCODE_AND, result_dst, this->result, src_reg(0x1));
1002      break;
1003   }
1004
1005   case ir_binop_all_equal:
1006      /* "==" operator producing a scalar boolean. */
1007      if (ir->operands[0]->type->is_vector() ||
1008	  ir->operands[1]->type->is_vector()) {
1009	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
1010	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1011
1012	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1013	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1014	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1015      } else {
1016	 dst_reg temp = result_dst;
1017	 /* original gen4 does implicit conversion before comparison. */
1018	 if (intel->gen < 5)
1019	    temp.type = op[0].type;
1020
1021	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1022	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1023	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
1024      }
1025      break;
1026   case ir_binop_any_nequal:
1027      /* "!=" operator producing a scalar boolean. */
1028      if (ir->operands[0]->type->is_vector() ||
1029	  ir->operands[1]->type->is_vector()) {
1030	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
1031	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1032
1033	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1034	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1035	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1036      } else {
1037	 dst_reg temp = result_dst;
1038	 /* original gen4 does implicit conversion before comparison. */
1039	 if (intel->gen < 5)
1040	    temp.type = op[0].type;
1041
1042	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1043	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1044	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
1045      }
1046      break;
1047
1048   case ir_unop_any:
1049      inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
1050      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1051
1052      emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1053
1054      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1055      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1056      break;
1057
1058   case ir_binop_logic_xor:
1059      emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
1060      break;
1061
1062   case ir_binop_logic_or:
1063      emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
1064      break;
1065
1066   case ir_binop_logic_and:
1067      emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
1068      break;
1069
1070   case ir_binop_dot:
1071      assert(ir->operands[0]->type->is_vector());
1072      assert(ir->operands[0]->type == ir->operands[1]->type);
1073      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1074      break;
1075
1076   case ir_unop_sqrt:
1077      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1078      break;
1079   case ir_unop_rsq:
1080      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1081      break;
1082   case ir_unop_i2f:
1083   case ir_unop_i2u:
1084   case ir_unop_u2i:
1085   case ir_unop_u2f:
1086   case ir_unop_b2f:
1087   case ir_unop_b2i:
1088   case ir_unop_f2i:
1089      emit(BRW_OPCODE_MOV, result_dst, op[0]);
1090      break;
1091   case ir_unop_f2b:
1092   case ir_unop_i2b: {
1093      dst_reg temp = result_dst;
1094      /* original gen4 does implicit conversion before comparison. */
1095      if (intel->gen < 5)
1096	 temp.type = op[0].type;
1097
1098      inst = emit(BRW_OPCODE_CMP, temp, op[0], src_reg(0.0f));
1099      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1100      inst = emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(1));
1101      break;
1102   }
1103
1104   case ir_unop_trunc:
1105      emit(BRW_OPCODE_RNDZ, result_dst, op[0]);
1106      break;
1107   case ir_unop_ceil:
1108      op[0].negate = !op[0].negate;
1109      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
1110      this->result.negate = true;
1111      break;
1112   case ir_unop_floor:
1113      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
1114      break;
1115   case ir_unop_fract:
1116      inst = emit(BRW_OPCODE_FRC, result_dst, op[0]);
1117      break;
1118   case ir_unop_round_even:
1119      emit(BRW_OPCODE_RNDE, result_dst, op[0]);
1120      break;
1121
1122   case ir_binop_min:
1123      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
1124      inst->conditional_mod = BRW_CONDITIONAL_L;
1125
1126      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1127      inst->predicate = BRW_PREDICATE_NORMAL;
1128      break;
1129   case ir_binop_max:
1130      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
1131      inst->conditional_mod = BRW_CONDITIONAL_G;
1132
1133      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1134      inst->predicate = BRW_PREDICATE_NORMAL;
1135      break;
1136
1137   case ir_binop_pow:
1138      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1139      break;
1140
1141   case ir_unop_bit_not:
1142      inst = emit(BRW_OPCODE_NOT, result_dst, op[0]);
1143      break;
1144   case ir_binop_bit_and:
1145      inst = emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
1146      break;
1147   case ir_binop_bit_xor:
1148      inst = emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
1149      break;
1150   case ir_binop_bit_or:
1151      inst = emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
1152      break;
1153
1154   case ir_binop_lshift:
1155   case ir_binop_rshift:
1156      assert(!"GLSL 1.30 features unsupported");
1157      break;
1158
1159   case ir_quadop_vector:
1160      assert(!"not reached: should be handled by lower_quadop_vector");
1161      break;
1162   }
1163}
1164
1165
1166void
1167vec4_visitor::visit(ir_swizzle *ir)
1168{
1169   src_reg src;
1170   int i = 0;
1171   int swizzle[4];
1172
1173   /* Note that this is only swizzles in expressions, not those on the left
1174    * hand side of an assignment, which do write masking.  See ir_assignment
1175    * for that.
1176    */
1177
1178   ir->val->accept(this);
1179   src = this->result;
1180   assert(src.file != BAD_FILE);
1181
1182   for (i = 0; i < ir->type->vector_elements; i++) {
1183      switch (i) {
1184      case 0:
1185	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1186	 break;
1187      case 1:
1188	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1189	 break;
1190      case 2:
1191	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1192	 break;
1193      case 3:
1194	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1195	 break;
1196      }
1197   }
1198   for (; i < 4; i++) {
1199      /* Replicate the last channel out. */
1200      swizzle[i] = swizzle[ir->type->vector_elements - 1];
1201   }
1202
1203   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1204
1205   this->result = src;
1206}
1207
1208void
1209vec4_visitor::visit(ir_dereference_variable *ir)
1210{
1211   const struct glsl_type *type = ir->type;
1212   dst_reg *reg = variable_storage(ir->var);
1213
1214   if (!reg) {
1215      fail("Failed to find variable storage for %s\n", ir->var->name);
1216      this->result = src_reg(brw_null_reg());
1217      return;
1218   }
1219
1220   this->result = src_reg(*reg);
1221
1222   if (type->is_scalar() || type->is_vector() || type->is_matrix())
1223      this->result.swizzle = swizzle_for_size(type->vector_elements);
1224}
1225
1226void
1227vec4_visitor::visit(ir_dereference_array *ir)
1228{
1229   ir_constant *constant_index;
1230   src_reg src;
1231   int element_size = type_size(ir->type);
1232
1233   constant_index = ir->array_index->constant_expression_value();
1234
1235   ir->array->accept(this);
1236   src = this->result;
1237
1238   if (constant_index) {
1239      src.reg_offset += constant_index->value.i[0] * element_size;
1240   } else {
1241      /* Variable index array dereference.  It eats the "vec4" of the
1242       * base of the array and an index that offsets the Mesa register
1243       * index.
1244       */
1245      ir->array_index->accept(this);
1246
1247      src_reg index_reg;
1248
1249      if (element_size == 1) {
1250	 index_reg = this->result;
1251      } else {
1252	 index_reg = src_reg(this, glsl_type::int_type);
1253
1254	 emit(BRW_OPCODE_MUL, dst_reg(index_reg),
1255	      this->result, src_reg(element_size));
1256      }
1257
1258      if (src.reladdr) {
1259	 src_reg temp = src_reg(this, glsl_type::int_type);
1260
1261	 emit(BRW_OPCODE_ADD, dst_reg(temp), *src.reladdr, index_reg);
1262
1263	 index_reg = temp;
1264      }
1265
1266      src.reladdr = ralloc(mem_ctx, src_reg);
1267      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1268   }
1269
1270   /* If the type is smaller than a vec4, replicate the last channel out. */
1271   if (ir->type->is_scalar() || ir->type->is_vector())
1272      src.swizzle = swizzle_for_size(ir->type->vector_elements);
1273   else
1274      src.swizzle = BRW_SWIZZLE_NOOP;
1275   src.type = brw_type_for_base_type(ir->type);
1276
1277   this->result = src;
1278}
1279
1280void
1281vec4_visitor::visit(ir_dereference_record *ir)
1282{
1283   unsigned int i;
1284   const glsl_type *struct_type = ir->record->type;
1285   int offset = 0;
1286
1287   ir->record->accept(this);
1288
1289   for (i = 0; i < struct_type->length; i++) {
1290      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1291	 break;
1292      offset += type_size(struct_type->fields.structure[i].type);
1293   }
1294
1295   /* If the type is smaller than a vec4, replicate the last channel out. */
1296   if (ir->type->is_scalar() || ir->type->is_vector())
1297      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1298   else
1299      this->result.swizzle = BRW_SWIZZLE_NOOP;
1300   this->result.type = brw_type_for_base_type(ir->type);
1301
1302   this->result.reg_offset += offset;
1303}
1304
1305/**
1306 * We want to be careful in assignment setup to hit the actual storage
1307 * instead of potentially using a temporary like we might with the
1308 * ir_dereference handler.
1309 */
1310static dst_reg
1311get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1312{
1313   /* The LHS must be a dereference.  If the LHS is a variable indexed array
1314    * access of a vector, it must be separated into a series of conditional moves
1315    * before reaching this point (see ir_vec_index_to_cond_assign).
1316    */
1317   assert(ir->as_dereference());
1318   ir_dereference_array *deref_array = ir->as_dereference_array();
1319   if (deref_array) {
1320      assert(!deref_array->array->type->is_vector());
1321   }
1322
1323   /* Use the rvalue deref handler for the most part.  We'll ignore
1324    * swizzles in it and write swizzles using writemask, though.
1325    */
1326   ir->accept(v);
1327   return dst_reg(v->result);
1328}
1329
1330void
1331vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1332			      const struct glsl_type *type, bool predicated)
1333{
1334   if (type->base_type == GLSL_TYPE_STRUCT) {
1335      for (unsigned int i = 0; i < type->length; i++) {
1336	 emit_block_move(dst, src, type->fields.structure[i].type, predicated);
1337      }
1338      return;
1339   }
1340
1341   if (type->is_array()) {
1342      for (unsigned int i = 0; i < type->length; i++) {
1343	 emit_block_move(dst, src, type->fields.array, predicated);
1344      }
1345      return;
1346   }
1347
1348   if (type->is_matrix()) {
1349      const struct glsl_type *vec_type;
1350
1351      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1352					 type->vector_elements, 1);
1353
1354      for (int i = 0; i < type->matrix_columns; i++) {
1355	 emit_block_move(dst, src, vec_type, predicated);
1356      }
1357      return;
1358   }
1359
1360   assert(type->is_scalar() || type->is_vector());
1361
1362   dst->type = brw_type_for_base_type(type);
1363   src->type = dst->type;
1364
1365   dst->writemask = (1 << type->vector_elements) - 1;
1366
1367   /* Do we need to worry about swizzling a swizzle? */
1368   assert(src->swizzle == BRW_SWIZZLE_NOOP);
1369   src->swizzle = swizzle_for_size(type->vector_elements);
1370
1371   vec4_instruction *inst = emit(BRW_OPCODE_MOV, *dst, *src);
1372   if (predicated)
1373      inst->predicate = BRW_PREDICATE_NORMAL;
1374
1375   dst->reg_offset++;
1376   src->reg_offset++;
1377}
1378
1379
1380/* If the RHS processing resulted in an instruction generating a
1381 * temporary value, and it would be easy to rewrite the instruction to
1382 * generate its result right into the LHS instead, do so.  This ends
1383 * up reliably removing instructions where it can be tricky to do so
1384 * later without real UD chain information.
1385 */
1386bool
1387vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1388				     dst_reg dst,
1389				     src_reg src,
1390				     vec4_instruction *pre_rhs_inst,
1391				     vec4_instruction *last_rhs_inst)
1392{
1393   /* This could be supported, but it would take more smarts. */
1394   if (ir->condition)
1395      return false;
1396
1397   if (pre_rhs_inst == last_rhs_inst)
1398      return false; /* No instructions generated to work with. */
1399
1400   /* Make sure the last instruction generated our source reg. */
1401   if (src.file != GRF ||
1402       src.file != last_rhs_inst->dst.file ||
1403       src.reg != last_rhs_inst->dst.reg ||
1404       src.reg_offset != last_rhs_inst->dst.reg_offset ||
1405       src.reladdr ||
1406       src.abs ||
1407       src.negate ||
1408       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1409      return false;
1410
1411   /* Check that the last instruction fully initialized the channels
1412    * we want to use, in the order we want to use them.  We could
1413    * potentially reswizzle the operands of many instructions so that
1414    * we could handle out of order channels, but don't yet.
1415    */
1416   for (int i = 0; i < 4; i++) {
1417      if (dst.writemask & (1 << i)) {
1418	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1419	    return false;
1420
1421	 if (BRW_GET_SWZ(src.swizzle, i) != i)
1422	    return false;
1423      }
1424   }
1425
1426   /* Success!  Rewrite the instruction. */
1427   last_rhs_inst->dst.file = dst.file;
1428   last_rhs_inst->dst.reg = dst.reg;
1429   last_rhs_inst->dst.reg_offset = dst.reg_offset;
1430   last_rhs_inst->dst.reladdr = dst.reladdr;
1431   last_rhs_inst->dst.writemask &= dst.writemask;
1432
1433   return true;
1434}
1435
1436void
1437vec4_visitor::visit(ir_assignment *ir)
1438{
1439   dst_reg dst = get_assignment_lhs(ir->lhs, this);
1440
1441   if (!ir->lhs->type->is_scalar() &&
1442       !ir->lhs->type->is_vector()) {
1443      ir->rhs->accept(this);
1444      src_reg src = this->result;
1445
1446      if (ir->condition) {
1447	 emit_bool_to_cond_code(ir->condition);
1448      }
1449
1450      emit_block_move(&dst, &src, ir->rhs->type, ir->condition != NULL);
1451      return;
1452   }
1453
1454   /* Now we're down to just a scalar/vector with writemasks. */
1455   int i;
1456
1457   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1458   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1459
1460   ir->rhs->accept(this);
1461
1462   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1463
1464   src_reg src = this->result;
1465
1466   int swizzles[4];
1467   int first_enabled_chan = 0;
1468   int src_chan = 0;
1469
1470   assert(ir->lhs->type->is_vector() ||
1471	  ir->lhs->type->is_scalar());
1472   dst.writemask = ir->write_mask;
1473
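   /* Find the source channel feeding the first enabled destination channel;
    * the disabled channels below reuse it so the swizzle stays well-defined.
    */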
1474   for (int i = 0; i < 4; i++) {
1475      if (dst.writemask & (1 << i)) {
1476	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1477	 break;
1478      }
1479   }
1480
1481   /* Swizzle a small RHS vector into the channels being written.
1482    *
1483    * glsl ir treats write_mask as dictating how many channels are
1484    * present on the RHS while in our instructions we need to make
1485    * those channels appear in the slots of the vec4 they're written to.
1486    */
1487   for (int i = 0; i < 4; i++) {
1488      if (dst.writemask & (1 << i))
1489	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1490      else
1491	 swizzles[i] = first_enabled_chan;
1492   }
1493   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1494			      swizzles[2], swizzles[3]);
1495
1496   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1497      return;
1498   }
1499
1500   if (ir->condition) {
1501      emit_bool_to_cond_code(ir->condition);
1502   }
1503
1504   for (i = 0; i < type_size(ir->lhs->type); i++) {
1505      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
1506
1507      if (ir->condition)
1508	 inst->predicate = BRW_PREDICATE_NORMAL;
1509
1510      dst.reg_offset++;
1511      src.reg_offset++;
1512   }
1513}
1514
1515void
1516vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1517{
1518   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1519      foreach_list(node, &ir->components) {
1520	 ir_constant *field_value = (ir_constant *)node;
1521
1522	 emit_constant_values(dst, field_value);
1523      }
1524      return;
1525   }
1526
1527   if (ir->type->is_array()) {
1528      for (unsigned int i = 0; i < ir->type->length; i++) {
1529	 emit_constant_values(dst, ir->array_elements[i]);
1530      }
1531      return;
1532   }
1533
1534   if (ir->type->is_matrix()) {
1535      for (int i = 0; i < ir->type->matrix_columns; i++) {
1536	 for (int j = 0; j < ir->type->vector_elements; j++) {
1537	    dst->writemask = 1 << j;
1538	    dst->type = BRW_REGISTER_TYPE_F;
1539
1540	    emit(BRW_OPCODE_MOV, *dst,
1541		 src_reg(ir->value.f[i * ir->type->vector_elements + j]));
1542	 }
1543	 dst->reg_offset++;
1544      }
1545      return;
1546   }
1547
1548   for (int i = 0; i < ir->type->vector_elements; i++) {
1549      dst->writemask = 1 << i;
1550      dst->type = brw_type_for_base_type(ir->type);
1551
1552      switch (ir->type->base_type) {
1553      case GLSL_TYPE_FLOAT:
1554	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.f[i]));
1555	 break;
1556      case GLSL_TYPE_INT:
1557	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.i[i]));
1558	 break;
1559      case GLSL_TYPE_UINT:
1560	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.u[i]));
1561	 break;
1562      case GLSL_TYPE_BOOL:
1563	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.b[i]));
1564	 break;
1565      default:
1566	 assert(!"Non-float/uint/int/bool constant");
1567	 break;
1568      }
1569   }
1570   dst->reg_offset++;
1571}
1572
1573void
1574vec4_visitor::visit(ir_constant *ir)
1575{
1576   dst_reg dst = dst_reg(this, ir->type);
1577   this->result = src_reg(dst);
1578
1579   emit_constant_values(&dst, ir);
1580}
1581
1582void
1583vec4_visitor::visit(ir_call *ir)
1584{
1585   assert(!"not reached");
1586}
1587
1588void
1589vec4_visitor::visit(ir_texture *ir)
1590{
1591   /* FINISHME: Implement vertex texturing.
1592    *
1593    * With 0 vertex samplers available, the linker will reject
1594    * programs that do vertex texturing, but after our visitor has
1595    * run.
1596    */
1597}
1598
1599void
1600vec4_visitor::visit(ir_return *ir)
1601{
1602   assert(!"not reached");
1603}
1604
1605void
1606vec4_visitor::visit(ir_discard *ir)
1607{
1608   assert(!"not reached");
1609}
1610
1611void
1612vec4_visitor::visit(ir_if *ir)
1613{
1614   /* Don't point the annotation at the if statement, because then it plus
1615    * the then and else blocks get printed.
1616    */
1617   this->base_ir = ir->condition;
1618
1619   if (intel->gen == 6) {
1620      emit_if_gen6(ir);
1621   } else {
1622      emit_bool_to_cond_code(ir->condition);
1623      vec4_instruction *inst = emit(BRW_OPCODE_IF);
1624      inst->predicate = BRW_PREDICATE_NORMAL;
1625   }
1626
1627   visit_instructions(&ir->then_instructions);
1628
1629   if (!ir->else_instructions.is_empty()) {
1630      this->base_ir = ir->condition;
1631      emit(BRW_OPCODE_ELSE);
1632
1633      visit_instructions(&ir->else_instructions);
1634   }
1635
1636   this->base_ir = ir->condition;
1637   emit(BRW_OPCODE_ENDIF);
1638}
1639
1640int
1641vec4_visitor::emit_vue_header_gen4(int header_mrf)
1642{
1643   /* Get the position */
1644   src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1645
1646   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1647   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1648
1649   current_annotation = "NDC";
1650   dst_reg ndc_w = ndc;
1651   ndc_w.writemask = WRITEMASK_W;
1652   src_reg pos_w = pos;
1653   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1654   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1655
1656   dst_reg ndc_xyz = ndc;
1657   ndc_xyz.writemask = WRITEMASK_XYZ;
1658
1659   emit(BRW_OPCODE_MUL, ndc_xyz, pos, src_reg(ndc_w));
1660
1661   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1662       c->key.nr_userclip || brw->has_negative_rhw_bug) {
1663      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1664      GLuint i;
1665
1666      emit(BRW_OPCODE_MOV, header1, 0u);
1667
1668      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1669	 assert(!"finishme: psiz");
1670	 src_reg psiz;
1671
1672	 header1.writemask = WRITEMASK_W;
1673	 emit(BRW_OPCODE_MUL, header1, psiz, 1u << 11);
1674	 emit(BRW_OPCODE_AND, header1, src_reg(header1), 0x7ff << 8);
1675      }
1676
1677      for (i = 0; i < c->key.nr_userclip; i++) {
1678	 vec4_instruction *inst;
1679
1680	 inst = emit(BRW_OPCODE_DP4, dst_reg(brw_null_reg()),
1681		     pos, src_reg(c->userplane[i]));
1682	 inst->conditional_mod = BRW_CONDITIONAL_L;
1683
1684	 inst = emit(BRW_OPCODE_OR, header1, src_reg(header1), 1u << i);
1685	 inst->predicate = BRW_PREDICATE_NORMAL;
1686      }
1687
1688      /* i965 clipping workaround:
1689       * 1) Test for -ve rhw
1690       * 2) If set,
1691       *      set ndc = (0,0,0,0)
1692       *      set ucp[6] = 1
1693       *
1694       * Later, clipping will detect ucp[6] and ensure the primitive is
1695       * clipped against all fixed planes.
1696       */
1697      if (brw->has_negative_rhw_bug) {
1698#if 0
1699	 /* FINISHME */
1700	 brw_CMP(p,
1701		 vec8(brw_null_reg()),
1702		 BRW_CONDITIONAL_L,
1703		 brw_swizzle1(ndc, 3),
1704		 brw_imm_f(0));
1705
1706	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1707	 brw_MOV(p, ndc, brw_imm_f(0));
1708	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1709#endif
1710      }
1711
1712      header1.writemask = WRITEMASK_XYZW;
1713      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(header1));
1714   } else {
1715      emit(BRW_OPCODE_MOV, retype(brw_message_reg(header_mrf++),
1716				  BRW_REGISTER_TYPE_UD), 0u);
1717   }
1718
1719   if (intel->gen == 5) {
1720      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1721       * dword 0-3 (m1) of the header is indices, point width, clip flags.
1722       * dword 4-7 (m2) is the ndc position (set above)
1723       * dword 8-11 (m3) of the vertex header is the 4D space position
1724       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1725       * m6 is a pad so that the vertex element data is aligned
1726       * m7 is the first vertex data we fill.
1727       */
1728      current_annotation = "NDC";
1729      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
1730
1731      current_annotation = "gl_Position";
1732      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
1733
1734      /* user clip distance. */
1735      header_mrf += 2;
1736
1737      /* Pad so that vertex element data is aligned. */
1738      header_mrf++;
1739   } else {
1740      /* There are 8 dwords in VUE header pre-Ironlake:
1741       * dword 0-3 (m1) is indices, point width, clip flags.
1742       * dword 4-7 (m2) is ndc position (set above)
1743       *
1744       * dword 8-11 (m3) is the first vertex data.
1745       */
1746      current_annotation = "NDC";
1747      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
1748
1749      current_annotation = "gl_Position";
1750      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
1751   }
1752
1753   return header_mrf;
1754}
1755
1756int
1757vec4_visitor::emit_vue_header_gen6(int header_mrf)
1758{
1759   struct brw_reg reg;
1760
1761   /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1762    * dword 0-3 (m2) of the header is indices, point width, clip flags.
1763    * dword 4-7 (m3) is the 4D space position
1764    * dword 8-15 (m4,m5) of the vertex header is the user clip distance if
1765    * enabled.
1766    *
1767    * m4 or 6 is the first vertex element data we fill.
1768    */
1769
1770   current_annotation = "indices, point width, clip flags";
1771   reg = brw_message_reg(header_mrf++);
1772   emit(BRW_OPCODE_MOV, retype(reg, BRW_REGISTER_TYPE_D), src_reg(0));
1773   if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1774      emit(BRW_OPCODE_MOV, brw_writemask(reg, WRITEMASK_W),
1775	   src_reg(output_reg[VERT_RESULT_PSIZ]));
1776   }
1777
1778   current_annotation = "gl_Position";
1779   emit(BRW_OPCODE_MOV,
1780	brw_message_reg(header_mrf++), src_reg(output_reg[VERT_RESULT_HPOS]));
1781
1782   current_annotation = "user clip distances";
1783   if (c->key.nr_userclip) {
1784      for (int i = 0; i < c->key.nr_userclip; i++) {
1785	 struct brw_reg m;
1786	 if (i < 4)
1787	    m = brw_message_reg(header_mrf);
1788	 else
1789	    m = brw_message_reg(header_mrf + 1);
1790
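	 /* Each clip distance is the dot product of the position with one of
	  * the user clip planes, packed a channel at a time into the two
	  * clip-distance message registers.
	  */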
1791	 emit(BRW_OPCODE_DP4,
1792	      dst_reg(brw_writemask(m, 1 << (i & 3))),
1793	      src_reg(output_reg[VERT_RESULT_HPOS]), src_reg(c->userplane[i]));
1794      }
1795      header_mrf += 2;
1796   }
1797
1798   current_annotation = NULL;
1799
1800   return header_mrf;
1801}
1802
1803static int
1804align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1805{
1806   struct intel_context *intel = &brw->intel;
1807
1808   if (intel->gen >= 6) {
1809      /* URB data written (does not include the message header reg) must
1810       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1811       * section 5.4.3.2.2: URB_INTERLEAVED.
1812       *
1813       * URB entries are allocated on a multiple of 1024 bits, so an
1814       * extra 128 bits written here to make the end align to 256 is
1815       * no problem.
1816       */
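      /* mlen includes the one header register, so the data length (mlen - 1)
       * is even exactly when mlen is odd.
       */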
1817      if ((mlen % 2) != 1)
1818	 mlen++;
1819   }
1820
1821   return mlen;
1822}
1823
1824/**
1825 * Generates the VUE payload plus the 1 or 2 URB write instructions to
1826 * complete the VS thread.
1827 *
1828 * The VUE layout is documented in Volume 2a.
1829 */
1830void
1831vec4_visitor::emit_urb_writes()
1832{
1833   /* MRF 0 is reserved for the debugger, so start with message header
1834    * in MRF 1.
1835    */
1836   int base_mrf = 1;
1837   int mrf = base_mrf;
1838   int urb_entry_size;
1839   uint64_t outputs_remaining = c->prog_data.outputs_written;
1840   /* In the process of generating our URB write message contents, we
1841    * may need to unspill a register or load from an array.  Those
1842    * reads would use MRFs 14-15.
1843    */
1844   int max_usable_mrf = 13;
1845
1846   /* FINISHME: edgeflag */
1847
   /* The first MRF is the g0-based message header containing the URB
    * handles and such, which VS_OPCODE_URB_WRITE sets up implicitly.
    */
1851   mrf++;
1852
1853   if (intel->gen >= 6) {
1854      mrf = emit_vue_header_gen6(mrf);
1855   } else {
1856      mrf = emit_vue_header_gen4(mrf);
1857   }
1858
1859   /* Set up the VUE data for the first URB write */
1860   int attr;
1861   for (attr = 0; attr < VERT_RESULT_MAX; attr++) {
1862      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1863	 continue;
1864
1865      outputs_remaining &= ~BITFIELD64_BIT(attr);
1866
1867      /* This is set up in the VUE header. */
1868      if (attr == VERT_RESULT_HPOS)
1869	 continue;
1870
1871      /* This is loaded into the VUE header, and thus doesn't occupy
1872       * an attribute slot.
1873       */
1874      if (attr == VERT_RESULT_PSIZ)
1875	 continue;
1876
1877      vec4_instruction *inst = emit(BRW_OPCODE_MOV, brw_message_reg(mrf++),
1878				    src_reg(output_reg[attr]));
1879
1880      if ((attr == VERT_RESULT_COL0 ||
1881	   attr == VERT_RESULT_COL1 ||
1882	   attr == VERT_RESULT_BFC0 ||
1883	   attr == VERT_RESULT_BFC1) &&
1884	  c->key.clamp_vertex_color) {
1885	 inst->saturate = true;
1886      }
1887
      /* If we just filled the last usable MRF (13), we can't fit anything
       * more into this URB WRITE.  With base_mrf 1, stopping here leaves
       * an even number of payload registers after the header, which meets
       * gen6's requirements for length alignment.
       */
1893      if (mrf > max_usable_mrf) {
1894	 attr++;
1895	 break;
1896      }
1897   }
1898
1899   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
1900   inst->base_mrf = base_mrf;
1901   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1902   inst->eot = !outputs_remaining;
1903
1904   urb_entry_size = mrf - base_mrf;
1905
1906   /* Optional second URB write */
1907   if (outputs_remaining) {
1908      mrf = base_mrf + 1;
1909
1910      for (; attr < VERT_RESULT_MAX; attr++) {
1911	 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1912	    continue;
1913
1914	 assert(mrf < max_usable_mrf);
1915
1916	 emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr]));
1917      }
1918
1919      inst = emit(VS_OPCODE_URB_WRITE);
1920      inst->base_mrf = base_mrf;
1921      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1922      inst->eot = true;
1923      /* URB destination offset.  In the previous write, we got MRFs
1924       * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
1925       * URB row increments, and each of our MRFs is half of one of
1926       * those, since we're doing interleaved writes.
1927       */
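      /* With base_mrf 1 and max_usable_mrf 13, that's (13 - 1) / 2 = 6
       * rows already written.
       */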
1928      inst->offset = (max_usable_mrf - base_mrf) / 2;
1929
1930      urb_entry_size += mrf - base_mrf;
1931   }
1932
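   /* prog_data.urb_entry_size ends up in units of 8 message registers on
    * gen6 and 4 on earlier generations, so round up before converting.
    */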
1933   if (intel->gen == 6)
1934      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
1935   else
1936      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
1937}
1938
1939src_reg
1940vec4_visitor::get_scratch_offset(vec4_instruction *inst,
1941				 src_reg *reladdr, int reg_offset)
1942{
1943   /* Because we store the values to scratch interleaved like our
1944    * vertex data, we need to scale the vec4 index by 2.
1945    */
1946   int message_header_scale = 2;
1947
1948   /* Pre-gen6, the message header uses byte offsets instead of vec4
1949    * (16-byte) offset units.
1950    */
1951   if (intel->gen < 6)
1952      message_header_scale *= 16;
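   /* For example, a constant reg_offset of 3 becomes 3 * 2 = 6 vec4 slots
    * on gen6+, or 3 * 32 = 96 bytes pre-gen6.
    */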
1953
1954   if (reladdr) {
1955      src_reg index = src_reg(this, glsl_type::int_type);
1956
1957      vec4_instruction *add = emit(BRW_OPCODE_ADD,
1958				   dst_reg(index),
1959				   *reladdr,
1960				   src_reg(reg_offset));
1961      /* Move our new instruction from the tail to its correct place. */
1962      add->remove();
1963      inst->insert_before(add);
1964
1965      vec4_instruction *mul = emit(BRW_OPCODE_MUL, dst_reg(index),
1966				   index, src_reg(message_header_scale));
1967      mul->remove();
1968      inst->insert_before(mul);
1969
1970      return index;
1971   } else {
1972      return src_reg(reg_offset * message_header_scale);
1973   }
1974}
1975
1976src_reg
1977vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
1978				       src_reg *reladdr, int reg_offset)
1979{
1980   if (reladdr) {
1981      src_reg index = src_reg(this, glsl_type::int_type);
1982
1983      vec4_instruction *add = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD,
1984							    dst_reg(index),
1985							    *reladdr,
1986							    src_reg(reg_offset));
1987      add->ir = inst->ir;
1988      add->annotation = inst->annotation;
1989      inst->insert_before(add);
1990
1991      /* Pre-gen6, the message header uses byte offsets instead of vec4
1992       * (16-byte) offset units.
1993       */
1994      if (intel->gen < 6) {
1995	 vec4_instruction *mul = new(mem_ctx) vec4_instruction(this,
1996							       BRW_OPCODE_MUL,
1997							       dst_reg(index),
1998							       index,
1999							       src_reg(16));
2000	 mul->ir = inst->ir;
2001	 mul->annotation = inst->annotation;
2002	 inst->insert_before(mul);
2003      }
2004
2005      return index;
2006   } else {
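      /* A constant offset is passed directly: pre-gen6 messages take a
       * byte offset (16 bytes per vec4), while gen6+ takes vec4 units.
       */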
2007      int message_header_scale = intel->gen < 6 ? 16 : 1;
2008      return src_reg(reg_offset * message_header_scale);
2009   }
2010}
2011
2012/**
2013 * Emits an instruction before @inst to load the value named by @orig_src
2014 * from scratch space at @base_offset to @temp.
2015 */
2016void
2017vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2018				dst_reg temp, src_reg orig_src,
2019				int base_offset)
2020{
2021   int reg_offset = base_offset + orig_src.reg_offset;
2022   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2023
2024   vec4_instruction *scratch_read_inst = emit(VS_OPCODE_SCRATCH_READ,
2025					      temp, index);
2026
2027   scratch_read_inst->base_mrf = 14;
2028   scratch_read_inst->mlen = 1;
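   /* This read builds its header in MRF 14, one of the two MRFs that
    * emit_urb_writes() keeps clear of URB payload (see max_usable_mrf).
    */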
2029   /* Move our instruction from the tail to its correct place. */
2030   scratch_read_inst->remove();
2031   inst->insert_before(scratch_read_inst);
2032}
2033
2034/**
2035 * Emits an instruction after @inst to store the value to be written
2036 * to @orig_dst to scratch space at @base_offset, from @temp.
2037 */
2038void
2039vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2040				 src_reg temp, dst_reg orig_dst,
2041				 int base_offset)
2042{
2043   int reg_offset = base_offset + orig_dst.reg_offset;
2044   src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2045
2046   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2047				       orig_dst.writemask));
2048   vec4_instruction *scratch_write_inst = emit(VS_OPCODE_SCRATCH_WRITE,
2049					       dst, temp, index);
2050   scratch_write_inst->base_mrf = 13;
2051   scratch_write_inst->mlen = 2;
2052   scratch_write_inst->predicate = inst->predicate;
2053   /* Move our instruction from the tail to its correct place. */
2054   scratch_write_inst->remove();
2055   inst->insert_after(scratch_write_inst);
2056}
2057
2058/**
2059 * We can't generally support array access in GRF space, because a
2060 * single instruction's destination can only span 2 contiguous
2061 * registers.  So, we send all GRF arrays that get variable index
2062 * access to scratch space.
2063 */
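/* For example (illustrative GLSL), a temporary array indexed with a
 * dynamic value:
 *
 *    vec4 verts[4];
 *    ...
 *    gl_Position = verts[i];
 *
 * forces verts[] out to scratch space, and each access becomes a
 * scratch read or write emitted below.
 */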
2064void
2065vec4_visitor::move_grf_array_access_to_scratch()
2066{
2067   int scratch_loc[this->virtual_grf_count];
2068
2069   for (int i = 0; i < this->virtual_grf_count; i++) {
2070      scratch_loc[i] = -1;
2071   }
2072
2073   /* First, calculate the set of virtual GRFs that need to be punted
2074    * to scratch due to having any array access on them, and where in
2075    * scratch.
2076    */
2077   foreach_list(node, &this->instructions) {
2078      vec4_instruction *inst = (vec4_instruction *)node;
2079
2080      if (inst->dst.file == GRF && inst->dst.reladdr &&
2081	  scratch_loc[inst->dst.reg] == -1) {
2082	 scratch_loc[inst->dst.reg] = c->last_scratch;
2083	 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2084      }
2085
2086      for (int i = 0 ; i < 3; i++) {
2087	 src_reg *src = &inst->src[i];
2088
2089	 if (src->file == GRF && src->reladdr &&
2090	     scratch_loc[src->reg] == -1) {
2091	    scratch_loc[src->reg] = c->last_scratch;
2092	    c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2093	 }
2094      }
2095   }
2096
2097   /* Now, for anything that will be accessed through scratch, rewrite
2098    * it to load/store.  Note that this is a _safe list walk, because
2099    * we may generate a new scratch_write instruction after the one
2100    * we're processing.
2101    */
2102   foreach_list_safe(node, &this->instructions) {
2103      vec4_instruction *inst = (vec4_instruction *)node;
2104
      /* Set up the annotation tracking for newly generated instructions. */
2106      base_ir = inst->ir;
2107      current_annotation = inst->annotation;
2108
2109      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2110	 src_reg temp = src_reg(this, glsl_type::vec4_type);
2111
2112	 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2113
2114	 inst->dst.file = temp.file;
2115	 inst->dst.reg = temp.reg;
2116	 inst->dst.reg_offset = temp.reg_offset;
2117	 inst->dst.reladdr = NULL;
2118      }
2119
2120      for (int i = 0 ; i < 3; i++) {
2121	 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2122	    continue;
2123
2124	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2125
2126	 emit_scratch_read(inst, temp, inst->src[i],
2127			   scratch_loc[inst->src[i].reg]);
2128
2129	 inst->src[i].file = temp.file;
2130	 inst->src[i].reg = temp.reg;
2131	 inst->src[i].reg_offset = temp.reg_offset;
2132	 inst->src[i].reladdr = NULL;
2133      }
2134   }
2135}
2136
2137/**
2138 * Emits an instruction before @inst to load the value named by @orig_src
2139 * from the pull constant buffer (surface) at @base_offset to @temp.
2140 */
2141void
2142vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2143				      dst_reg temp, src_reg orig_src,
2144				      int base_offset)
2145{
2146   int reg_offset = base_offset + orig_src.reg_offset;
2147   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2148   vec4_instruction *load;
2149
2150   load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2151					temp, index);
2152   load->annotation = inst->annotation;
2153   load->ir = inst->ir;
2154   load->base_mrf = 14;
2155   load->mlen = 1;
2156   inst->insert_before(load);
2157}
2158
2159/**
2160 * Implements array access of uniforms by inserting a
2161 * PULL_CONSTANT_LOAD instruction.
2162 *
2163 * Unlike temporary GRF array access (where we don't support it due to
2164 * the difficulty of doing relative addressing on instruction
2165 * destinations), we could potentially do array access of uniforms
2166 * that were loaded in GRF space as push constants.  In real-world
2167 * usage we've seen, though, the arrays being used are always larger
2168 * than we could load as push constants, so just always move all
2169 * uniform array access out to a pull constant buffer.
2170 */
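/* For example (illustrative GLSL), a uniform array indexed with a
 * dynamic value:
 *
 *    uniform vec4 colors[64];
 *    ...
 *    gl_FrontColor = colors[index];
 *
 * gets its storage copied to pull_param, and the access becomes a
 * VS_OPCODE_PULL_CONSTANT_LOAD from the pull constant buffer.
 */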
2171void
2172vec4_visitor::move_uniform_array_access_to_pull_constants()
2173{
2174   int pull_constant_loc[this->uniforms];
2175
2176   for (int i = 0; i < this->uniforms; i++) {
2177      pull_constant_loc[i] = -1;
2178   }
2179
2180   /* Walk through and find array access of uniforms.  Put a copy of that
2181    * uniform in the pull constant buffer.
2182    *
2183    * Note that we don't move constant-indexed accesses to arrays.  No
2184    * testing has been done of the performance impact of this choice.
2185    */
2186   foreach_list_safe(node, &this->instructions) {
2187      vec4_instruction *inst = (vec4_instruction *)node;
2188
2189      for (int i = 0 ; i < 3; i++) {
2190	 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2191	    continue;
2192
2193	 int uniform = inst->src[i].reg;
2194
2195	 /* If this array isn't already present in the pull constant buffer,
2196	  * add it.
2197	  */
2198	 if (pull_constant_loc[uniform] == -1) {
2199	    const float **values = &prog_data->param[uniform * 4];
2200
2201	    pull_constant_loc[uniform] = prog_data->nr_pull_params;
2202
2203	    for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2204	       prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2205	    }
2206	 }
2207
	 /* Set up the annotation tracking for newly generated instructions. */
2209	 base_ir = inst->ir;
2210	 current_annotation = inst->annotation;
2211
2212	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2213
2214	 emit_pull_constant_load(inst, temp, inst->src[i],
2215				 pull_constant_loc[uniform]);
2216
2217	 inst->src[i].file = temp.file;
2218	 inst->src[i].reg = temp.reg;
2219	 inst->src[i].reg_offset = temp.reg_offset;
2220	 inst->src[i].reladdr = NULL;
2221      }
2222   }
2223
2224   /* Now there are no accesses of the UNIFORM file with a reladdr, so
2225    * no need to track them as larger-than-vec4 objects.  This will be
2226    * relied on in cutting out unused uniform vectors from push
2227    * constants.
2228    */
2229   split_uniform_registers();
2230}
2231
2232vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2233			   struct gl_shader_program *prog,
2234			   struct brw_shader *shader)
2235{
2236   this->c = c;
2237   this->p = &c->func;
2238   this->brw = p->brw;
2239   this->intel = &brw->intel;
2240   this->ctx = &intel->ctx;
2241   this->prog = prog;
2242   this->shader = shader;
2243
2244   this->mem_ctx = ralloc_context(NULL);
2245   this->failed = false;
2246
2247   this->base_ir = NULL;
2248   this->current_annotation = NULL;
2249
2251   this->vp = prog->VertexProgram;
2252   this->prog_data = &c->prog_data;
2253
2254   this->variable_ht = hash_table_ctor(0,
2255				       hash_table_pointer_hash,
2256				       hash_table_pointer_compare);
2257
2258   this->virtual_grf_def = NULL;
2259   this->virtual_grf_use = NULL;
2260   this->virtual_grf_sizes = NULL;
2261   this->virtual_grf_count = 0;
2262   this->virtual_grf_array_size = 0;
2263   this->live_intervals_valid = false;
2264
   this->uniforms = 0;
2270}
2271
2272vec4_visitor::~vec4_visitor()
2273{
2274   ralloc_free(this->mem_ctx);
2275   hash_table_dtor(this->variable_ht);
2276}
2277
2278
2279void
2280vec4_visitor::fail(const char *format, ...)
2281{
2282   va_list va;
2283   char *msg;
2284
2285   if (failed)
2286      return;
2287
2288   failed = true;
2289
2290   va_start(va, format);
2291   msg = ralloc_vasprintf(mem_ctx, format, va);
2292   va_end(va);
2293   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2294
2295   this->fail_msg = msg;
2296
2297   if (INTEL_DEBUG & DEBUG_VS) {
2298      fprintf(stderr, "%s",  msg);
2299   }
2300}
2301
2302} /* namespace brw */
2303