brw_vec4_visitor.cpp revision 72cfc6f3778d8297e52c254a5861a88eb62e4d67
1/*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_vec4.h"
25extern "C" {
26#include "main/macros.h"
27#include "program/prog_parameter.h"
28}
29
30namespace brw {
31
32src_reg::src_reg(dst_reg reg)
33{
34   init();
35
36   this->file = reg.file;
37   this->reg = reg.reg;
38   this->reg_offset = reg.reg_offset;
39   this->type = reg.type;
40   this->reladdr = reg.reladdr;
41   this->fixed_hw_reg = reg.fixed_hw_reg;
42
43   int swizzles[4];
44   int next_chan = 0;
45   int last = 0;
46
47   for (int i = 0; i < 4; i++) {
48      if (!(reg.writemask & (1 << i)))
49	 continue;
50
51      swizzles[next_chan++] = last = i;
52   }
53
54   for (; next_chan < 4; next_chan++) {
55      swizzles[next_chan] = last;
56   }
57
58   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59				swizzles[2], swizzles[3]);
60}
61
62dst_reg::dst_reg(src_reg reg)
63{
64   init();
65
66   this->file = reg.file;
67   this->reg = reg.reg;
68   this->reg_offset = reg.reg_offset;
69   this->type = reg.type;
70   this->writemask = WRITEMASK_XYZW;
71   this->reladdr = reg.reladdr;
72   this->fixed_hw_reg = reg.fixed_hw_reg;
73}
74
75vec4_instruction::vec4_instruction(vec4_visitor *v,
76				   enum opcode opcode, dst_reg dst,
77				   src_reg src0, src_reg src1, src_reg src2)
78{
79   this->opcode = opcode;
80   this->dst = dst;
81   this->src[0] = src0;
82   this->src[1] = src1;
83   this->src[2] = src2;
84   this->ir = v->base_ir;
85   this->annotation = v->current_annotation;
86}
87
88vec4_instruction *
89vec4_visitor::emit(vec4_instruction *inst)
90{
91   this->instructions.push_tail(inst);
92
93   return inst;
94}
95
96vec4_instruction *
97vec4_visitor::emit(enum opcode opcode, dst_reg dst,
98		   src_reg src0, src_reg src1, src_reg src2)
99{
100   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
101					     src0, src1, src2));
102}
103
104
105vec4_instruction *
106vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
107{
108   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
109}
110
111vec4_instruction *
112vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
113{
114   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
115}
116
117vec4_instruction *
118vec4_visitor::emit(enum opcode opcode)
119{
120   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
121}
122
123void
124vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
125{
126   static enum opcode dot_opcodes[] = {
127      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
128   };
129
130   emit(dot_opcodes[elements - 2], dst, src0, src1);
131}
132
133void
134vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
135{
136   /* The gen6 math instruction ignores the source modifiers --
137    * swizzle, abs, negate, and at least some parts of the register
138    * region description.
139    *
140    * While it would seem that this MOV could be avoided at this point
141    * in the case that the swizzle is matched up with the destination
142    * writemask, note that uniform packing and register allocation
143    * could rearrange our swizzle, so let's leave this matter up to
144    * copy propagation later.
145    */
146   src_reg temp_src = src_reg(this, glsl_type::vec4_type);
147   emit(BRW_OPCODE_MOV, dst_reg(temp_src), src);
148
149   if (dst.writemask != WRITEMASK_XYZW) {
150      /* The gen6 math instruction must be align1, so we can't do
151       * writemasks.
152       */
153      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
154
155      emit(opcode, temp_dst, temp_src);
156
157      emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
158   } else {
159      emit(opcode, dst, temp_src);
160   }
161}
162
163void
164vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
165{
166   vec4_instruction *inst = emit(opcode, dst, src);
167   inst->base_mrf = 1;
168   inst->mlen = 1;
169}
170
171void
172vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
173{
174   switch (opcode) {
175   case SHADER_OPCODE_RCP:
176   case SHADER_OPCODE_RSQ:
177   case SHADER_OPCODE_SQRT:
178   case SHADER_OPCODE_EXP2:
179   case SHADER_OPCODE_LOG2:
180   case SHADER_OPCODE_SIN:
181   case SHADER_OPCODE_COS:
182      break;
183   default:
184      assert(!"not reached: bad math opcode");
185      return;
186   }
187
188   if (intel->gen >= 6) {
189      return emit_math1_gen6(opcode, dst, src);
190   } else {
191      return emit_math1_gen4(opcode, dst, src);
192   }
193}
194
195void
196vec4_visitor::emit_math2_gen6(enum opcode opcode,
197			      dst_reg dst, src_reg src0, src_reg src1)
198{
199   src_reg expanded;
200
201   /* The gen6 math instruction ignores the source modifiers --
202    * swizzle, abs, negate, and at least some parts of the register
203    * region description.  Move the sources to temporaries to make it
204    * generally work.
205    */
206
207   expanded = src_reg(this, glsl_type::vec4_type);
208   emit(BRW_OPCODE_MOV, dst_reg(expanded), src0);
209   src0 = expanded;
210
211   expanded = src_reg(this, glsl_type::vec4_type);
212   emit(BRW_OPCODE_MOV, dst_reg(expanded), src1);
213   src1 = expanded;
214
215   if (dst.writemask != WRITEMASK_XYZW) {
216      /* The gen6 math instruction must be align1, so we can't do
217       * writemasks.
218       */
219      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
220
221      emit(opcode, temp_dst, src0, src1);
222
223      emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
224   } else {
225      emit(opcode, dst, src0, src1);
226   }
227}
228
229void
230vec4_visitor::emit_math2_gen4(enum opcode opcode,
231			      dst_reg dst, src_reg src0, src_reg src1)
232{
233   vec4_instruction *inst = emit(opcode, dst, src0, src1);
234   inst->base_mrf = 1;
235   inst->mlen = 2;
236}
237
238void
239vec4_visitor::emit_math(enum opcode opcode,
240			dst_reg dst, src_reg src0, src_reg src1)
241{
242   assert(opcode == SHADER_OPCODE_POW);
243
244   if (intel->gen >= 6) {
245      return emit_math2_gen6(opcode, dst, src0, src1);
246   } else {
247      return emit_math2_gen4(opcode, dst, src0, src1);
248   }
249}
250
251void
252vec4_visitor::visit_instructions(const exec_list *list)
253{
254   foreach_list(node, list) {
255      ir_instruction *ir = (ir_instruction *)node;
256
257      base_ir = ir;
258      ir->accept(this);
259   }
260}
261
262
263static int
264type_size(const struct glsl_type *type)
265{
266   unsigned int i;
267   int size;
268
269   switch (type->base_type) {
270   case GLSL_TYPE_UINT:
271   case GLSL_TYPE_INT:
272   case GLSL_TYPE_FLOAT:
273   case GLSL_TYPE_BOOL:
274      if (type->is_matrix()) {
275	 return type->matrix_columns;
276      } else {
277	 /* Regardless of size of vector, it gets a vec4. This is bad
278	  * packing for things like floats, but otherwise arrays become a
279	  * mess.  Hopefully a later pass over the code can pack scalars
280	  * down if appropriate.
281	  */
282	 return 1;
283      }
284   case GLSL_TYPE_ARRAY:
285      assert(type->length > 0);
286      return type_size(type->fields.array) * type->length;
287   case GLSL_TYPE_STRUCT:
288      size = 0;
289      for (i = 0; i < type->length; i++) {
290	 size += type_size(type->fields.structure[i].type);
291      }
292      return size;
293   case GLSL_TYPE_SAMPLER:
294      /* Samplers take up one slot in UNIFORMS[], but they're baked in
295       * at link time.
296       */
297      return 1;
298   default:
299      assert(0);
300      return 0;
301   }
302}
303
304int
305vec4_visitor::virtual_grf_alloc(int size)
306{
307   if (virtual_grf_array_size <= virtual_grf_count) {
308      if (virtual_grf_array_size == 0)
309	 virtual_grf_array_size = 16;
310      else
311	 virtual_grf_array_size *= 2;
312      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
313				   virtual_grf_array_size);
314   }
315   virtual_grf_sizes[virtual_grf_count] = size;
316   return virtual_grf_count++;
317}
318
319src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
320{
321   init();
322
323   this->file = GRF;
324   this->reg = v->virtual_grf_alloc(type_size(type));
325
326   if (type->is_array() || type->is_record()) {
327      this->swizzle = BRW_SWIZZLE_NOOP;
328   } else {
329      this->swizzle = swizzle_for_size(type->vector_elements);
330   }
331
332   this->type = brw_type_for_base_type(type);
333}
334
335dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
336{
337   init();
338
339   this->file = GRF;
340   this->reg = v->virtual_grf_alloc(type_size(type));
341
342   if (type->is_array() || type->is_record()) {
343      this->writemask = WRITEMASK_XYZW;
344   } else {
345      this->writemask = (1 << type->vector_elements) - 1;
346   }
347
348   this->type = brw_type_for_base_type(type);
349}
350
351/* Our support for uniforms is piggy-backed on the struct
352 * gl_fragment_program, because that's where the values actually
353 * get stored, rather than in some global gl_shader_program uniform
354 * store.
355 */
356int
357vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
358{
359   unsigned int offset = 0;
360   float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
361
362   if (type->is_matrix()) {
363      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
364							type->vector_elements,
365							1);
366
367      for (unsigned int i = 0; i < type->matrix_columns; i++) {
368	 offset += setup_uniform_values(loc + offset, column);
369      }
370
371      return offset;
372   }
373
374   switch (type->base_type) {
375   case GLSL_TYPE_FLOAT:
376   case GLSL_TYPE_UINT:
377   case GLSL_TYPE_INT:
378   case GLSL_TYPE_BOOL:
379      for (unsigned int i = 0; i < type->vector_elements; i++) {
380	 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
381      }
382
383      /* Set up pad elements to get things aligned to a vec4 boundary. */
384      for (unsigned int i = type->vector_elements; i < 4; i++) {
385	 static float zero = 0;
386
387	 c->prog_data.param[this->uniforms * 4 + i] = &zero;
388      }
389
390      /* Track the size of this uniform vector, for future packing of
391       * uniforms.
392       */
393      this->uniform_vector_size[this->uniforms] = type->vector_elements;
394      this->uniforms++;
395
396      return 1;
397
398   case GLSL_TYPE_STRUCT:
399      for (unsigned int i = 0; i < type->length; i++) {
400	 offset += setup_uniform_values(loc + offset,
401					type->fields.structure[i].type);
402      }
403      return offset;
404
405   case GLSL_TYPE_ARRAY:
406      for (unsigned int i = 0; i < type->length; i++) {
407	 offset += setup_uniform_values(loc + offset, type->fields.array);
408      }
409      return offset;
410
411   case GLSL_TYPE_SAMPLER:
412      /* The sampler takes up a slot, but we don't use any values from it. */
413      return 1;
414
415   default:
416      assert(!"not reached");
417      return 0;
418   }
419}
420
421/* Our support for builtin uniforms is even scarier than non-builtin.
422 * It sits on top of the PROG_STATE_VAR parameters that are
423 * automatically updated from GL context state.
424 */
425void
426vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
427{
428   const ir_state_slot *const slots = ir->state_slots;
429   assert(ir->state_slots != NULL);
430
431   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
432      /* This state reference has already been setup by ir_to_mesa,
433       * but we'll get the same index back here.  We can reference
434       * ParameterValues directly, since unlike brw_fs.cpp, we never
435       * add new state references during compile.
436       */
437      int index = _mesa_add_state_reference(this->vp->Base.Parameters,
438					    (gl_state_index *)slots[i].tokens);
439      float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
440
441      this->uniform_vector_size[this->uniforms] = 0;
442      /* Add each of the unique swizzled channels of the element.
443       * This will end up matching the size of the glsl_type of this field.
444       */
445      int last_swiz = -1;
446      for (unsigned int j = 0; j < 4; j++) {
447	 int swiz = GET_SWZ(slots[i].swizzle, j);
448	 last_swiz = swiz;
449
450	 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
451	 if (swiz <= last_swiz)
452	    this->uniform_vector_size[this->uniforms]++;
453      }
454      this->uniforms++;
455   }
456}
457
458dst_reg *
459vec4_visitor::variable_storage(ir_variable *var)
460{
461   return (dst_reg *)hash_table_find(this->variable_ht, var);
462}
463
464void
465vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
466{
467   ir_expression *expr = ir->as_expression();
468
469   if (expr) {
470      src_reg op[2];
471      vec4_instruction *inst;
472
473      assert(expr->get_num_operands() <= 2);
474      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
475	 assert(expr->operands[i]->type->is_scalar());
476
477	 expr->operands[i]->accept(this);
478	 op[i] = this->result;
479      }
480
481      switch (expr->operation) {
482      case ir_unop_logic_not:
483	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], src_reg(1));
484	 inst->conditional_mod = BRW_CONDITIONAL_Z;
485	 break;
486
487      case ir_binop_logic_xor:
488	 inst = emit(BRW_OPCODE_XOR, dst_null_d(), op[0], op[1]);
489	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
490	 break;
491
492      case ir_binop_logic_or:
493	 inst = emit(BRW_OPCODE_OR, dst_null_d(), op[0], op[1]);
494	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
495	 break;
496
497      case ir_binop_logic_and:
498	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], op[1]);
499	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
500	 break;
501
502      case ir_unop_f2b:
503	 if (intel->gen >= 6) {
504	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0.0f));
505	 } else {
506	    inst = emit(BRW_OPCODE_MOV, dst_null_f(), op[0]);
507	 }
508	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
509	 break;
510
511      case ir_unop_i2b:
512	 if (intel->gen >= 6) {
513	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
514	 } else {
515	    inst = emit(BRW_OPCODE_MOV, dst_null_d(), op[0]);
516	 }
517	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
518	 break;
519
520      case ir_binop_greater:
521      case ir_binop_gequal:
522      case ir_binop_less:
523      case ir_binop_lequal:
524      case ir_binop_equal:
525      case ir_binop_all_equal:
526      case ir_binop_nequal:
527      case ir_binop_any_nequal:
528	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
529	 inst->conditional_mod =
530	    brw_conditional_for_comparison(expr->operation);
531	 break;
532
533      default:
534	 assert(!"not reached");
535	 break;
536      }
537      return;
538   }
539
540   ir->accept(this);
541
542   if (intel->gen >= 6) {
543      vec4_instruction *inst = emit(BRW_OPCODE_AND, dst_null_d(),
544			       this->result, src_reg(1));
545      inst->conditional_mod = BRW_CONDITIONAL_NZ;
546   } else {
547      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst_null_d(), this->result);
548      inst->conditional_mod = BRW_CONDITIONAL_NZ;
549   }
550}
551
552/**
553 * Emit a gen6 IF statement with the comparison folded into the IF
554 * instruction.
555 */
556void
557vec4_visitor::emit_if_gen6(ir_if *ir)
558{
559   ir_expression *expr = ir->condition->as_expression();
560
561   if (expr) {
562      src_reg op[2];
563      vec4_instruction *inst;
564      dst_reg temp;
565
566      assert(expr->get_num_operands() <= 2);
567      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
568	 expr->operands[i]->accept(this);
569	 op[i] = this->result;
570      }
571
572      switch (expr->operation) {
573      case ir_unop_logic_not:
574	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
575	 inst->conditional_mod = BRW_CONDITIONAL_Z;
576	 return;
577
578      case ir_binop_logic_xor:
579	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
580	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
581	 return;
582
583      case ir_binop_logic_or:
584	 temp = dst_reg(this, glsl_type::bool_type);
585	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
586	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
587	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
588	 return;
589
590      case ir_binop_logic_and:
591	 temp = dst_reg(this, glsl_type::bool_type);
592	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
593	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
594	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
595	 return;
596
597      case ir_unop_f2b:
598	 inst = emit(BRW_OPCODE_IF, dst_null_f(), op[0], src_reg(0));
599	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
600	 return;
601
602      case ir_unop_i2b:
603	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
604	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
605	 return;
606
607      case ir_binop_greater:
608      case ir_binop_gequal:
609      case ir_binop_less:
610      case ir_binop_lequal:
611      case ir_binop_equal:
612      case ir_binop_nequal:
613	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
614	 inst->conditional_mod =
615	    brw_conditional_for_comparison(expr->operation);
616	 return;
617
618      case ir_binop_all_equal:
619	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
620	 inst->conditional_mod = BRW_CONDITIONAL_Z;
621
622	 inst = emit(BRW_OPCODE_IF);
623	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
624	 return;
625
626      case ir_binop_any_nequal:
627	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
628	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
629
630	 inst = emit(BRW_OPCODE_IF);
631	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
632	 return;
633
634      case ir_unop_any:
635	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
636	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
637
638	 inst = emit(BRW_OPCODE_IF);
639	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
640	 return;
641
642      default:
643	 assert(!"not reached");
644	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
645	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
646	 return;
647      }
648      return;
649   }
650
651   ir->condition->accept(this);
652
653   vec4_instruction *inst = emit(BRW_OPCODE_IF, dst_null_d(),
654			    this->result, src_reg(0));
655   inst->conditional_mod = BRW_CONDITIONAL_NZ;
656}
657
658void
659vec4_visitor::visit(ir_variable *ir)
660{
661   dst_reg *reg = NULL;
662
663   if (variable_storage(ir))
664      return;
665
666   switch (ir->mode) {
667   case ir_var_in:
668      reg = new(mem_ctx) dst_reg(ATTR, ir->location);
669      break;
670
671   case ir_var_out:
672      reg = new(mem_ctx) dst_reg(this, ir->type);
673
674      for (int i = 0; i < type_size(ir->type); i++) {
675	 output_reg[ir->location + i] = *reg;
676	 output_reg[ir->location + i].reg_offset = i;
677	 output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
678      }
679      break;
680
681   case ir_var_auto:
682   case ir_var_temporary:
683      reg = new(mem_ctx) dst_reg(this, ir->type);
684      break;
685
686   case ir_var_uniform:
687      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
688
689      /* Track how big the whole uniform variable is, in case we need to put a
690       * copy of its data into pull constants for array access.
691       */
692      this->uniform_size[this->uniforms] = type_size(ir->type);
693
694      if (!strncmp(ir->name, "gl_", 3)) {
695	 setup_builtin_uniform_values(ir);
696      } else {
697	 setup_uniform_values(ir->location, ir->type);
698      }
699      break;
700
701   default:
702      assert(!"not reached");
703   }
704
705   reg->type = brw_type_for_base_type(ir->type);
706   hash_table_insert(this->variable_ht, reg, ir);
707}
708
709void
710vec4_visitor::visit(ir_loop *ir)
711{
712   dst_reg counter;
713
714   /* We don't want debugging output to print the whole body of the
715    * loop as the annotation.
716    */
717   this->base_ir = NULL;
718
719   if (ir->counter != NULL) {
720      this->base_ir = ir->counter;
721      ir->counter->accept(this);
722      counter = *(variable_storage(ir->counter));
723
724      if (ir->from != NULL) {
725	 this->base_ir = ir->from;
726	 ir->from->accept(this);
727
728	 emit(BRW_OPCODE_MOV, counter, this->result);
729      }
730   }
731
732   emit(BRW_OPCODE_DO);
733
734   if (ir->to) {
735      this->base_ir = ir->to;
736      ir->to->accept(this);
737
738      vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst_null_d(),
739				    src_reg(counter), this->result);
740      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
741
742      inst = emit(BRW_OPCODE_BREAK);
743      inst->predicate = BRW_PREDICATE_NORMAL;
744   }
745
746   visit_instructions(&ir->body_instructions);
747
748
749   if (ir->increment) {
750      this->base_ir = ir->increment;
751      ir->increment->accept(this);
752      emit(BRW_OPCODE_ADD, counter, src_reg(counter), this->result);
753   }
754
755   emit(BRW_OPCODE_WHILE);
756}
757
758void
759vec4_visitor::visit(ir_loop_jump *ir)
760{
761   switch (ir->mode) {
762   case ir_loop_jump::jump_break:
763      emit(BRW_OPCODE_BREAK);
764      break;
765   case ir_loop_jump::jump_continue:
766      emit(BRW_OPCODE_CONTINUE);
767      break;
768   }
769}
770
771
772void
773vec4_visitor::visit(ir_function_signature *ir)
774{
775   assert(0);
776   (void)ir;
777}
778
779void
780vec4_visitor::visit(ir_function *ir)
781{
782   /* Ignore function bodies other than main() -- we shouldn't see calls to
783    * them since they should all be inlined.
784    */
785   if (strcmp(ir->name, "main") == 0) {
786      const ir_function_signature *sig;
787      exec_list empty;
788
789      sig = ir->matching_signature(&empty);
790
791      assert(sig);
792
793      visit_instructions(&sig->body);
794   }
795}
796
797GLboolean
798vec4_visitor::try_emit_sat(ir_expression *ir)
799{
800   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
801   if (!sat_src)
802      return false;
803
804   sat_src->accept(this);
805   src_reg src = this->result;
806
807   this->result = src_reg(this, ir->type);
808   vec4_instruction *inst;
809   inst = emit(BRW_OPCODE_MOV, dst_reg(this->result), src);
810   inst->saturate = true;
811
812   return true;
813}
814
815void
816vec4_visitor::emit_bool_comparison(unsigned int op,
817				 dst_reg dst, src_reg src0, src_reg src1)
818{
819   /* original gen4 does destination conversion before comparison. */
820   if (intel->gen < 5)
821      dst.type = src0.type;
822
823   vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst, src0, src1);
824   inst->conditional_mod = brw_conditional_for_comparison(op);
825
826   dst.type = BRW_REGISTER_TYPE_D;
827   emit(BRW_OPCODE_AND, dst, src_reg(dst), src_reg(0x1));
828}
829
830void
831vec4_visitor::visit(ir_expression *ir)
832{
833   unsigned int operand;
834   src_reg op[Elements(ir->operands)];
835   src_reg result_src;
836   dst_reg result_dst;
837   vec4_instruction *inst;
838
839   if (try_emit_sat(ir))
840      return;
841
842   for (operand = 0; operand < ir->get_num_operands(); operand++) {
843      this->result.file = BAD_FILE;
844      ir->operands[operand]->accept(this);
845      if (this->result.file == BAD_FILE) {
846	 printf("Failed to get tree for expression operand:\n");
847	 ir->operands[operand]->print();
848	 exit(1);
849      }
850      op[operand] = this->result;
851
852      /* Matrix expression operands should have been broken down to vector
853       * operations already.
854       */
855      assert(!ir->operands[operand]->type->is_matrix());
856   }
857
858   int vector_elements = ir->operands[0]->type->vector_elements;
859   if (ir->operands[1]) {
860      vector_elements = MAX2(vector_elements,
861			     ir->operands[1]->type->vector_elements);
862   }
863
864   this->result.file = BAD_FILE;
865
866   /* Storage for our result.  Ideally for an assignment we'd be using
867    * the actual storage for the result here, instead.
868    */
869   result_src = src_reg(this, ir->type);
870   /* convenience for the emit functions below. */
871   result_dst = dst_reg(result_src);
872   /* If nothing special happens, this is the result. */
873   this->result = result_src;
874   /* Limit writes to the channels that will be used by result_src later.
875    * This does limit this temp's use as a temporary for multi-instruction
876    * sequences.
877    */
878   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
879
880   switch (ir->operation) {
881   case ir_unop_logic_not:
882      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
883       * ones complement of the whole register, not just bit 0.
884       */
885      emit(BRW_OPCODE_XOR, result_dst, op[0], src_reg(1));
886      break;
887   case ir_unop_neg:
888      op[0].negate = !op[0].negate;
889      this->result = op[0];
890      break;
891   case ir_unop_abs:
892      op[0].abs = true;
893      op[0].negate = false;
894      this->result = op[0];
895      break;
896
897   case ir_unop_sign:
898      emit(BRW_OPCODE_MOV, result_dst, src_reg(0.0f));
899
900      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
901      inst->conditional_mod = BRW_CONDITIONAL_G;
902      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1.0f));
903      inst->predicate = BRW_PREDICATE_NORMAL;
904
905      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
906      inst->conditional_mod = BRW_CONDITIONAL_L;
907      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(-1.0f));
908      inst->predicate = BRW_PREDICATE_NORMAL;
909
910      break;
911
912   case ir_unop_rcp:
913      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
914      break;
915
916   case ir_unop_exp2:
917      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
918      break;
919   case ir_unop_log2:
920      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
921      break;
922   case ir_unop_exp:
923   case ir_unop_log:
924      assert(!"not reached: should be handled by ir_explog_to_explog2");
925      break;
926   case ir_unop_sin:
927   case ir_unop_sin_reduced:
928      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
929      break;
930   case ir_unop_cos:
931   case ir_unop_cos_reduced:
932      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
933      break;
934
935   case ir_unop_dFdx:
936   case ir_unop_dFdy:
937      assert(!"derivatives not valid in vertex shader");
938      break;
939
940   case ir_unop_noise:
941      assert(!"not reached: should be handled by lower_noise");
942      break;
943
944   case ir_binop_add:
945      emit(BRW_OPCODE_ADD, result_dst, op[0], op[1]);
946      break;
947   case ir_binop_sub:
948      assert(!"not reached: should be handled by ir_sub_to_add_neg");
949      break;
950
951   case ir_binop_mul:
952      if (ir->type->is_integer()) {
953	 /* For integer multiplication, the MUL uses the low 16 bits
954	  * of one of the operands (src0 on gen6, src1 on gen7).  The
955	  * MACH accumulates in the contribution of the upper 16 bits
956	  * of that operand.
957	  *
958	  * FINISHME: Emit just the MUL if we know an operand is small
959	  * enough.
960	  */
961	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
962
963	 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
964	 emit(BRW_OPCODE_MACH, dst_null_d(), op[0], op[1]);
965	 emit(BRW_OPCODE_MOV, result_dst, src_reg(acc));
966      } else {
967	 emit(BRW_OPCODE_MUL, result_dst, op[0], op[1]);
968      }
969      break;
970   case ir_binop_div:
971      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
972   case ir_binop_mod:
973      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
974      break;
975
976   case ir_binop_less:
977   case ir_binop_greater:
978   case ir_binop_lequal:
979   case ir_binop_gequal:
980   case ir_binop_equal:
981   case ir_binop_nequal: {
982      dst_reg temp = result_dst;
983      /* original gen4 does implicit conversion before comparison. */
984      if (intel->gen < 5)
985	 temp.type = op[0].type;
986
987      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
988      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
989      emit(BRW_OPCODE_AND, result_dst, this->result, src_reg(0x1));
990      break;
991   }
992
993   case ir_binop_all_equal:
994      /* "==" operator producing a scalar boolean. */
995      if (ir->operands[0]->type->is_vector() ||
996	  ir->operands[1]->type->is_vector()) {
997	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
998	 inst->conditional_mod = BRW_CONDITIONAL_Z;
999
1000	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1001	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1002	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1003      } else {
1004	 dst_reg temp = result_dst;
1005	 /* original gen4 does implicit conversion before comparison. */
1006	 if (intel->gen < 5)
1007	    temp.type = op[0].type;
1008
1009	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1010	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1011	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
1012      }
1013      break;
1014   case ir_binop_any_nequal:
1015      /* "!=" operator producing a scalar boolean. */
1016      if (ir->operands[0]->type->is_vector() ||
1017	  ir->operands[1]->type->is_vector()) {
1018	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
1019	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1020
1021	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1022	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1023	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1024      } else {
1025	 dst_reg temp = result_dst;
1026	 /* original gen4 does implicit conversion before comparison. */
1027	 if (intel->gen < 5)
1028	    temp.type = op[0].type;
1029
1030	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1031	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1032	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
1033      }
1034      break;
1035
1036   case ir_unop_any:
1037      inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
1038      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1039
1040      emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1041
1042      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1043      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1044      break;
1045
1046   case ir_binop_logic_xor:
1047      emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
1048      break;
1049
1050   case ir_binop_logic_or:
1051      emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
1052      break;
1053
1054   case ir_binop_logic_and:
1055      emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
1056      break;
1057
1058   case ir_binop_dot:
1059      assert(ir->operands[0]->type->is_vector());
1060      assert(ir->operands[0]->type == ir->operands[1]->type);
1061      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1062      break;
1063
1064   case ir_unop_sqrt:
1065      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1066      break;
1067   case ir_unop_rsq:
1068      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1069      break;
1070   case ir_unop_i2f:
1071   case ir_unop_i2u:
1072   case ir_unop_u2i:
1073   case ir_unop_u2f:
1074   case ir_unop_b2f:
1075   case ir_unop_b2i:
1076   case ir_unop_f2i:
1077      emit(BRW_OPCODE_MOV, result_dst, op[0]);
1078      break;
1079   case ir_unop_f2b:
1080   case ir_unop_i2b: {
1081      dst_reg temp = result_dst;
1082      /* original gen4 does implicit conversion before comparison. */
1083      if (intel->gen < 5)
1084	 temp.type = op[0].type;
1085
1086      inst = emit(BRW_OPCODE_CMP, temp, op[0], src_reg(0.0f));
1087      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1088      inst = emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(1));
1089      break;
1090   }
1091
1092   case ir_unop_trunc:
1093      emit(BRW_OPCODE_RNDZ, result_dst, op[0]);
1094      break;
1095   case ir_unop_ceil:
1096      op[0].negate = !op[0].negate;
1097      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
1098      this->result.negate = true;
1099      break;
1100   case ir_unop_floor:
1101      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
1102      break;
1103   case ir_unop_fract:
1104      inst = emit(BRW_OPCODE_FRC, result_dst, op[0]);
1105      break;
1106   case ir_unop_round_even:
1107      emit(BRW_OPCODE_RNDE, result_dst, op[0]);
1108      break;
1109
1110   case ir_binop_min:
1111      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
1112      inst->conditional_mod = BRW_CONDITIONAL_L;
1113
1114      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1115      inst->predicate = BRW_PREDICATE_NORMAL;
1116      break;
1117   case ir_binop_max:
1118      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
1119      inst->conditional_mod = BRW_CONDITIONAL_G;
1120
1121      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1122      inst->predicate = BRW_PREDICATE_NORMAL;
1123      break;
1124
1125   case ir_binop_pow:
1126      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1127      break;
1128
1129   case ir_unop_bit_not:
1130      inst = emit(BRW_OPCODE_NOT, result_dst, op[0]);
1131      break;
1132   case ir_binop_bit_and:
1133      inst = emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
1134      break;
1135   case ir_binop_bit_xor:
1136      inst = emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
1137      break;
1138   case ir_binop_bit_or:
1139      inst = emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
1140      break;
1141
1142   case ir_binop_lshift:
1143   case ir_binop_rshift:
1144      assert(!"GLSL 1.30 features unsupported");
1145      break;
1146
1147   case ir_quadop_vector:
1148      assert(!"not reached: should be handled by lower_quadop_vector");
1149      break;
1150   }
1151}
1152
1153
1154void
1155vec4_visitor::visit(ir_swizzle *ir)
1156{
1157   src_reg src;
1158   int i = 0;
1159   int swizzle[4];
1160
1161   /* Note that this is only swizzles in expressions, not those on the left
1162    * hand side of an assignment, which do write masking.  See ir_assignment
1163    * for that.
1164    */
1165
1166   ir->val->accept(this);
1167   src = this->result;
1168   assert(src.file != BAD_FILE);
1169
1170   for (i = 0; i < ir->type->vector_elements; i++) {
1171      switch (i) {
1172      case 0:
1173	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1174	 break;
1175      case 1:
1176	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1177	 break;
1178      case 2:
1179	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1180	 break;
1181      case 3:
1182	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1183	    break;
1184      }
1185   }
1186   for (; i < 4; i++) {
1187      /* Replicate the last channel out. */
1188      swizzle[i] = swizzle[ir->type->vector_elements - 1];
1189   }
1190
1191   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1192
1193   this->result = src;
1194}
1195
1196void
1197vec4_visitor::visit(ir_dereference_variable *ir)
1198{
1199   const struct glsl_type *type = ir->type;
1200   dst_reg *reg = variable_storage(ir->var);
1201
1202   if (!reg) {
1203      fail("Failed to find variable storage for %s\n", ir->var->name);
1204      this->result = src_reg(brw_null_reg());
1205      return;
1206   }
1207
1208   this->result = src_reg(*reg);
1209
1210   if (type->is_scalar() || type->is_vector() || type->is_matrix())
1211      this->result.swizzle = swizzle_for_size(type->vector_elements);
1212}
1213
1214void
1215vec4_visitor::visit(ir_dereference_array *ir)
1216{
1217   ir_constant *constant_index;
1218   src_reg src;
1219   int element_size = type_size(ir->type);
1220
1221   constant_index = ir->array_index->constant_expression_value();
1222
1223   ir->array->accept(this);
1224   src = this->result;
1225
1226   if (constant_index) {
1227      src.reg_offset += constant_index->value.i[0] * element_size;
1228   } else {
1229      /* Variable index array dereference.  It eats the "vec4" of the
1230       * base of the array and an index that offsets the Mesa register
1231       * index.
1232       */
1233      ir->array_index->accept(this);
1234
1235      src_reg index_reg;
1236
1237      if (element_size == 1) {
1238	 index_reg = this->result;
1239      } else {
1240	 index_reg = src_reg(this, glsl_type::int_type);
1241
1242	 emit(BRW_OPCODE_MUL, dst_reg(index_reg),
1243	      this->result, src_reg(element_size));
1244      }
1245
1246      if (src.reladdr) {
1247	 src_reg temp = src_reg(this, glsl_type::int_type);
1248
1249	 emit(BRW_OPCODE_ADD, dst_reg(temp), *src.reladdr, index_reg);
1250
1251	 index_reg = temp;
1252      }
1253
1254      src.reladdr = ralloc(mem_ctx, src_reg);
1255      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1256   }
1257
1258   /* If the type is smaller than a vec4, replicate the last channel out. */
1259   if (ir->type->is_scalar() || ir->type->is_vector())
1260      src.swizzle = swizzle_for_size(ir->type->vector_elements);
1261   else
1262      src.swizzle = BRW_SWIZZLE_NOOP;
1263   src.type = brw_type_for_base_type(ir->type);
1264
1265   this->result = src;
1266}
1267
1268void
1269vec4_visitor::visit(ir_dereference_record *ir)
1270{
1271   unsigned int i;
1272   const glsl_type *struct_type = ir->record->type;
1273   int offset = 0;
1274
1275   ir->record->accept(this);
1276
1277   for (i = 0; i < struct_type->length; i++) {
1278      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1279	 break;
1280      offset += type_size(struct_type->fields.structure[i].type);
1281   }
1282
1283   /* If the type is smaller than a vec4, replicate the last channel out. */
1284   if (ir->type->is_scalar() || ir->type->is_vector())
1285      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1286   else
1287      this->result.swizzle = BRW_SWIZZLE_NOOP;
1288   this->result.type = brw_type_for_base_type(ir->type);
1289
1290   this->result.reg_offset += offset;
1291}
1292
1293/**
1294 * We want to be careful in assignment setup to hit the actual storage
1295 * instead of potentially using a temporary like we might with the
1296 * ir_dereference handler.
1297 */
1298static dst_reg
1299get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1300{
1301   /* The LHS must be a dereference.  If the LHS is a variable indexed array
1302    * access of a vector, it must be separated into a series conditional moves
1303    * before reaching this point (see ir_vec_index_to_cond_assign).
1304    */
1305   assert(ir->as_dereference());
1306   ir_dereference_array *deref_array = ir->as_dereference_array();
1307   if (deref_array) {
1308      assert(!deref_array->array->type->is_vector());
1309   }
1310
1311   /* Use the rvalue deref handler for the most part.  We'll ignore
1312    * swizzles in it and write swizzles using writemask, though.
1313    */
1314   ir->accept(v);
1315   return dst_reg(v->result);
1316}
1317
1318void
1319vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1320			      const struct glsl_type *type, bool predicated)
1321{
1322   if (type->base_type == GLSL_TYPE_STRUCT) {
1323      for (unsigned int i = 0; i < type->length; i++) {
1324	 emit_block_move(dst, src, type->fields.structure[i].type, predicated);
1325      }
1326      return;
1327   }
1328
1329   if (type->is_array()) {
1330      for (unsigned int i = 0; i < type->length; i++) {
1331	 emit_block_move(dst, src, type->fields.array, predicated);
1332      }
1333      return;
1334   }
1335
1336   if (type->is_matrix()) {
1337      const struct glsl_type *vec_type;
1338
1339      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1340					 type->vector_elements, 1);
1341
1342      for (int i = 0; i < type->matrix_columns; i++) {
1343	 emit_block_move(dst, src, vec_type, predicated);
1344      }
1345      return;
1346   }
1347
1348   assert(type->is_scalar() || type->is_vector());
1349
1350   dst->type = brw_type_for_base_type(type);
1351   src->type = dst->type;
1352
1353   dst->writemask = (1 << type->vector_elements) - 1;
1354
1355   /* Do we need to worry about swizzling a swizzle? */
1356   assert(src->swizzle = BRW_SWIZZLE_NOOP);
1357   src->swizzle = swizzle_for_size(type->vector_elements);
1358
1359   vec4_instruction *inst = emit(BRW_OPCODE_MOV, *dst, *src);
1360   if (predicated)
1361      inst->predicate = BRW_PREDICATE_NORMAL;
1362
1363   dst->reg_offset++;
1364   src->reg_offset++;
1365}
1366
1367
1368/* If the RHS processing resulted in an instruction generating a
1369 * temporary value, and it would be easy to rewrite the instruction to
1370 * generate its result right into the LHS instead, do so.  This ends
1371 * up reliably removing instructions where it can be tricky to do so
1372 * later without real UD chain information.
1373 */
1374bool
1375vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1376				     dst_reg dst,
1377				     src_reg src,
1378				     vec4_instruction *pre_rhs_inst,
1379				     vec4_instruction *last_rhs_inst)
1380{
1381   /* This could be supported, but it would take more smarts. */
1382   if (ir->condition)
1383      return false;
1384
1385   if (pre_rhs_inst == last_rhs_inst)
1386      return false; /* No instructions generated to work with. */
1387
1388   /* Make sure the last instruction generated our source reg. */
1389   if (src.file != GRF ||
1390       src.file != last_rhs_inst->dst.file ||
1391       src.reg != last_rhs_inst->dst.reg ||
1392       src.reg_offset != last_rhs_inst->dst.reg_offset ||
1393       src.reladdr ||
1394       src.abs ||
1395       src.negate ||
1396       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1397      return false;
1398
1399   /* Check that that last instruction fully initialized the channels
1400    * we want to use, in the order we want to use them.  We could
1401    * potentially reswizzle the operands of many instructions so that
1402    * we could handle out of order channels, but don't yet.
1403    */
1404   for (int i = 0; i < 4; i++) {
1405      if (dst.writemask & (1 << i)) {
1406	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1407	    return false;
1408
1409	 if (BRW_GET_SWZ(src.swizzle, i) != i)
1410	    return false;
1411      }
1412   }
1413
1414   /* Success!  Rewrite the instruction. */
1415   last_rhs_inst->dst.file = dst.file;
1416   last_rhs_inst->dst.reg = dst.reg;
1417   last_rhs_inst->dst.reg_offset = dst.reg_offset;
1418   last_rhs_inst->dst.reladdr = dst.reladdr;
1419   last_rhs_inst->dst.writemask &= dst.writemask;
1420
1421   return true;
1422}
1423
1424void
1425vec4_visitor::visit(ir_assignment *ir)
1426{
1427   dst_reg dst = get_assignment_lhs(ir->lhs, this);
1428
1429   if (!ir->lhs->type->is_scalar() &&
1430       !ir->lhs->type->is_vector()) {
1431      ir->rhs->accept(this);
1432      src_reg src = this->result;
1433
1434      if (ir->condition) {
1435	 emit_bool_to_cond_code(ir->condition);
1436      }
1437
1438      emit_block_move(&dst, &src, ir->rhs->type, ir->condition != NULL);
1439      return;
1440   }
1441
1442   /* Now we're down to just a scalar/vector with writemasks. */
1443   int i;
1444
1445   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1446   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1447
1448   ir->rhs->accept(this);
1449
1450   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1451
1452   src_reg src = this->result;
1453
1454   int swizzles[4];
1455   int first_enabled_chan = 0;
1456   int src_chan = 0;
1457
1458   assert(ir->lhs->type->is_vector() ||
1459	  ir->lhs->type->is_scalar());
1460   dst.writemask = ir->write_mask;
1461
1462   for (int i = 0; i < 4; i++) {
1463      if (dst.writemask & (1 << i)) {
1464	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1465	 break;
1466      }
1467   }
1468
1469   /* Swizzle a small RHS vector into the channels being written.
1470    *
1471    * glsl ir treats write_mask as dictating how many channels are
1472    * present on the RHS while in our instructions we need to make
1473    * those channels appear in the slots of the vec4 they're written to.
1474    */
1475   for (int i = 0; i < 4; i++) {
1476      if (dst.writemask & (1 << i))
1477	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1478      else
1479	 swizzles[i] = first_enabled_chan;
1480   }
1481   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1482			      swizzles[2], swizzles[3]);
1483
1484   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1485      return;
1486   }
1487
1488   if (ir->condition) {
1489      emit_bool_to_cond_code(ir->condition);
1490   }
1491
1492   for (i = 0; i < type_size(ir->lhs->type); i++) {
1493      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
1494
1495      if (ir->condition)
1496	 inst->predicate = BRW_PREDICATE_NORMAL;
1497
1498      dst.reg_offset++;
1499      src.reg_offset++;
1500   }
1501}
1502
1503void
1504vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1505{
1506   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1507      foreach_list(node, &ir->components) {
1508	 ir_constant *field_value = (ir_constant *)node;
1509
1510	 emit_constant_values(dst, field_value);
1511      }
1512      return;
1513   }
1514
1515   if (ir->type->is_array()) {
1516      for (unsigned int i = 0; i < ir->type->length; i++) {
1517	 emit_constant_values(dst, ir->array_elements[i]);
1518      }
1519      return;
1520   }
1521
1522   if (ir->type->is_matrix()) {
1523      for (int i = 0; i < ir->type->matrix_columns; i++) {
1524	 for (int j = 0; j < ir->type->vector_elements; j++) {
1525	    dst->writemask = 1 << j;
1526	    dst->type = BRW_REGISTER_TYPE_F;
1527
1528	    emit(BRW_OPCODE_MOV, *dst,
1529		 src_reg(ir->value.f[i * ir->type->vector_elements + j]));
1530	 }
1531	 dst->reg_offset++;
1532      }
1533      return;
1534   }
1535
1536   for (int i = 0; i < ir->type->vector_elements; i++) {
1537      dst->writemask = 1 << i;
1538      dst->type = brw_type_for_base_type(ir->type);
1539
1540      switch (ir->type->base_type) {
1541      case GLSL_TYPE_FLOAT:
1542	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.f[i]));
1543	 break;
1544      case GLSL_TYPE_INT:
1545	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.i[i]));
1546	 break;
1547      case GLSL_TYPE_UINT:
1548	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.u[i]));
1549	 break;
1550      case GLSL_TYPE_BOOL:
1551	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.b[i]));
1552	 break;
1553      default:
1554	 assert(!"Non-float/uint/int/bool constant");
1555	 break;
1556      }
1557   }
1558   dst->reg_offset++;
1559}
1560
1561void
1562vec4_visitor::visit(ir_constant *ir)
1563{
1564   dst_reg dst = dst_reg(this, ir->type);
1565   this->result = src_reg(dst);
1566
1567   emit_constant_values(&dst, ir);
1568}
1569
1570void
1571vec4_visitor::visit(ir_call *ir)
1572{
1573   assert(!"not reached");
1574}
1575
1576void
1577vec4_visitor::visit(ir_texture *ir)
1578{
1579   /* FINISHME: Implement vertex texturing.
1580    *
1581    * With 0 vertex samplers available, the linker will reject
1582    * programs that do vertex texturing, but after our visitor has
1583    * run.
1584    */
1585}
1586
1587void
1588vec4_visitor::visit(ir_return *ir)
1589{
1590   assert(!"not reached");
1591}
1592
1593void
1594vec4_visitor::visit(ir_discard *ir)
1595{
1596   assert(!"not reached");
1597}
1598
1599void
1600vec4_visitor::visit(ir_if *ir)
1601{
1602   /* Don't point the annotation at the if statement, because then it plus
1603    * the then and else blocks get printed.
1604    */
1605   this->base_ir = ir->condition;
1606
1607   if (intel->gen == 6) {
1608      emit_if_gen6(ir);
1609   } else {
1610      emit_bool_to_cond_code(ir->condition);
1611      vec4_instruction *inst = emit(BRW_OPCODE_IF);
1612      inst->predicate = BRW_PREDICATE_NORMAL;
1613   }
1614
1615   visit_instructions(&ir->then_instructions);
1616
1617   if (!ir->else_instructions.is_empty()) {
1618      this->base_ir = ir->condition;
1619      emit(BRW_OPCODE_ELSE);
1620
1621      visit_instructions(&ir->else_instructions);
1622   }
1623
1624   this->base_ir = ir->condition;
1625   emit(BRW_OPCODE_ENDIF);
1626}
1627
1628int
1629vec4_visitor::emit_vue_header_gen4(int header_mrf)
1630{
1631   /* Get the position */
1632   src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1633
1634   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1635   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1636
1637   current_annotation = "NDC";
1638   dst_reg ndc_w = ndc;
1639   ndc_w.writemask = WRITEMASK_W;
1640   src_reg pos_w = pos;
1641   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1642   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1643
1644   dst_reg ndc_xyz = ndc;
1645   ndc_xyz.writemask = WRITEMASK_XYZ;
1646
1647   emit(BRW_OPCODE_MUL, ndc_xyz, pos, src_reg(ndc_w));
1648
1649   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1650       c->key.nr_userclip || brw->has_negative_rhw_bug) {
1651      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1652      GLuint i;
1653
1654      emit(BRW_OPCODE_MOV, header1, 0u);
1655
1656      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1657	 assert(!"finishme: psiz");
1658	 src_reg psiz;
1659
1660	 header1.writemask = WRITEMASK_W;
1661	 emit(BRW_OPCODE_MUL, header1, psiz, 1u << 11);
1662	 emit(BRW_OPCODE_AND, header1, src_reg(header1), 0x7ff << 8);
1663      }
1664
1665      for (i = 0; i < c->key.nr_userclip; i++) {
1666	 vec4_instruction *inst;
1667
1668	 inst = emit(BRW_OPCODE_DP4, dst_reg(brw_null_reg()),
1669		     pos, src_reg(c->userplane[i]));
1670	 inst->conditional_mod = BRW_CONDITIONAL_L;
1671
1672	 emit(BRW_OPCODE_OR, header1, src_reg(header1), 1u << i);
1673	 inst->predicate = BRW_PREDICATE_NORMAL;
1674      }
1675
1676      /* i965 clipping workaround:
1677       * 1) Test for -ve rhw
1678       * 2) If set,
1679       *      set ndc = (0,0,0,0)
1680       *      set ucp[6] = 1
1681       *
1682       * Later, clipping will detect ucp[6] and ensure the primitive is
1683       * clipped against all fixed planes.
1684       */
1685      if (brw->has_negative_rhw_bug) {
1686#if 0
1687	 /* FINISHME */
1688	 brw_CMP(p,
1689		 vec8(brw_null_reg()),
1690		 BRW_CONDITIONAL_L,
1691		 brw_swizzle1(ndc, 3),
1692		 brw_imm_f(0));
1693
1694	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1695	 brw_MOV(p, ndc, brw_imm_f(0));
1696	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1697#endif
1698      }
1699
1700      header1.writemask = WRITEMASK_XYZW;
1701      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(header1));
1702   } else {
1703      emit(BRW_OPCODE_MOV, retype(brw_message_reg(header_mrf++),
1704				  BRW_REGISTER_TYPE_UD), 0u);
1705   }
1706
1707   if (intel->gen == 5) {
1708      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1709       * dword 0-3 (m1) of the header is indices, point width, clip flags.
1710       * dword 4-7 (m2) is the ndc position (set above)
1711       * dword 8-11 (m3) of the vertex header is the 4D space position
1712       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1713       * m6 is a pad so that the vertex element data is aligned
1714       * m7 is the first vertex data we fill.
1715       */
1716      current_annotation = "NDC";
1717      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
1718
1719      current_annotation = "gl_Position";
1720      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
1721
1722      /* user clip distance. */
1723      header_mrf += 2;
1724
1725      /* Pad so that vertex element data is aligned. */
1726      header_mrf++;
1727   } else {
1728      /* There are 8 dwords in VUE header pre-Ironlake:
1729       * dword 0-3 (m1) is indices, point width, clip flags.
1730       * dword 4-7 (m2) is ndc position (set above)
1731       *
1732       * dword 8-11 (m3) is the first vertex data.
1733       */
1734      current_annotation = "NDC";
1735      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
1736
1737      current_annotation = "gl_Position";
1738      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
1739   }
1740
1741   return header_mrf;
1742}
1743
1744int
1745vec4_visitor::emit_vue_header_gen6(int header_mrf)
1746{
1747   struct brw_reg reg;
1748
1749   /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1750    * dword 0-3 (m2) of the header is indices, point width, clip flags.
1751    * dword 4-7 (m3) is the 4D space position
1752    * dword 8-15 (m4,m5) of the vertex header is the user clip distance if
1753    * enabled.
1754    *
1755    * m4 or 6 is the first vertex element data we fill.
1756    */
1757
1758   current_annotation = "indices, point width, clip flags";
1759   reg = brw_message_reg(header_mrf++);
1760   emit(BRW_OPCODE_MOV, retype(reg, BRW_REGISTER_TYPE_D), src_reg(0));
1761   if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1762      emit(BRW_OPCODE_MOV, brw_writemask(reg, WRITEMASK_W),
1763	   src_reg(output_reg[VERT_RESULT_PSIZ]));
1764   }
1765
1766   current_annotation = "gl_Position";
1767   emit(BRW_OPCODE_MOV,
1768	brw_message_reg(header_mrf++), src_reg(output_reg[VERT_RESULT_HPOS]));
1769
1770   current_annotation = "user clip distances";
1771   if (c->key.nr_userclip) {
1772      for (int i = 0; i < c->key.nr_userclip; i++) {
1773	 struct brw_reg m;
1774	 if (i < 4)
1775	    m = brw_message_reg(header_mrf);
1776	 else
1777	    m = brw_message_reg(header_mrf + 1);
1778
1779	 emit(BRW_OPCODE_DP4,
1780	      dst_reg(brw_writemask(m, 1 << (i & 3))),
1781	      src_reg(c->userplane[i]));
1782      }
1783      header_mrf += 2;
1784   }
1785
1786   current_annotation = NULL;
1787
1788   return header_mrf;
1789}
1790
1791static int
1792align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1793{
1794   struct intel_context *intel = &brw->intel;
1795
1796   if (intel->gen >= 6) {
1797      /* URB data written (does not include the message header reg) must
1798       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1799       * section 5.4.3.2.2: URB_INTERLEAVED.
1800       *
1801       * URB entries are allocated on a multiple of 1024 bits, so an
1802       * extra 128 bits written here to make the end align to 256 is
1803       * no problem.
1804       */
1805      if ((mlen % 2) != 1)
1806	 mlen++;
1807   }
1808
1809   return mlen;
1810}
1811
1812/**
1813 * Generates the VUE payload plus the 1 or 2 URB write instructions to
1814 * complete the VS thread.
1815 *
1816 * The VUE layout is documented in Volume 2a.
1817 */
1818void
1819vec4_visitor::emit_urb_writes()
1820{
1821   /* MRF 0 is reserved for the debugger, so start with message header
1822    * in MRF 1.
1823    */
1824   int base_mrf = 1;
1825   int mrf = base_mrf;
1826   int urb_entry_size;
1827   uint64_t outputs_remaining = c->prog_data.outputs_written;
1828   /* In the process of generating our URB write message contents, we
1829    * may need to unspill a register or load from an array.  Those
1830    * reads would use MRFs 14-15.
1831    */
1832   int max_usable_mrf = 13;
1833
1834   /* FINISHME: edgeflag */
1835
1836   /* First mrf is the g0-based message header containing URB handles and such,
1837    * which is implied in VS_OPCODE_URB_WRITE.
1838    */
1839   mrf++;
1840
1841   if (intel->gen >= 6) {
1842      mrf = emit_vue_header_gen6(mrf);
1843   } else {
1844      mrf = emit_vue_header_gen4(mrf);
1845   }
1846
1847   /* Set up the VUE data for the first URB write */
1848   int attr;
1849   for (attr = 0; attr < VERT_RESULT_MAX; attr++) {
1850      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1851	 continue;
1852
1853      outputs_remaining &= ~BITFIELD64_BIT(attr);
1854
1855      /* This is set up in the VUE header. */
1856      if (attr == VERT_RESULT_HPOS)
1857	 continue;
1858
1859      /* This is loaded into the VUE header, and thus doesn't occupy
1860       * an attribute slot.
1861       */
1862      if (attr == VERT_RESULT_PSIZ)
1863	 continue;
1864
1865      vec4_instruction *inst = emit(BRW_OPCODE_MOV, brw_message_reg(mrf++),
1866				    src_reg(output_reg[attr]));
1867
1868      if ((attr == VERT_RESULT_COL0 ||
1869	   attr == VERT_RESULT_COL1 ||
1870	   attr == VERT_RESULT_BFC0 ||
1871	   attr == VERT_RESULT_BFC1) &&
1872	  c->key.clamp_vertex_color) {
1873	 inst->saturate = true;
1874      }
1875
      /* If this filled the last usable MRF (13), we can't fit anything
       * more into this URB WRITE.  Note that with a base_mrf of 1,
       * stopping at MRF 13 leaves an even-numbered amount of URB write
       * data, which meets gen6's requirement for length alignment.
       */
1881      if (mrf > max_usable_mrf) {
1882	 attr++;
1883	 break;
1884      }
1885   }
1886
1887   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
1888   inst->base_mrf = base_mrf;
1889   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1890   inst->eot = !outputs_remaining;
1891
1892   urb_entry_size = mrf - base_mrf;
1893
1894   /* Optional second URB write */
1895   if (outputs_remaining) {
1896      mrf = base_mrf + 1;
1897
1898      for (; attr < VERT_RESULT_MAX; attr++) {
1899	 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1900	    continue;
1901
1902	 assert(mrf < max_usable_mrf);
1903
1904	 emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr]));
1905      }
1906
1907      inst = emit(VS_OPCODE_URB_WRITE);
1908      inst->base_mrf = base_mrf;
1909      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1910      inst->eot = true;
      /* URB destination offset.  The previous write covered MRFs 1-13;
       * minus the one header MRF, that's 12 regs of VUE data.  The URB
       * offset is in URB row increments, and each of our MRFs is half
       * of one of those, since we're doing interleaved writes.
       */
1916      inst->offset = (max_usable_mrf - base_mrf) / 2;
1917
1918      urb_entry_size += mrf - base_mrf;
1919   }
1920
1921   if (intel->gen == 6)
1922      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
1923   else
1924      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
1925}
1926
1927src_reg
1928vec4_visitor::get_scratch_offset(vec4_instruction *inst,
1929				 src_reg *reladdr, int reg_offset)
1930{
1931   /* Because we store the values to scratch interleaved like our
1932    * vertex data, we need to scale the vec4 index by 2.
1933    */
1934   int message_header_scale = 2;
1935
1936   /* Pre-gen6, the message header uses byte offsets instead of vec4
1937    * (16-byte) offset units.
1938    */
1939   if (intel->gen < 6)
1940      message_header_scale *= 16;
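   /* E.g. a constant reg_offset of 3 ends up as an immediate of 6 on
    * gen6+ (interleaved vec4 units) or 96 on older parts (byte units).
    */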
1941
1942   if (reladdr) {
1943      src_reg index = src_reg(this, glsl_type::int_type);
1944
1945      vec4_instruction *add = emit(BRW_OPCODE_ADD,
1946				   dst_reg(index),
1947				   *reladdr,
1948				   src_reg(reg_offset));
1949      /* Move our new instruction from the tail to its correct place. */
1950      add->remove();
1951      inst->insert_before(add);
1952
1953      vec4_instruction *mul = emit(BRW_OPCODE_MUL, dst_reg(index),
1954				   index, src_reg(message_header_scale));
1955      mul->remove();
1956      inst->insert_before(mul);
1957
1958      return index;
1959   } else {
1960      return src_reg(reg_offset * message_header_scale);
1961   }
1962}
1963
1964src_reg
1965vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
1966				       src_reg *reladdr, int reg_offset)
1967{
1968   if (reladdr) {
1969      src_reg index = src_reg(this, glsl_type::int_type);
1970
1971      vec4_instruction *add = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD,
1972							    dst_reg(index),
1973							    *reladdr,
1974							    src_reg(reg_offset));
1975      add->ir = inst->ir;
1976      add->annotation = inst->annotation;
1977      inst->insert_before(add);
1978
1979      /* Pre-gen6, the message header uses byte offsets instead of vec4
1980       * (16-byte) offset units.
1981       */
1982      if (intel->gen < 6) {
1983	 vec4_instruction *mul = new(mem_ctx) vec4_instruction(this,
1984							       BRW_OPCODE_MUL,
1985							       dst_reg(index),
1986							       index,
1987							       src_reg(16));
1988	 mul->ir = inst->ir;
1989	 mul->annotation = inst->annotation;
1990	 inst->insert_before(mul);
1991      }
1992
1993      return index;
1994   } else {
1995      int message_header_scale = intel->gen < 6 ? 16 : 1;
1996      return src_reg(reg_offset * message_header_scale);
1997   }
1998}
1999
2000/**
2001 * Emits an instruction before @inst to load the value named by @orig_src
2002 * from scratch space at @base_offset to @temp.
2003 */
2004void
2005vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2006				dst_reg temp, src_reg orig_src,
2007				int base_offset)
2008{
2009   int reg_offset = base_offset + orig_src.reg_offset;
2010   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2011
2012   vec4_instruction *scratch_read_inst = emit(VS_OPCODE_SCRATCH_READ,
2013					      temp, index);
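   /* The read message is a single register (mlen 1): just the header,
    * built from MRF 14, one of the two MRFs kept free above
    * max_usable_mrf in emit_urb_writes().
    */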
2014
2015   scratch_read_inst->base_mrf = 14;
2016   scratch_read_inst->mlen = 1;
2017   /* Move our instruction from the tail to its correct place. */
2018   scratch_read_inst->remove();
2019   inst->insert_before(scratch_read_inst);
2020}
2021
2022/**
 * Emits an instruction after @inst to store @temp (the value that would
 * have been written to @orig_dst) to scratch space at @base_offset.
2025 */
2026void
2027vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2028				 src_reg temp, dst_reg orig_dst,
2029				 int base_offset)
2030{
2031   int reg_offset = base_offset + orig_dst.reg_offset;
2032   src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2033
2034   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2035				       orig_dst.writemask));
2036   vec4_instruction *scratch_write_inst = emit(VS_OPCODE_SCRATCH_WRITE,
2037					       dst, temp, index);
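   /* The write message is two registers (mlen 2): the header plus one
    * reg of data, starting at MRF 13.  Copying the predicate keeps a
    * conditional write conditional.
    */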
2038   scratch_write_inst->base_mrf = 13;
2039   scratch_write_inst->mlen = 2;
2040   scratch_write_inst->predicate = inst->predicate;
2041   /* Move our instruction from the tail to its correct place. */
2042   scratch_write_inst->remove();
2043   inst->insert_after(scratch_write_inst);
2044}
2045
2046/**
2047 * We can't generally support array access in GRF space, because a
2048 * single instruction's destination can only span 2 contiguous
2049 * registers.  So, we send all GRF arrays that get variable index
2050 * access to scratch space.
2051 */
2052void
2053vec4_visitor::move_grf_array_access_to_scratch()
2054{
2055   int scratch_loc[this->virtual_grf_count];
2056
2057   for (int i = 0; i < this->virtual_grf_count; i++) {
2058      scratch_loc[i] = -1;
2059   }
2060
2061   /* First, calculate the set of virtual GRFs that need to be punted
2062    * to scratch due to having any array access on them, and where in
2063    * scratch.
2064    */
2065   foreach_list(node, &this->instructions) {
2066      vec4_instruction *inst = (vec4_instruction *)node;
2067
2068      if (inst->dst.file == GRF && inst->dst.reladdr &&
2069	  scratch_loc[inst->dst.reg] == -1) {
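	 /* Each vec4 takes 8 * 4 = 32 bytes of scratch, matching the
	  * scale-by-2 interleaved layout used by get_scratch_offset().
	  */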
2070	 scratch_loc[inst->dst.reg] = c->last_scratch;
2071	 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2072      }
2073
2074      for (int i = 0 ; i < 3; i++) {
2075	 src_reg *src = &inst->src[i];
2076
2077	 if (src->file == GRF && src->reladdr &&
2078	     scratch_loc[src->reg] == -1) {
2079	    scratch_loc[src->reg] = c->last_scratch;
2080	    c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2081	 }
2082      }
2083   }
2084
2085   /* Now, for anything that will be accessed through scratch, rewrite
2086    * it to load/store.  Note that this is a _safe list walk, because
2087    * we may generate a new scratch_write instruction after the one
2088    * we're processing.
2089    */
2090   foreach_list_safe(node, &this->instructions) {
2091      vec4_instruction *inst = (vec4_instruction *)node;
2092
      /* Set up the annotation tracking for newly generated instructions. */
2094      base_ir = inst->ir;
2095      current_annotation = inst->annotation;
2096
2097      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2098	 src_reg temp = src_reg(this, glsl_type::vec4_type);
2099
2100	 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2101
2102	 inst->dst.file = temp.file;
2103	 inst->dst.reg = temp.reg;
2104	 inst->dst.reg_offset = temp.reg_offset;
2105	 inst->dst.reladdr = NULL;
2106      }
2107
2108      for (int i = 0 ; i < 3; i++) {
2109	 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2110	    continue;
2111
2112	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2113
2114	 emit_scratch_read(inst, temp, inst->src[i],
2115			   scratch_loc[inst->src[i].reg]);
2116
2117	 inst->src[i].file = temp.file;
2118	 inst->src[i].reg = temp.reg;
2119	 inst->src[i].reg_offset = temp.reg_offset;
2120	 inst->src[i].reladdr = NULL;
2121      }
2122   }
2123}
2124
2125/**
2126 * Emits an instruction before @inst to load the value named by @orig_src
2127 * from the pull constant buffer (surface) at @base_offset to @temp.
2128 */
2129void
2130vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2131				      dst_reg temp, src_reg orig_src,
2132				      int base_offset)
2133{
2134   int reg_offset = base_offset + orig_src.reg_offset;
2135   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2136   vec4_instruction *load;
2137
2138   load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2139					temp, index);
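   /* Like the scratch read, this is a single-register message (mlen 1)
    * using MRF 14, above the MRFs used for URB write data.
    */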
2140   load->annotation = inst->annotation;
2141   load->ir = inst->ir;
2142   load->base_mrf = 14;
2143   load->mlen = 1;
2144   inst->insert_before(load);
2145}
2146
2147/**
2148 * Implements array access of uniforms by inserting a
2149 * PULL_CONSTANT_LOAD instruction.
2150 *
 * Unlike temporary GRF array access, which we don't support due to the
 * difficulty of doing relative addressing on instruction destinations,
 * we could in principle handle array access of uniforms that were
 * loaded in GRF space as push constants.  In the real-world usage we've
 * seen, though, the arrays involved are always larger than we could
 * load as push constants, so we simply move all uniform array access
 * out to a pull constant buffer.
2158 */
2159void
2160vec4_visitor::move_uniform_array_access_to_pull_constants()
2161{
2162   int pull_constant_loc[this->uniforms];
2163
2164   for (int i = 0; i < this->uniforms; i++) {
2165      pull_constant_loc[i] = -1;
2166   }
2167
2168   /* Walk through and find array access of uniforms.  Put a copy of that
2169    * uniform in the pull constant buffer.
2170    *
2171    * Note that we don't move constant-indexed accesses to arrays.  No
2172    * testing has been done of the performance impact of this choice.
2173    */
2174   foreach_list_safe(node, &this->instructions) {
2175      vec4_instruction *inst = (vec4_instruction *)node;
2176
2177      for (int i = 0 ; i < 3; i++) {
2178	 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2179	    continue;
2180
2181	 int uniform = inst->src[i].reg;
2182
2183	 /* If this array isn't already present in the pull constant buffer,
2184	  * add it.
2185	  */
2186	 if (pull_constant_loc[uniform] == -1) {
2187	    const float **values = &prog_data->param[uniform * 4];
2188
2189	    pull_constant_loc[uniform] = prog_data->nr_pull_params;
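	    /* Copy the whole array into the pull constant buffer:
	     * uniform_size[] is in vec4 units, and each vec4 contributes
	     * four scalar param pointers.
	     */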
2190
2191	    for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2192	       prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2193	    }
2194	 }
2195
	 /* Set up the annotation tracking for newly generated instructions. */
2197	 base_ir = inst->ir;
2198	 current_annotation = inst->annotation;
2199
2200	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2201
2202	 emit_pull_constant_load(inst, temp, inst->src[i],
2203				 pull_constant_loc[uniform]);
2204
2205	 inst->src[i].file = temp.file;
2206	 inst->src[i].reg = temp.reg;
2207	 inst->src[i].reg_offset = temp.reg_offset;
2208	 inst->src[i].reladdr = NULL;
2209      }
2210   }
2211
2212   /* Now there are no accesses of the UNIFORM file with a reladdr, so
2213    * no need to track them as larger-than-vec4 objects.  This will be
2214    * relied on in cutting out unused uniform vectors from push
2215    * constants.
2216    */
2217   split_uniform_registers();
2218}
2219
2220vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2221			   struct gl_shader_program *prog,
2222			   struct brw_shader *shader)
2223{
2224   this->c = c;
2225   this->p = &c->func;
2226   this->brw = p->brw;
2227   this->intel = &brw->intel;
2228   this->ctx = &intel->ctx;
2229   this->prog = prog;
2230   this->shader = shader;
2231
2232   this->mem_ctx = ralloc_context(NULL);
2233   this->failed = false;
2234
2235   this->base_ir = NULL;
2236   this->current_annotation = NULL;
2237
2239   this->vp = prog->VertexProgram;
2240   this->prog_data = &c->prog_data;
2241
2242   this->variable_ht = hash_table_ctor(0,
2243				       hash_table_pointer_hash,
2244				       hash_table_pointer_compare);
2245
2246   this->virtual_grf_def = NULL;
2247   this->virtual_grf_use = NULL;
2248   this->virtual_grf_sizes = NULL;
2249   this->virtual_grf_count = 0;
2250   this->virtual_grf_array_size = 0;
2251   this->live_intervals_valid = false;
2252
2253   this->uniforms = 0;
2254
2258}
2259
2260vec4_visitor::~vec4_visitor()
2261{
2262   ralloc_free(this->mem_ctx);
2263   hash_table_dtor(this->variable_ht);
2264}
2265
2266
2267void
2268vec4_visitor::fail(const char *format, ...)
2269{
2270   va_list va;
2271   char *msg;
2272
2273   if (failed)
2274      return;
2275
2276   failed = true;
2277
2278   va_start(va, format);
2279   msg = ralloc_vasprintf(mem_ctx, format, va);
2280   va_end(va);
2281   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2282
2283   this->fail_msg = msg;
2284
2285   if (INTEL_DEBUG & DEBUG_VS) {
2286      fprintf(stderr, "%s",  msg);
2287   }
2288}
2289
2290} /* namespace brw */
2291