brw_vec4_visitor.cpp revision a8e29987f5a64d566a128a1c1ac18dae3f3953db
1/*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_vec4.h"
25extern "C" {
26#include "main/macros.h"
27#include "program/prog_parameter.h"
28}
29
30namespace brw {
31
32src_reg::src_reg(dst_reg reg)
33{
34   init();
35
36   this->file = reg.file;
37   this->reg = reg.reg;
38   this->reg_offset = reg.reg_offset;
39   this->type = reg.type;
40   this->reladdr = reg.reladdr;
41   this->fixed_hw_reg = reg.fixed_hw_reg;
42
43   int swizzles[4];
44   int next_chan = 0;
45   int last = 0;
46
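   /* Gather the channels enabled in the writemask, in order, and replicate
    * the last enabled channel into any remaining swizzle slots.  For example
    * (hypothetical values), a writemask of XZ yields the swizzle XZZZ.
    */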
47   for (int i = 0; i < 4; i++) {
48      if (!(reg.writemask & (1 << i)))
49	 continue;
50
51      swizzles[next_chan++] = last = i;
52   }
53
54   for (; next_chan < 4; next_chan++) {
55      swizzles[next_chan] = last;
56   }
57
58   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59				swizzles[2], swizzles[3]);
60}
61
62dst_reg::dst_reg(src_reg reg)
63{
64   init();
65
66   this->file = reg.file;
67   this->reg = reg.reg;
68   this->reg_offset = reg.reg_offset;
69   this->type = reg.type;
70   this->writemask = WRITEMASK_XYZW;
71   this->reladdr = reg.reladdr;
72   this->fixed_hw_reg = reg.fixed_hw_reg;
73}
74
75vec4_instruction::vec4_instruction(vec4_visitor *v,
76				   enum opcode opcode, dst_reg dst,
77				   src_reg src0, src_reg src1, src_reg src2)
78{
79   this->opcode = opcode;
80   this->dst = dst;
81   this->src[0] = src0;
82   this->src[1] = src1;
83   this->src[2] = src2;
84   this->ir = v->base_ir;
85   this->annotation = v->current_annotation;
86}
87
88vec4_instruction *
89vec4_visitor::emit(vec4_instruction *inst)
90{
91   this->instructions.push_tail(inst);
92
93   return inst;
94}
95
96vec4_instruction *
97vec4_visitor::emit(enum opcode opcode, dst_reg dst,
98		   src_reg src0, src_reg src1, src_reg src2)
99{
100   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
101					     src0, src1, src2));
102}
103
104
105vec4_instruction *
106vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
107{
108   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
109}
110
111vec4_instruction *
112vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
113{
114   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
115}
116
117vec4_instruction *
118vec4_visitor::emit(enum opcode opcode)
119{
120   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
121}
122
123#define ALU1(op)							\
124   vec4_instruction *							\
125   vec4_visitor::op(dst_reg dst, src_reg src0)				\
126   {									\
127      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
128					   src0);			\
129   }
130
131#define ALU2(op)							\
132   vec4_instruction *							\
133   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
134   {									\
135      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
136					   src0, src1);			\
137   }
138
139ALU1(NOT)
140ALU1(MOV)
141ALU1(FRC)
142ALU1(RNDD)
143ALU1(RNDE)
144ALU1(RNDZ)
145ALU2(ADD)
146ALU2(MUL)
147ALU2(MACH)
148ALU2(AND)
149ALU2(OR)
150ALU2(XOR)
151ALU2(DP3)
152ALU2(DP4)
153
154/** Gen4 predicated IF. */
155vec4_instruction *
156vec4_visitor::IF(uint32_t predicate)
157{
158   vec4_instruction *inst;
159
160   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
161   inst->predicate = predicate;
162
163   return inst;
164}
165
166/** Gen6+ IF with embedded comparison. */
167vec4_instruction *
168vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
169{
170   assert(intel->gen >= 6);
171
172   vec4_instruction *inst;
173
174   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
175					src0, src1);
176   inst->conditional_mod = condition;
177
178   return inst;
179}
180
181vec4_instruction *
182vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
183{
184   vec4_instruction *inst;
185
186   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst,
187					src0, src1, src_reg());
188   inst->conditional_mod = condition;
189
190   return inst;
191}
192
193void
194vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
195{
196   static enum opcode dot_opcodes[] = {
197      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
198   };
199
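   /* elements is expected to be in [2, 4]; (elements - 2) selects DP2, DP3,
    * or DP4 from the table above.
    */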
200   emit(dot_opcodes[elements - 2], dst, src0, src1);
201}
202
203void
204vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
205{
206   /* The gen6 math instruction ignores the source modifiers --
207    * swizzle, abs, negate, and at least some parts of the register
208    * region description.
209    *
210    * While it would seem that this MOV could be avoided at this point
211    * in the case that the swizzle is matched up with the destination
212    * writemask, note that uniform packing and register allocation
213    * could rearrange our swizzle, so let's leave this matter up to
214    * copy propagation later.
215    */
216   src_reg temp_src = src_reg(this, glsl_type::vec4_type);
217   emit(MOV(dst_reg(temp_src), src));
218
219   if (dst.writemask != WRITEMASK_XYZW) {
220      /* The gen6 math instruction must be align1, so we can't do
221       * writemasks.
222       */
223      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
224
225      emit(opcode, temp_dst, temp_src);
226
227      emit(MOV(dst, src_reg(temp_dst)));
228   } else {
229      emit(opcode, dst, temp_src);
230   }
231}
232
233void
234vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
235{
236   vec4_instruction *inst = emit(opcode, dst, src);
237   inst->base_mrf = 1;
238   inst->mlen = 1;
239}
240
241void
242vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
243{
244   switch (opcode) {
245   case SHADER_OPCODE_RCP:
246   case SHADER_OPCODE_RSQ:
247   case SHADER_OPCODE_SQRT:
248   case SHADER_OPCODE_EXP2:
249   case SHADER_OPCODE_LOG2:
250   case SHADER_OPCODE_SIN:
251   case SHADER_OPCODE_COS:
252      break;
253   default:
254      assert(!"not reached: bad math opcode");
255      return;
256   }
257
258   if (intel->gen >= 6) {
259      return emit_math1_gen6(opcode, dst, src);
260   } else {
261      return emit_math1_gen4(opcode, dst, src);
262   }
263}
264
265void
266vec4_visitor::emit_math2_gen6(enum opcode opcode,
267			      dst_reg dst, src_reg src0, src_reg src1)
268{
269   src_reg expanded;
270
271   /* The gen6 math instruction ignores the source modifiers --
272    * swizzle, abs, negate, and at least some parts of the register
273    * region description.  Move the sources to temporaries to make it
274    * generally work.
275    */
276
277   expanded = src_reg(this, glsl_type::vec4_type);
278   emit(MOV(dst_reg(expanded), src0));
279   src0 = expanded;
280
281   expanded = src_reg(this, glsl_type::vec4_type);
282   emit(MOV(dst_reg(expanded), src1));
283   src1 = expanded;
284
285   if (dst.writemask != WRITEMASK_XYZW) {
286      /* The gen6 math instruction must be align1, so we can't do
287       * writemasks.
288       */
289      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
290
291      emit(opcode, temp_dst, src0, src1);
292
293      emit(MOV(dst, src_reg(temp_dst)));
294   } else {
295      emit(opcode, dst, src0, src1);
296   }
297}
298
299void
300vec4_visitor::emit_math2_gen4(enum opcode opcode,
301			      dst_reg dst, src_reg src0, src_reg src1)
302{
303   vec4_instruction *inst = emit(opcode, dst, src0, src1);
304   inst->base_mrf = 1;
305   inst->mlen = 2;
306}
307
308void
309vec4_visitor::emit_math(enum opcode opcode,
310			dst_reg dst, src_reg src0, src_reg src1)
311{
312   assert(opcode == SHADER_OPCODE_POW);
313
314   if (intel->gen >= 6) {
315      return emit_math2_gen6(opcode, dst, src0, src1);
316   } else {
317      return emit_math2_gen4(opcode, dst, src0, src1);
318   }
319}
320
321void
322vec4_visitor::visit_instructions(const exec_list *list)
323{
324   foreach_list(node, list) {
325      ir_instruction *ir = (ir_instruction *)node;
326
327      base_ir = ir;
328      ir->accept(this);
329   }
330}
331
332
333static int
334type_size(const struct glsl_type *type)
335{
336   unsigned int i;
337   int size;
338
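   /* The size is counted in vec4 slots: scalars and vectors take one slot,
    * matrices one slot per column, and arrays/structs the sum of their
    * members.  For example (hypothetical types), mat3 -> 3 and float[8] -> 8.
    */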
339   switch (type->base_type) {
340   case GLSL_TYPE_UINT:
341   case GLSL_TYPE_INT:
342   case GLSL_TYPE_FLOAT:
343   case GLSL_TYPE_BOOL:
344      if (type->is_matrix()) {
345	 return type->matrix_columns;
346      } else {
347	 /* Regardless of size of vector, it gets a vec4. This is bad
348	  * packing for things like floats, but otherwise arrays become a
349	  * mess.  Hopefully a later pass over the code can pack scalars
350	  * down if appropriate.
351	  */
352	 return 1;
353      }
354   case GLSL_TYPE_ARRAY:
355      assert(type->length > 0);
356      return type_size(type->fields.array) * type->length;
357   case GLSL_TYPE_STRUCT:
358      size = 0;
359      for (i = 0; i < type->length; i++) {
360	 size += type_size(type->fields.structure[i].type);
361      }
362      return size;
363   case GLSL_TYPE_SAMPLER:
364      /* Samplers take up one slot in UNIFORMS[], but they're baked in
365       * at link time.
366       */
367      return 1;
368   default:
369      assert(0);
370      return 0;
371   }
372}
373
374int
375vec4_visitor::virtual_grf_alloc(int size)
376{
377   if (virtual_grf_array_size <= virtual_grf_count) {
378      if (virtual_grf_array_size == 0)
379	 virtual_grf_array_size = 16;
380      else
381	 virtual_grf_array_size *= 2;
382      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
383				   virtual_grf_array_size);
384   }
385   virtual_grf_sizes[virtual_grf_count] = size;
386   return virtual_grf_count++;
387}
388
389src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
390{
391   init();
392
393   this->file = GRF;
394   this->reg = v->virtual_grf_alloc(type_size(type));
395
396   if (type->is_array() || type->is_record()) {
397      this->swizzle = BRW_SWIZZLE_NOOP;
398   } else {
399      this->swizzle = swizzle_for_size(type->vector_elements);
400   }
401
402   this->type = brw_type_for_base_type(type);
403}
404
405dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
406{
407   init();
408
409   this->file = GRF;
410   this->reg = v->virtual_grf_alloc(type_size(type));
411
412   if (type->is_array() || type->is_record()) {
413      this->writemask = WRITEMASK_XYZW;
414   } else {
415      this->writemask = (1 << type->vector_elements) - 1;
416   }
417
418   this->type = brw_type_for_base_type(type);
419}
420
421/* Our support for uniforms is piggy-backed on the struct
422 * gl_fragment_program, because that's where the values actually
423 * get stored, rather than in some global gl_shader_program uniform
424 * store.
425 */
426int
427vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
428{
429   unsigned int offset = 0;
430   float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
431
432   if (type->is_matrix()) {
433      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
434							type->vector_elements,
435							1);
436
437      for (unsigned int i = 0; i < type->matrix_columns; i++) {
438	 offset += setup_uniform_values(loc + offset, column);
439      }
440
441      return offset;
442   }
443
444   switch (type->base_type) {
445   case GLSL_TYPE_FLOAT:
446   case GLSL_TYPE_UINT:
447   case GLSL_TYPE_INT:
448   case GLSL_TYPE_BOOL:
449      for (unsigned int i = 0; i < type->vector_elements; i++) {
450	 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
451      }
452
453      /* Set up pad elements to get things aligned to a vec4 boundary. */
454      for (unsigned int i = type->vector_elements; i < 4; i++) {
455	 static float zero = 0;
456
457	 c->prog_data.param[this->uniforms * 4 + i] = &zero;
458      }
459
460      /* Track the size of this uniform vector, for future packing of
461       * uniforms.
462       */
463      this->uniform_vector_size[this->uniforms] = type->vector_elements;
464      this->uniforms++;
465
466      return 1;
467
468   case GLSL_TYPE_STRUCT:
469      for (unsigned int i = 0; i < type->length; i++) {
470	 offset += setup_uniform_values(loc + offset,
471					type->fields.structure[i].type);
472      }
473      return offset;
474
475   case GLSL_TYPE_ARRAY:
476      for (unsigned int i = 0; i < type->length; i++) {
477	 offset += setup_uniform_values(loc + offset, type->fields.array);
478      }
479      return offset;
480
481   case GLSL_TYPE_SAMPLER:
482      /* The sampler takes up a slot, but we don't use any values from it. */
483      return 1;
484
485   default:
486      assert(!"not reached");
487      return 0;
488   }
489}
490
491/* Our support for builtin uniforms is even scarier than non-builtin.
492 * It sits on top of the PROG_STATE_VAR parameters that are
493 * automatically updated from GL context state.
494 */
495void
496vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
497{
498   const ir_state_slot *const slots = ir->state_slots;
499   assert(ir->state_slots != NULL);
500
501   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
502      /* This state reference has already been setup by ir_to_mesa,
503       * but we'll get the same index back here.  We can reference
504       * ParameterValues directly, since unlike brw_fs.cpp, we never
505       * add new state references during compile.
506       */
507      int index = _mesa_add_state_reference(this->vp->Base.Parameters,
508					    (gl_state_index *)slots[i].tokens);
509      float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
510
511      this->uniform_vector_size[this->uniforms] = 0;
512      /* Add each of the unique swizzled channels of the element.
513       * This will end up matching the size of the glsl_type of this field.
514       */
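      /* For example (hypothetical state value), a vec3 element stored with
       * swizzle XYZZ contributes three unique channels, so
       * uniform_vector_size ends up as 3.
       */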
515      int last_swiz = -1;
516      for (unsigned int j = 0; j < 4; j++) {
517	 int swiz = GET_SWZ(slots[i].swizzle, j);
518	 if (swiz == last_swiz)
519	    break;
520	 last_swiz = swiz;
521	 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
522	 this->uniform_vector_size[this->uniforms]++;
523      }
524      this->uniforms++;
525   }
526}
527
528dst_reg *
529vec4_visitor::variable_storage(ir_variable *var)
530{
531   return (dst_reg *)hash_table_find(this->variable_ht, var);
532}
533
534void
535vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
536{
537   ir_expression *expr = ir->as_expression();
538
539   if (expr) {
540      src_reg op[2];
541      vec4_instruction *inst;
542
543      assert(expr->get_num_operands() <= 2);
544      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
545	 assert(expr->operands[i]->type->is_scalar());
546
547	 expr->operands[i]->accept(this);
548	 op[i] = this->result;
549      }
550
551      switch (expr->operation) {
552      case ir_unop_logic_not:
553	 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
554	 inst->conditional_mod = BRW_CONDITIONAL_Z;
555	 break;
556
557      case ir_binop_logic_xor:
558	 inst = emit(XOR(dst_null_d(), op[0], op[1]));
559	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
560	 break;
561
562      case ir_binop_logic_or:
563	 inst = emit(OR(dst_null_d(), op[0], op[1]));
564	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
565	 break;
566
567      case ir_binop_logic_and:
568	 inst = emit(AND(dst_null_d(), op[0], op[1]));
569	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
570	 break;
571
572      case ir_unop_f2b:
573	 if (intel->gen >= 6) {
574	    emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
575	 } else {
576	    inst = emit(MOV(dst_null_f(), op[0]));
577	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
578	 }
579	 break;
580
581      case ir_unop_i2b:
582	 if (intel->gen >= 6) {
583	    emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
584	 } else {
585	    inst = emit(MOV(dst_null_d(), op[0]));
586	    inst->conditional_mod = BRW_CONDITIONAL_NZ;
587	 }
588	 break;
589
590      case ir_binop_greater:
591      case ir_binop_gequal:
592      case ir_binop_less:
593      case ir_binop_lequal:
594      case ir_binop_equal:
595      case ir_binop_all_equal:
596      case ir_binop_nequal:
597      case ir_binop_any_nequal:
598	 emit(CMP(dst_null_d(), op[0], op[1],
599		  brw_conditional_for_comparison(expr->operation)));
600	 break;
601
602      default:
603	 assert(!"not reached");
604	 break;
605      }
606      return;
607   }
608
609   ir->accept(this);
610
611   if (intel->gen >= 6) {
612      vec4_instruction *inst = emit(AND(dst_null_d(),
613					this->result, src_reg(1)));
614      inst->conditional_mod = BRW_CONDITIONAL_NZ;
615   } else {
616      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
617      inst->conditional_mod = BRW_CONDITIONAL_NZ;
618   }
619}
620
621/**
622 * Emit a gen6 IF statement with the comparison folded into the IF
623 * instruction.
624 */
625void
626vec4_visitor::emit_if_gen6(ir_if *ir)
627{
628   ir_expression *expr = ir->condition->as_expression();
629
630   if (expr) {
631      src_reg op[2];
632      dst_reg temp;
633
634      assert(expr->get_num_operands() <= 2);
635      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
636	 expr->operands[i]->accept(this);
637	 op[i] = this->result;
638      }
639
640      switch (expr->operation) {
641      case ir_unop_logic_not:
642	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
643	 return;
644
645      case ir_binop_logic_xor:
646	 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
647	 return;
648
649      case ir_binop_logic_or:
650	 temp = dst_reg(this, glsl_type::bool_type);
651	 emit(OR(temp, op[0], op[1]));
652	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
653	 return;
654
655      case ir_binop_logic_and:
656	 temp = dst_reg(this, glsl_type::bool_type);
657	 emit(AND(temp, op[0], op[1]));
658	 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
659	 return;
660
661      case ir_unop_f2b:
662	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
663	 return;
664
665      case ir_unop_i2b:
666	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
667	 return;
668
669      case ir_binop_greater:
670      case ir_binop_gequal:
671      case ir_binop_less:
672      case ir_binop_lequal:
673      case ir_binop_equal:
674      case ir_binop_nequal:
675	 emit(IF(op[0], op[1],
676		 brw_conditional_for_comparison(expr->operation)));
677	 return;
678
679      case ir_binop_all_equal:
680	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
681	 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
682	 return;
683
684      case ir_binop_any_nequal:
685	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
686	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
687	 return;
688
689      case ir_unop_any:
690	 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
691	 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
692	 return;
693
694      default:
695	 assert(!"not reached");
696	 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
697	 return;
698      }
699      return;
700   }
701
702   ir->condition->accept(this);
703
704   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
705}
706
707void
708vec4_visitor::visit(ir_variable *ir)
709{
710   dst_reg *reg = NULL;
711
712   if (variable_storage(ir))
713      return;
714
715   switch (ir->mode) {
716   case ir_var_in:
717      reg = new(mem_ctx) dst_reg(ATTR, ir->location);
718
719      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
720       * come in as floating point conversions of the integer values.
721       */
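      /* For example (hypothetical attribute), a GL_FIXED vec2 input has its
       * .xy channels scaled by 1/65536 here; non-GL_FIXED inputs are skipped.
       */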
722      for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
723	 if (!c->key.gl_fixed_input_size[i])
724	    continue;
725
726	 dst_reg dst = *reg;
727	 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
728	 emit(BRW_OPCODE_MUL, dst, src_reg(dst), src_reg(1.0f / 65536.0f));
729      }
730      break;
731
732   case ir_var_out:
733      reg = new(mem_ctx) dst_reg(this, ir->type);
734
735      for (int i = 0; i < type_size(ir->type); i++) {
736	 output_reg[ir->location + i] = *reg;
737	 output_reg[ir->location + i].reg_offset = i;
738	 output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
739      }
740      break;
741
742   case ir_var_auto:
743   case ir_var_temporary:
744      reg = new(mem_ctx) dst_reg(this, ir->type);
745      break;
746
747   case ir_var_uniform:
748      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
749
750      /* Track how big the whole uniform variable is, in case we need to put a
751       * copy of its data into pull constants for array access.
752       */
753      this->uniform_size[this->uniforms] = type_size(ir->type);
754
755      if (!strncmp(ir->name, "gl_", 3)) {
756	 setup_builtin_uniform_values(ir);
757      } else {
758	 setup_uniform_values(ir->location, ir->type);
759      }
760      break;
761
762   default:
763      assert(!"not reached");
764   }
765
766   reg->type = brw_type_for_base_type(ir->type);
767   hash_table_insert(this->variable_ht, reg, ir);
768}
769
770void
771vec4_visitor::visit(ir_loop *ir)
772{
773   dst_reg counter;
774
775   /* We don't want debugging output to print the whole body of the
776    * loop as the annotation.
777    */
778   this->base_ir = NULL;
779
780   if (ir->counter != NULL) {
781      this->base_ir = ir->counter;
782      ir->counter->accept(this);
783      counter = *(variable_storage(ir->counter));
784
785      if (ir->from != NULL) {
786	 this->base_ir = ir->from;
787	 ir->from->accept(this);
788
789	 emit(MOV(counter, this->result));
790      }
791   }
792
793   emit(BRW_OPCODE_DO);
794
795   if (ir->to) {
796      this->base_ir = ir->to;
797      ir->to->accept(this);
798
799      emit(CMP(dst_null_d(), src_reg(counter), this->result,
800	       brw_conditional_for_comparison(ir->cmp)));
801
802      vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
803      inst->predicate = BRW_PREDICATE_NORMAL;
804   }
805
806   visit_instructions(&ir->body_instructions);
807
808
809   if (ir->increment) {
810      this->base_ir = ir->increment;
811      ir->increment->accept(this);
812      emit(ADD(counter, src_reg(counter), this->result));
813   }
814
815   emit(BRW_OPCODE_WHILE);
816}
817
818void
819vec4_visitor::visit(ir_loop_jump *ir)
820{
821   switch (ir->mode) {
822   case ir_loop_jump::jump_break:
823      emit(BRW_OPCODE_BREAK);
824      break;
825   case ir_loop_jump::jump_continue:
826      emit(BRW_OPCODE_CONTINUE);
827      break;
828   }
829}
830
831
832void
833vec4_visitor::visit(ir_function_signature *ir)
834{
835   assert(0);
836   (void)ir;
837}
838
839void
840vec4_visitor::visit(ir_function *ir)
841{
842   /* Ignore function bodies other than main() -- we shouldn't see calls to
843    * them since they should all be inlined.
844    */
845   if (strcmp(ir->name, "main") == 0) {
846      const ir_function_signature *sig;
847      exec_list empty;
848
849      sig = ir->matching_signature(&empty);
850
851      assert(sig);
852
853      visit_instructions(&sig->body);
854   }
855}
856
857GLboolean
858vec4_visitor::try_emit_sat(ir_expression *ir)
859{
860   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
861   if (!sat_src)
862      return false;
863
864   sat_src->accept(this);
865   src_reg src = this->result;
866
867   this->result = src_reg(this, ir->type);
868   vec4_instruction *inst;
869   inst = emit(MOV(dst_reg(this->result), src));
870   inst->saturate = true;
871
872   return true;
873}
874
875void
876vec4_visitor::emit_bool_comparison(unsigned int op,
877				 dst_reg dst, src_reg src0, src_reg src1)
878{
879   /* original gen4 does destination conversion before comparison. */
880   if (intel->gen < 5)
881      dst.type = src0.type;
882
883   emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
884
885   dst.type = BRW_REGISTER_TYPE_D;
886   emit(AND(dst, src_reg(dst), src_reg(0x1)));
887}
888
889void
890vec4_visitor::visit(ir_expression *ir)
891{
892   unsigned int operand;
893   src_reg op[Elements(ir->operands)];
894   src_reg result_src;
895   dst_reg result_dst;
896   vec4_instruction *inst;
897
898   if (try_emit_sat(ir))
899      return;
900
901   for (operand = 0; operand < ir->get_num_operands(); operand++) {
902      this->result.file = BAD_FILE;
903      ir->operands[operand]->accept(this);
904      if (this->result.file == BAD_FILE) {
905	 printf("Failed to get tree for expression operand:\n");
906	 ir->operands[operand]->print();
907	 exit(1);
908      }
909      op[operand] = this->result;
910
911      /* Matrix expression operands should have been broken down to vector
912       * operations already.
913       */
914      assert(!ir->operands[operand]->type->is_matrix());
915   }
916
917   int vector_elements = ir->operands[0]->type->vector_elements;
918   if (ir->operands[1]) {
919      vector_elements = MAX2(vector_elements,
920			     ir->operands[1]->type->vector_elements);
921   }
922
923   this->result.file = BAD_FILE;
924
925   /* Storage for our result.  Ideally for an assignment we'd be using
926    * the actual storage for the result here, instead.
927    */
928   result_src = src_reg(this, ir->type);
929   /* convenience for the emit functions below. */
930   result_dst = dst_reg(result_src);
931   /* If nothing special happens, this is the result. */
932   this->result = result_src;
933   /* Limit writes to the channels that will be used by result_src later.
934    * This does limit this temp's use as a temporary for multi-instruction
935    * sequences.
936    */
937   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
938
939   switch (ir->operation) {
940   case ir_unop_logic_not:
941      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
942       * ones complement of the whole register, not just bit 0.
943       */
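      /* Booleans are stored as 0/1 in D-type registers at this point, so
       * XORing with 1 flips only the low bit.
       */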
944      emit(XOR(result_dst, op[0], src_reg(1)));
945      break;
946   case ir_unop_neg:
947      op[0].negate = !op[0].negate;
948      this->result = op[0];
949      break;
950   case ir_unop_abs:
951      op[0].abs = true;
952      op[0].negate = false;
953      this->result = op[0];
954      break;
955
956   case ir_unop_sign:
957      emit(MOV(result_dst, src_reg(0.0f)));
958
959      emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
960      inst = emit(MOV(result_dst, src_reg(1.0f)));
961      inst->predicate = BRW_PREDICATE_NORMAL;
962
963      emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
964      inst = emit(MOV(result_dst, src_reg(-1.0f)));
965      inst->predicate = BRW_PREDICATE_NORMAL;
966
967      break;
968
969   case ir_unop_rcp:
970      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
971      break;
972
973   case ir_unop_exp2:
974      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
975      break;
976   case ir_unop_log2:
977      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
978      break;
979   case ir_unop_exp:
980   case ir_unop_log:
981      assert(!"not reached: should be handled by ir_explog_to_explog2");
982      break;
983   case ir_unop_sin:
984   case ir_unop_sin_reduced:
985      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
986      break;
987   case ir_unop_cos:
988   case ir_unop_cos_reduced:
989      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
990      break;
991
992   case ir_unop_dFdx:
993   case ir_unop_dFdy:
994      assert(!"derivatives not valid in vertex shader");
995      break;
996
997   case ir_unop_noise:
998      assert(!"not reached: should be handled by lower_noise");
999      break;
1000
1001   case ir_binop_add:
1002      emit(ADD(result_dst, op[0], op[1]));
1003      break;
1004   case ir_binop_sub:
1005      assert(!"not reached: should be handled by ir_sub_to_add_neg");
1006      break;
1007
1008   case ir_binop_mul:
1009      if (ir->type->is_integer()) {
1010	 /* For integer multiplication, the MUL uses the low 16 bits
1011	  * of one of the operands (src0 on gen6, src1 on gen7).  The
1012	  * MACH accumulates in the contribution of the upper 16 bits
1013	  * of that operand.
1014	  *
1015	  * FINISHME: Emit just the MUL if we know an operand is small
1016	  * enough.
1017	  */
1018	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1019
1020	 emit(MUL(acc, op[0], op[1]));
1021	 emit(MACH(dst_null_d(), op[0], op[1]));
1022	 emit(MOV(result_dst, src_reg(acc)));
1023      } else {
1024	 emit(MUL(result_dst, op[0], op[1]));
1025      }
1026      break;
1027   case ir_binop_div:
1028      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
1029   case ir_binop_mod:
1030      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1031      break;
1032
1033   case ir_binop_less:
1034   case ir_binop_greater:
1035   case ir_binop_lequal:
1036   case ir_binop_gequal:
1037   case ir_binop_equal:
1038   case ir_binop_nequal: {
1039      dst_reg temp = result_dst;
1040      /* original gen4 does implicit conversion before comparison. */
1041      if (intel->gen < 5)
1042	 temp.type = op[0].type;
1043
1044      emit(CMP(temp, op[0], op[1],
1045	       brw_conditional_for_comparison(ir->operation)));
1046      emit(AND(result_dst, this->result, src_reg(0x1)));
1047      break;
1048   }
1049
1050   case ir_binop_all_equal:
1051      /* "==" operator producing a scalar boolean. */
1052      if (ir->operands[0]->type->is_vector() ||
1053	  ir->operands[1]->type->is_vector()) {
1054	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1055	 emit(MOV(result_dst, src_reg(0)));
1056	 inst = emit(MOV(result_dst, src_reg(1)));
1057	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1058      } else {
1059	 dst_reg temp = result_dst;
1060	 /* original gen4 does implicit conversion before comparison. */
1061	 if (intel->gen < 5)
1062	    temp.type = op[0].type;
1063
1064	 emit(CMP(temp, op[0], op[1], BRW_CONDITIONAL_Z));
1065	 emit(AND(result_dst, result_src, src_reg(0x1)));
1066      }
1067      break;
1068   case ir_binop_any_nequal:
1069      /* "!=" operator producing a scalar boolean. */
1070      if (ir->operands[0]->type->is_vector() ||
1071	  ir->operands[1]->type->is_vector()) {
1072	 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1073
1074	 emit(MOV(result_dst, src_reg(0)));
1075	 inst = emit(MOV(result_dst, src_reg(1)));
1076	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1077      } else {
1078	 dst_reg temp = result_dst;
1079	 /* original gen4 does implicit conversion before comparison. */
1080	 if (intel->gen < 5)
1081	    temp.type = op[0].type;
1082
1083	 emit(CMP(temp, op[0], op[1], BRW_CONDITIONAL_NZ));
1084	 emit(AND(result_dst, result_src, src_reg(0x1)));
1085      }
1086      break;
1087
1088   case ir_unop_any:
1089      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1090      emit(MOV(result_dst, src_reg(0)));
1091
1092      inst = emit(MOV(result_dst, src_reg(1)));
1093      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1094      break;
1095
1096   case ir_binop_logic_xor:
1097      emit(XOR(result_dst, op[0], op[1]));
1098      break;
1099
1100   case ir_binop_logic_or:
1101      emit(OR(result_dst, op[0], op[1]));
1102      break;
1103
1104   case ir_binop_logic_and:
1105      emit(AND(result_dst, op[0], op[1]));
1106      break;
1107
1108   case ir_binop_dot:
1109      assert(ir->operands[0]->type->is_vector());
1110      assert(ir->operands[0]->type == ir->operands[1]->type);
1111      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1112      break;
1113
1114   case ir_unop_sqrt:
1115      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1116      break;
1117   case ir_unop_rsq:
1118      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1119      break;
1120   case ir_unop_i2f:
1121   case ir_unop_i2u:
1122   case ir_unop_u2i:
1123   case ir_unop_u2f:
1124   case ir_unop_b2f:
1125   case ir_unop_b2i:
1126   case ir_unop_f2i:
1127      emit(MOV(result_dst, op[0]));
1128      break;
1129   case ir_unop_f2b:
1130   case ir_unop_i2b: {
1131      dst_reg temp = result_dst;
1132      /* original gen4 does implicit conversion before comparison. */
1133      if (intel->gen < 5)
1134	 temp.type = op[0].type;
1135
1136      emit(CMP(temp, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1137      emit(AND(result_dst, result_src, src_reg(1)));
1138      break;
1139   }
1140
1141   case ir_unop_trunc:
1142      emit(RNDZ(result_dst, op[0]));
1143      break;
1144   case ir_unop_ceil:
1145      op[0].negate = !op[0].negate;
1146      inst = emit(RNDD(result_dst, op[0]));
1147      this->result.negate = true;
1148      break;
1149   case ir_unop_floor:
1150      inst = emit(RNDD(result_dst, op[0]));
1151      break;
1152   case ir_unop_fract:
1153      inst = emit(FRC(result_dst, op[0]));
1154      break;
1155   case ir_unop_round_even:
1156      emit(RNDE(result_dst, op[0]));
1157      break;
1158
1159   case ir_binop_min:
1160      emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
1161
1162      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1163      inst->predicate = BRW_PREDICATE_NORMAL;
1164      break;
1165   case ir_binop_max:
1166      emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_G));
1167
1168      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1169      inst->predicate = BRW_PREDICATE_NORMAL;
1170      break;
1171
1172   case ir_binop_pow:
1173      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1174      break;
1175
1176   case ir_unop_bit_not:
1177      inst = emit(NOT(result_dst, op[0]));
1178      break;
1179   case ir_binop_bit_and:
1180      inst = emit(AND(result_dst, op[0], op[1]));
1181      break;
1182   case ir_binop_bit_xor:
1183      inst = emit(XOR(result_dst, op[0], op[1]));
1184      break;
1185   case ir_binop_bit_or:
1186      inst = emit(OR(result_dst, op[0], op[1]));
1187      break;
1188
1189   case ir_binop_lshift:
1190   case ir_binop_rshift:
1191      assert(!"GLSL 1.30 features unsupported");
1192      break;
1193
1194   case ir_quadop_vector:
1195      assert(!"not reached: should be handled by lower_quadop_vector");
1196      break;
1197   }
1198}
1199
1200
1201void
1202vec4_visitor::visit(ir_swizzle *ir)
1203{
1204   src_reg src;
1205   int i = 0;
1206   int swizzle[4];
1207
1208   /* Note that this is only swizzles in expressions, not those on the left
1209    * hand side of an assignment, which do write masking.  See ir_assignment
1210    * for that.
1211    */
1212
1213   ir->val->accept(this);
1214   src = this->result;
1215   assert(src.file != BAD_FILE);
1216
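   /* The mask components index into the swizzle already on the source, so
    * the two swizzles compose.  For example (hypothetical), taking .x of a
    * value whose current swizzle is YZWX yields channel Y.
    */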
1217   for (i = 0; i < ir->type->vector_elements; i++) {
1218      switch (i) {
1219      case 0:
1220	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1221	 break;
1222      case 1:
1223	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1224	 break;
1225      case 2:
1226	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1227	 break;
1228      case 3:
1229	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1230	    break;
1231      }
1232   }
1233   for (; i < 4; i++) {
1234      /* Replicate the last channel out. */
1235      swizzle[i] = swizzle[ir->type->vector_elements - 1];
1236   }
1237
1238   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1239
1240   this->result = src;
1241}
1242
1243void
1244vec4_visitor::visit(ir_dereference_variable *ir)
1245{
1246   const struct glsl_type *type = ir->type;
1247   dst_reg *reg = variable_storage(ir->var);
1248
1249   if (!reg) {
1250      fail("Failed to find variable storage for %s\n", ir->var->name);
1251      this->result = src_reg(brw_null_reg());
1252      return;
1253   }
1254
1255   this->result = src_reg(*reg);
1256
1257   if (type->is_scalar() || type->is_vector() || type->is_matrix())
1258      this->result.swizzle = swizzle_for_size(type->vector_elements);
1259}
1260
1261void
1262vec4_visitor::visit(ir_dereference_array *ir)
1263{
1264   ir_constant *constant_index;
1265   src_reg src;
1266   int element_size = type_size(ir->type);
1267
1268   constant_index = ir->array_index->constant_expression_value();
1269
1270   ir->array->accept(this);
1271   src = this->result;
1272
1273   if (constant_index) {
1274      src.reg_offset += constant_index->value.i[0] * element_size;
1275   } else {
1276      /* Variable index array dereference.  It eats the "vec4" of the
1277       * base of the array and an index that offsets the Mesa register
1278       * index.
1279       */
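      /* The index is scaled by the element size in vec4 slots.  For example
       * (hypothetical), indexing an array of mat4 multiplies the index by 4.
       */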
1280      ir->array_index->accept(this);
1281
1282      src_reg index_reg;
1283
1284      if (element_size == 1) {
1285	 index_reg = this->result;
1286      } else {
1287	 index_reg = src_reg(this, glsl_type::int_type);
1288
1289	 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1290      }
1291
1292      if (src.reladdr) {
1293	 src_reg temp = src_reg(this, glsl_type::int_type);
1294
1295	 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1296
1297	 index_reg = temp;
1298      }
1299
1300      src.reladdr = ralloc(mem_ctx, src_reg);
1301      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1302   }
1303
1304   /* If the type is smaller than a vec4, replicate the last channel out. */
1305   if (ir->type->is_scalar() || ir->type->is_vector())
1306      src.swizzle = swizzle_for_size(ir->type->vector_elements);
1307   else
1308      src.swizzle = BRW_SWIZZLE_NOOP;
1309   src.type = brw_type_for_base_type(ir->type);
1310
1311   this->result = src;
1312}
1313
1314void
1315vec4_visitor::visit(ir_dereference_record *ir)
1316{
1317   unsigned int i;
1318   const glsl_type *struct_type = ir->record->type;
1319   int offset = 0;
1320
1321   ir->record->accept(this);
1322
1323   for (i = 0; i < struct_type->length; i++) {
1324      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1325	 break;
1326      offset += type_size(struct_type->fields.structure[i].type);
1327   }
1328
1329   /* If the type is smaller than a vec4, replicate the last channel out. */
1330   if (ir->type->is_scalar() || ir->type->is_vector())
1331      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1332   else
1333      this->result.swizzle = BRW_SWIZZLE_NOOP;
1334   this->result.type = brw_type_for_base_type(ir->type);
1335
1336   this->result.reg_offset += offset;
1337}
1338
1339/**
1340 * We want to be careful in assignment setup to hit the actual storage
1341 * instead of potentially using a temporary like we might with the
1342 * ir_dereference handler.
1343 */
1344static dst_reg
1345get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1346{
1347   /* The LHS must be a dereference.  If the LHS is a variable indexed array
1348    * access of a vector, it must be separated into a series conditional moves
1349    * before reaching this point (see ir_vec_index_to_cond_assign).
1350    */
1351   assert(ir->as_dereference());
1352   ir_dereference_array *deref_array = ir->as_dereference_array();
1353   if (deref_array) {
1354      assert(!deref_array->array->type->is_vector());
1355   }
1356
1357   /* Use the rvalue deref handler for the most part.  We'll ignore
1358    * swizzles in it and write swizzles using writemask, though.
1359    */
1360   ir->accept(v);
1361   return dst_reg(v->result);
1362}
1363
1364void
1365vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1366			      const struct glsl_type *type, bool predicated)
1367{
1368   if (type->base_type == GLSL_TYPE_STRUCT) {
1369      for (unsigned int i = 0; i < type->length; i++) {
1370	 emit_block_move(dst, src, type->fields.structure[i].type, predicated);
1371      }
1372      return;
1373   }
1374
1375   if (type->is_array()) {
1376      for (unsigned int i = 0; i < type->length; i++) {
1377	 emit_block_move(dst, src, type->fields.array, predicated);
1378      }
1379      return;
1380   }
1381
1382   if (type->is_matrix()) {
1383      const struct glsl_type *vec_type;
1384
1385      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1386					 type->vector_elements, 1);
1387
1388      for (int i = 0; i < type->matrix_columns; i++) {
1389	 emit_block_move(dst, src, vec_type, predicated);
1390      }
1391      return;
1392   }
1393
1394   assert(type->is_scalar() || type->is_vector());
1395
1396   dst->type = brw_type_for_base_type(type);
1397   src->type = dst->type;
1398
1399   dst->writemask = (1 << type->vector_elements) - 1;
1400
1401   /* Do we need to worry about swizzling a swizzle? */
1402   assert(src->swizzle == BRW_SWIZZLE_NOOP);
1403   src->swizzle = swizzle_for_size(type->vector_elements);
1404
1405   vec4_instruction *inst = emit(MOV(*dst, *src));
1406   if (predicated)
1407      inst->predicate = BRW_PREDICATE_NORMAL;
1408
1409   dst->reg_offset++;
1410   src->reg_offset++;
1411}
1412
1413
1414/* If the RHS processing resulted in an instruction generating a
1415 * temporary value, and it would be easy to rewrite the instruction to
1416 * generate its result right into the LHS instead, do so.  This ends
1417 * up reliably removing instructions where it can be tricky to do so
1418 * later without real UD chain information.
1419 */
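/* For example (hypothetical IR), an RHS that ends in "ADD temp, a, b"
 * followed by the assignment's "MOV dst, temp" is rewritten so the ADD
 * writes dst directly and the trailing MOV is never emitted.
 */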
1420bool
1421vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1422				     dst_reg dst,
1423				     src_reg src,
1424				     vec4_instruction *pre_rhs_inst,
1425				     vec4_instruction *last_rhs_inst)
1426{
1427   /* This could be supported, but it would take more smarts. */
1428   if (ir->condition)
1429      return false;
1430
1431   if (pre_rhs_inst == last_rhs_inst)
1432      return false; /* No instructions generated to work with. */
1433
1434   /* Make sure the last instruction generated our source reg. */
1435   if (src.file != GRF ||
1436       src.file != last_rhs_inst->dst.file ||
1437       src.reg != last_rhs_inst->dst.reg ||
1438       src.reg_offset != last_rhs_inst->dst.reg_offset ||
1439       src.reladdr ||
1440       src.abs ||
1441       src.negate ||
1442       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1443      return false;
1444
1445   /* Check that the last instruction fully initialized the channels
1446    * we want to use, in the order we want to use them.  We could
1447    * potentially reswizzle the operands of many instructions so that
1448    * we could handle out of order channels, but don't yet.
1449    */
1450   for (int i = 0; i < 4; i++) {
1451      if (dst.writemask & (1 << i)) {
1452	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1453	    return false;
1454
1455	 if (BRW_GET_SWZ(src.swizzle, i) != i)
1456	    return false;
1457      }
1458   }
1459
1460   /* Success!  Rewrite the instruction. */
1461   last_rhs_inst->dst.file = dst.file;
1462   last_rhs_inst->dst.reg = dst.reg;
1463   last_rhs_inst->dst.reg_offset = dst.reg_offset;
1464   last_rhs_inst->dst.reladdr = dst.reladdr;
1465   last_rhs_inst->dst.writemask &= dst.writemask;
1466
1467   return true;
1468}
1469
1470void
1471vec4_visitor::visit(ir_assignment *ir)
1472{
1473   dst_reg dst = get_assignment_lhs(ir->lhs, this);
1474
1475   if (!ir->lhs->type->is_scalar() &&
1476       !ir->lhs->type->is_vector()) {
1477      ir->rhs->accept(this);
1478      src_reg src = this->result;
1479
1480      if (ir->condition) {
1481	 emit_bool_to_cond_code(ir->condition);
1482      }
1483
1484      emit_block_move(&dst, &src, ir->rhs->type, ir->condition != NULL);
1485      return;
1486   }
1487
1488   /* Now we're down to just a scalar/vector with writemasks. */
1489   int i;
1490
1491   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1492   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1493
1494   ir->rhs->accept(this);
1495
1496   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1497
1498   src_reg src = this->result;
1499
1500   int swizzles[4];
1501   int first_enabled_chan = 0;
1502   int src_chan = 0;
1503
1504   assert(ir->lhs->type->is_vector() ||
1505	  ir->lhs->type->is_scalar());
1506   dst.writemask = ir->write_mask;
1507
1508   for (int i = 0; i < 4; i++) {
1509      if (dst.writemask & (1 << i)) {
1510	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1511	 break;
1512      }
1513   }
1514
1515   /* Swizzle a small RHS vector into the channels being written.
1516    *
1517    * glsl ir treats write_mask as dictating how many channels are
1518    * present on the RHS while in our instructions we need to make
1519    * those channels appear in the slots of the vec4 they're written to.
1520    */
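   /* For example (hypothetical assignment), writing a vec2 RHS into the .yz
    * channels of the LHS produces a swizzle that routes src.x to the y slot
    * and src.y to the z slot; unwritten slots just repeat an enabled channel.
    */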
1521   for (int i = 0; i < 4; i++) {
1522      if (dst.writemask & (1 << i))
1523	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1524      else
1525	 swizzles[i] = first_enabled_chan;
1526   }
1527   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1528			      swizzles[2], swizzles[3]);
1529
1530   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1531      return;
1532   }
1533
1534   if (ir->condition) {
1535      emit_bool_to_cond_code(ir->condition);
1536   }
1537
1538   for (i = 0; i < type_size(ir->lhs->type); i++) {
1539      vec4_instruction *inst = emit(MOV(dst, src));
1540
1541      if (ir->condition)
1542	 inst->predicate = BRW_PREDICATE_NORMAL;
1543
1544      dst.reg_offset++;
1545      src.reg_offset++;
1546   }
1547}
1548
1549void
1550vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1551{
1552   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1553      foreach_list(node, &ir->components) {
1554	 ir_constant *field_value = (ir_constant *)node;
1555
1556	 emit_constant_values(dst, field_value);
1557      }
1558      return;
1559   }
1560
1561   if (ir->type->is_array()) {
1562      for (unsigned int i = 0; i < ir->type->length; i++) {
1563	 emit_constant_values(dst, ir->array_elements[i]);
1564      }
1565      return;
1566   }
1567
1568   if (ir->type->is_matrix()) {
1569      for (int i = 0; i < ir->type->matrix_columns; i++) {
1570	 for (int j = 0; j < ir->type->vector_elements; j++) {
1571	    dst->writemask = 1 << j;
1572	    dst->type = BRW_REGISTER_TYPE_F;
1573
1574	    emit(MOV(*dst,
1575		     src_reg(ir->value.f[i * ir->type->vector_elements + j])));
1576	 }
1577	 dst->reg_offset++;
1578      }
1579      return;
1580   }
1581
1582   for (int i = 0; i < ir->type->vector_elements; i++) {
1583      dst->writemask = 1 << i;
1584      dst->type = brw_type_for_base_type(ir->type);
1585
1586      switch (ir->type->base_type) {
1587      case GLSL_TYPE_FLOAT:
1588	 emit(MOV(*dst, src_reg(ir->value.f[i])));
1589	 break;
1590      case GLSL_TYPE_INT:
1591	 emit(MOV(*dst, src_reg(ir->value.i[i])));
1592	 break;
1593      case GLSL_TYPE_UINT:
1594	 emit(MOV(*dst, src_reg(ir->value.u[i])));
1595	 break;
1596      case GLSL_TYPE_BOOL:
1597	 emit(MOV(*dst, src_reg(ir->value.b[i])));
1598	 break;
1599      default:
1600	 assert(!"Non-float/uint/int/bool constant");
1601	 break;
1602      }
1603   }
1604   dst->reg_offset++;
1605}
1606
1607void
1608vec4_visitor::visit(ir_constant *ir)
1609{
1610   dst_reg dst = dst_reg(this, ir->type);
1611   this->result = src_reg(dst);
1612
1613   emit_constant_values(&dst, ir);
1614}
1615
1616void
1617vec4_visitor::visit(ir_call *ir)
1618{
1619   assert(!"not reached");
1620}
1621
1622void
1623vec4_visitor::visit(ir_texture *ir)
1624{
1625   /* FINISHME: Implement vertex texturing.
1626    *
1627    * With 0 vertex samplers available, the linker will reject
1628    * programs that do vertex texturing, but after our visitor has
1629    * run.
1630    */
1631}
1632
1633void
1634vec4_visitor::visit(ir_return *ir)
1635{
1636   assert(!"not reached");
1637}
1638
1639void
1640vec4_visitor::visit(ir_discard *ir)
1641{
1642   assert(!"not reached");
1643}
1644
1645void
1646vec4_visitor::visit(ir_if *ir)
1647{
1648   /* Don't point the annotation at the if statement, because then it plus
1649    * the then and else blocks get printed.
1650    */
1651   this->base_ir = ir->condition;
1652
1653   if (intel->gen == 6) {
1654      emit_if_gen6(ir);
1655   } else {
1656      emit_bool_to_cond_code(ir->condition);
1657      emit(IF(BRW_PREDICATE_NORMAL));
1658   }
1659
1660   visit_instructions(&ir->then_instructions);
1661
1662   if (!ir->else_instructions.is_empty()) {
1663      this->base_ir = ir->condition;
1664      emit(BRW_OPCODE_ELSE);
1665
1666      visit_instructions(&ir->else_instructions);
1667   }
1668
1669   this->base_ir = ir->condition;
1670   emit(BRW_OPCODE_ENDIF);
1671}
1672
1673int
1674vec4_visitor::emit_vue_header_gen4(int header_mrf)
1675{
1676   /* Get the position */
1677   src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1678
1679   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1680   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1681
1682   current_annotation = "NDC";
1683   dst_reg ndc_w = ndc;
1684   ndc_w.writemask = WRITEMASK_W;
1685   src_reg pos_w = pos;
1686   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1687   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1688
1689   dst_reg ndc_xyz = ndc;
1690   ndc_xyz.writemask = WRITEMASK_XYZ;
1691
1692   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1693
1694   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1695       c->key.nr_userclip || brw->has_negative_rhw_bug) {
1696      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1697      GLuint i;
1698
1699      emit(MOV(header1, 0u));
1700
1701      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1702	 assert(!"finishme: psiz");
1703	 src_reg psiz;
1704
1705	 header1.writemask = WRITEMASK_W;
1706	 emit(MUL(header1, psiz, 1u << 11));
1707	 emit(AND(header1, src_reg(header1), 0x7ff << 8));
1708      }
1709
1710      for (i = 0; i < c->key.nr_userclip; i++) {
1711	 vec4_instruction *inst;
1712
1713	 inst = emit(DP4(dst_null_f(), pos, src_reg(c->userplane[i])));
1714	 inst->conditional_mod = BRW_CONDITIONAL_L;
1715
1716	 inst = emit(OR(header1, src_reg(header1), 1u << i));
1717	 inst->predicate = BRW_PREDICATE_NORMAL;
1718      }
1719
1720      /* i965 clipping workaround:
1721       * 1) Test for -ve rhw
1722       * 2) If set,
1723       *      set ndc = (0,0,0,0)
1724       *      set ucp[6] = 1
1725       *
1726       * Later, clipping will detect ucp[6] and ensure the primitive is
1727       * clipped against all fixed planes.
1728       */
1729      if (brw->has_negative_rhw_bug) {
1730#if 0
1731	 /* FINISHME */
1732	 brw_CMP(p,
1733		 vec8(brw_null_reg()),
1734		 BRW_CONDITIONAL_L,
1735		 brw_swizzle1(ndc, 3),
1736		 brw_imm_f(0));
1737
1738	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1739	 brw_MOV(p, ndc, brw_imm_f(0));
1740	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1741#endif
1742      }
1743
1744      header1.writemask = WRITEMASK_XYZW;
1745      emit(MOV(brw_message_reg(header_mrf++), src_reg(header1)));
1746   } else {
1747      emit(MOV(retype(brw_message_reg(header_mrf++),
1748		      BRW_REGISTER_TYPE_UD), 0u));
1749   }
1750
1751   if (intel->gen == 5) {
1752      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1753       * dword 0-3 (m1) of the header is indices, point width, clip flags.
1754       * dword 4-7 (m2) is the ndc position (set above)
1755       * dword 8-11 (m3) of the vertex header is the 4D space position
1756       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1757       * m6 is a pad so that the vertex element data is aligned
1758       * m7 is the first vertex data we fill.
1759       */
1760      current_annotation = "NDC";
1761      emit(MOV(brw_message_reg(header_mrf++), src_reg(ndc)));
1762
1763      current_annotation = "gl_Position";
1764      emit(MOV(brw_message_reg(header_mrf++), pos));
1765
1766      /* user clip distance. */
1767      header_mrf += 2;
1768
1769      /* Pad so that vertex element data is aligned. */
1770      header_mrf++;
1771   } else {
1772      /* There are 8 dwords in VUE header pre-Ironlake:
1773       * dword 0-3 (m1) is indices, point width, clip flags.
1774       * dword 4-7 (m2) is ndc position (set above)
1775       *
1776       * dword 8-11 (m3) is the first vertex data.
1777       */
1778      current_annotation = "NDC";
1779      emit(MOV(brw_message_reg(header_mrf++), src_reg(ndc)));
1780
1781      current_annotation = "gl_Position";
1782      emit(MOV(brw_message_reg(header_mrf++), pos));
1783   }
1784
1785   return header_mrf;
1786}
1787
1788int
1789vec4_visitor::emit_vue_header_gen6(int header_mrf)
1790{
1791   struct brw_reg reg;
1792
1793   /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1794    * dword 0-3 (m2) of the header is indices, point width, clip flags.
1795    * dword 4-7 (m3) is the 4D space position
1796    * dword 8-15 (m4,m5) of the vertex header is the user clip distance if
1797    * enabled.
1798    *
1799    * m4 or 6 is the first vertex element data we fill.
1800    */
1801
1802   current_annotation = "indices, point width, clip flags";
1803   reg = brw_message_reg(header_mrf++);
1804   emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1805   if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1806      emit(MOV(brw_writemask(reg, WRITEMASK_W),
1807	       src_reg(output_reg[VERT_RESULT_PSIZ])));
1808   }
1809
1810   current_annotation = "gl_Position";
1811   emit(MOV(brw_message_reg(header_mrf++),
1812	    src_reg(output_reg[VERT_RESULT_HPOS])));
1813
1814   current_annotation = "user clip distances";
1815   if (c->key.nr_userclip) {
1816      for (int i = 0; i < c->key.nr_userclip; i++) {
1817	 struct brw_reg m;
1818	 if (i < 4)
1819	    m = brw_message_reg(header_mrf);
1820	 else
1821	    m = brw_message_reg(header_mrf + 1);
1822
1823	 emit(DP4(dst_reg(brw_writemask(m, 1 << (i & 3))),
1824		  src_reg(output_reg[VERT_RESULT_HPOS]),
1825		  src_reg(c->userplane[i])));
1826      }
1827      header_mrf += 2;
1828   }
1829
1830   current_annotation = NULL;
1831
1832   return header_mrf;
1833}
1834
1835static int
1836align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1837{
1838   struct intel_context *intel = &brw->intel;
1839
1840   if (intel->gen >= 6) {
1841      /* URB data written (does not include the message header reg) must
1842       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1843       * section 5.4.3.2.2: URB_INTERLEAVED.
1844       *
1845       * URB entries are allocated on a multiple of 1024 bits, so an
1846       * extra 128 bits written here to make the end align to 256 is
1847       * no problem.
1848       */
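      /* For example, an mlen of 5 is padded to 6 so the write covers a whole
       * number of 256-bit units.
       */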
1849      if ((mlen % 2) != 0)
1850	 mlen++;
1851   }
1852
1853   return mlen;
1854}
1855
1856/**
1857 * Generates the VUE payload plus the 1 or 2 URB write instructions to
1858 * complete the VS thread.
1859 *
1860 * The VUE layout is documented in Volume 2a.
1861 */
1862void
1863vec4_visitor::emit_urb_writes()
1864{
1865   /* MRF 0 is reserved for the debugger, so start with message header
1866    * in MRF 1.
1867    */
1868   int base_mrf = 1;
1869   int mrf = base_mrf;
1870   int urb_entry_size;
1871   uint64_t outputs_remaining = c->prog_data.outputs_written;
1872   /* In the process of generating our URB write message contents, we
1873    * may need to unspill a register or load from an array.  Those
1874    * reads would use MRFs 14-15.
1875    */
1876   int max_usable_mrf = 13;
1877
1878   /* FINISHME: edgeflag */
1879
1880   /* First mrf is the g0-based message header containing URB handles and such,
1881    * which is implied in VS_OPCODE_URB_WRITE.
1882    */
1883   mrf++;
1884
1885   if (intel->gen >= 6) {
1886      mrf = emit_vue_header_gen6(mrf);
1887   } else {
1888      mrf = emit_vue_header_gen4(mrf);
1889   }
1890
1891   /* Set up the VUE data for the first URB write */
1892   int attr;
1893   for (attr = 0; attr < VERT_RESULT_MAX; attr++) {
1894      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1895	 continue;
1896
1897      outputs_remaining &= ~BITFIELD64_BIT(attr);
1898
1899      /* This is set up in the VUE header. */
1900      if (attr == VERT_RESULT_HPOS)
1901	 continue;
1902
1903      /* This is loaded into the VUE header, and thus doesn't occupy
1904       * an attribute slot.
1905       */
1906      if (attr == VERT_RESULT_PSIZ)
1907	 continue;
1908
1909      vec4_instruction *inst = emit(MOV(brw_message_reg(mrf++),
1910					src_reg(output_reg[attr])));
1911
1912      if ((attr == VERT_RESULT_COL0 ||
1913	   attr == VERT_RESULT_COL1 ||
1914	   attr == VERT_RESULT_BFC0 ||
1915	   attr == VERT_RESULT_BFC1) &&
1916	  c->key.clamp_vertex_color) {
1917	 inst->saturate = true;
1918      }
1919
      /* If we just filled the last usable MRF (max_usable_mrf, i.e. MRF
       * 13; MRFs 14-15 are reserved above), we can't fit anything more
       * into this URB WRITE.  With a base_mrf of 1, stopping at MRF 13
       * leaves an even number (12) of URB write data registers, which
       * meets gen6's length-alignment requirement.
       */
1925      if (mrf > max_usable_mrf) {
1926	 attr++;
1927	 break;
1928      }
1929   }
1930
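   /* Emit the first (and possibly only) URB write.  EOT (end of thread)
    * is set only if all outputs fit; otherwise the second write below
    * terminates the thread.
    */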
1931   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
1932   inst->base_mrf = base_mrf;
1933   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1934   inst->eot = !outputs_remaining;
1935
1936   urb_entry_size = mrf - base_mrf;
1937
1938   /* Optional second URB write */
1939   if (outputs_remaining) {
1940      mrf = base_mrf + 1;
1941
1942      for (; attr < VERT_RESULT_MAX; attr++) {
1943	 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1944	    continue;
1945
1946	 assert(mrf < max_usable_mrf);
1947
1948	 emit(MOV(brw_message_reg(mrf++), src_reg(output_reg[attr])));
1949      }
1950
1951      inst = emit(VS_OPCODE_URB_WRITE);
1952      inst->base_mrf = base_mrf;
1953      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1954      inst->eot = true;
      /* URB destination offset.  The previous write used MRFs 1-13;
       * minus the one header MRF, that's 12 regs of URB data.  The URB
       * offset is in URB row increments, and each of our MRFs holds half
       * a row since we're doing interleaved writes, so the second write
       * starts (max_usable_mrf - base_mrf) / 2 == 6 rows in.
       */
1960      inst->offset = (max_usable_mrf - base_mrf) / 2;
1961
1962      urb_entry_size += mrf - base_mrf;
1963   }
1964
1965   if (intel->gen == 6)
1966      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
1967   else
1968      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
1969}
1970
1971src_reg
1972vec4_visitor::get_scratch_offset(vec4_instruction *inst,
1973				 src_reg *reladdr, int reg_offset)
1974{
1975   /* Because we store the values to scratch interleaved like our
1976    * vertex data, we need to scale the vec4 index by 2.
1977    */
1978   int message_header_scale = 2;
1979
1980   /* Pre-gen6, the message header uses byte offsets instead of vec4
1981    * (16-byte) offset units.
1982    */
1983   if (intel->gen < 6)
1984      message_header_scale *= 16;
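   /* For example, reg_offset 3 becomes a message offset of 6 on gen6+
    * (the vec4 index scaled by 2 for interleaving), or 96 bytes on older
    * generations.
    */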
1985
1986   if (reladdr) {
1987      src_reg index = src_reg(this, glsl_type::int_type);
1988
1989      vec4_instruction *add = emit(ADD(dst_reg(index),
1990				       *reladdr,
1991				       src_reg(reg_offset)));
1992      /* Move our new instruction from the tail to its correct place. */
1993      add->remove();
1994      inst->insert_before(add);
1995
1996      vec4_instruction *mul = emit(MUL(dst_reg(index),
1997				       index, src_reg(message_header_scale)));
1998      mul->remove();
1999      inst->insert_before(mul);
2000
2001      return index;
2002   } else {
2003      return src_reg(reg_offset * message_header_scale);
2004   }
2005}
2006
2007src_reg
2008vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2009				       src_reg *reladdr, int reg_offset)
2010{
2011   if (reladdr) {
2012      src_reg index = src_reg(this, glsl_type::int_type);
2013
2014      vec4_instruction *add = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD,
2015							    dst_reg(index),
2016							    *reladdr,
2017							    src_reg(reg_offset));
2018      add->ir = inst->ir;
2019      add->annotation = inst->annotation;
2020      inst->insert_before(add);
2021
2022      /* Pre-gen6, the message header uses byte offsets instead of vec4
2023       * (16-byte) offset units.
2024       */
2025      if (intel->gen < 6) {
2026	 vec4_instruction *mul = new(mem_ctx) vec4_instruction(this,
2027							       BRW_OPCODE_MUL,
2028							       dst_reg(index),
2029							       index,
2030							       src_reg(16));
2031	 mul->ir = inst->ir;
2032	 mul->annotation = inst->annotation;
2033	 inst->insert_before(mul);
2034      }
2035
2036      return index;
2037   } else {
2038      int message_header_scale = intel->gen < 6 ? 16 : 1;
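      /* For example, reg_offset 2 yields a byte offset of 32 pre-gen6,
       * or a vec4 offset of 2 on gen6 and later.
       */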
2039      return src_reg(reg_offset * message_header_scale);
2040   }
2041}
2042
2043/**
2044 * Emits an instruction before @inst to load the value named by @orig_src
2045 * from scratch space at @base_offset to @temp.
2046 */
2047void
2048vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2049				dst_reg temp, src_reg orig_src,
2050				int base_offset)
2051{
2052   int reg_offset = base_offset + orig_src.reg_offset;
2053   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2054
2055   vec4_instruction *scratch_read_inst = emit(VS_OPCODE_SCRATCH_READ,
2056					      temp, index);
2057
2058   scratch_read_inst->base_mrf = 14;
2059   scratch_read_inst->mlen = 1;
2060   /* Move our instruction from the tail to its correct place. */
2061   scratch_read_inst->remove();
2062   inst->insert_before(scratch_read_inst);
2063}
2064
2065/**
2066 * Emits an instruction after @inst to store the value to be written
2067 * to @orig_dst to scratch space at @base_offset, from @temp.
2068 */
2069void
2070vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2071				 src_reg temp, dst_reg orig_dst,
2072				 int base_offset)
2073{
2074   int reg_offset = base_offset + orig_dst.reg_offset;
2075   src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2076
2077   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2078				       orig_dst.writemask));
2079   vec4_instruction *scratch_write_inst = emit(VS_OPCODE_SCRATCH_WRITE,
2080					       dst, temp, index);
2081   scratch_write_inst->base_mrf = 13;
2082   scratch_write_inst->mlen = 2;
2083   scratch_write_inst->predicate = inst->predicate;
2084   /* Move our instruction from the tail to its correct place. */
2085   scratch_write_inst->remove();
2086   inst->insert_after(scratch_write_inst);
2087}
2088
2089/**
2090 * We can't generally support array access in GRF space, because a
2091 * single instruction's destination can only span 2 contiguous
2092 * registers.  So, we send all GRF arrays that get variable index
2093 * access to scratch space.
2094 */
2095void
2096vec4_visitor::move_grf_array_access_to_scratch()
2097{
2098   int scratch_loc[this->virtual_grf_count];
2099
2100   for (int i = 0; i < this->virtual_grf_count; i++) {
2101      scratch_loc[i] = -1;
2102   }
2103
2104   /* First, calculate the set of virtual GRFs that need to be punted
2105    * to scratch due to having any array access on them, and where in
2106    * scratch.
2107    */
2108   foreach_list(node, &this->instructions) {
2109      vec4_instruction *inst = (vec4_instruction *)node;
2110
2111      if (inst->dst.file == GRF && inst->dst.reladdr &&
2112	  scratch_loc[inst->dst.reg] == -1) {
2113	 scratch_loc[inst->dst.reg] = c->last_scratch;
2114	 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2115      }
2116
2117      for (int i = 0 ; i < 3; i++) {
2118	 src_reg *src = &inst->src[i];
2119
2120	 if (src->file == GRF && src->reladdr &&
2121	     scratch_loc[src->reg] == -1) {
2122	    scratch_loc[src->reg] = c->last_scratch;
2123	    c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2124	 }
2125      }
2126   }
2127
2128   /* Now, for anything that will be accessed through scratch, rewrite
2129    * it to load/store.  Note that this is a _safe list walk, because
2130    * we may generate a new scratch_write instruction after the one
2131    * we're processing.
2132    */
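   /* For example, an instruction whose destination lives in one of these
    * VGRFs gets rewritten to write a fresh temporary instead, followed by
    * a scratch write of that temporary to the VGRF's scratch location;
    * sources from such VGRFs are likewise replaced by a scratch read into
    * a temporary.
    */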
2133   foreach_list_safe(node, &this->instructions) {
2134      vec4_instruction *inst = (vec4_instruction *)node;
2135
      /* Set up the annotation tracking for newly generated instructions. */
2137      base_ir = inst->ir;
2138      current_annotation = inst->annotation;
2139
2140      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2141	 src_reg temp = src_reg(this, glsl_type::vec4_type);
2142
2143	 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2144
2145	 inst->dst.file = temp.file;
2146	 inst->dst.reg = temp.reg;
2147	 inst->dst.reg_offset = temp.reg_offset;
2148	 inst->dst.reladdr = NULL;
2149      }
2150
2151      for (int i = 0 ; i < 3; i++) {
2152	 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2153	    continue;
2154
2155	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2156
2157	 emit_scratch_read(inst, temp, inst->src[i],
2158			   scratch_loc[inst->src[i].reg]);
2159
2160	 inst->src[i].file = temp.file;
2161	 inst->src[i].reg = temp.reg;
2162	 inst->src[i].reg_offset = temp.reg_offset;
2163	 inst->src[i].reladdr = NULL;
2164      }
2165   }
2166}
2167
2168/**
2169 * Emits an instruction before @inst to load the value named by @orig_src
2170 * from the pull constant buffer (surface) at @base_offset to @temp.
2171 */
2172void
2173vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2174				      dst_reg temp, src_reg orig_src,
2175				      int base_offset)
2176{
2177   int reg_offset = base_offset + orig_src.reg_offset;
2178   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2179   vec4_instruction *load;
2180
2181   load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2182					temp, index);
2183   load->annotation = inst->annotation;
2184   load->ir = inst->ir;
2185   load->base_mrf = 14;
2186   load->mlen = 1;
2187   inst->insert_before(load);
2188}
2189
2190/**
2191 * Implements array access of uniforms by inserting a
2192 * PULL_CONSTANT_LOAD instruction.
2193 *
 * Unlike temporary GRF array access, which we don't support because of
 * the difficulty of doing relative addressing on instruction
 * destinations, we could in principle support array access of uniforms
 * that were loaded into GRF space as push constants.  In the real-world
 * usage we've seen, though, the arrays involved are always larger than
 * we could load as push constants, so we just always move uniform array
 * access out to a pull constant buffer.
2201 */
2202void
2203vec4_visitor::move_uniform_array_access_to_pull_constants()
2204{
2205   int pull_constant_loc[this->uniforms];
2206
2207   for (int i = 0; i < this->uniforms; i++) {
2208      pull_constant_loc[i] = -1;
2209   }
2210
2211   /* Walk through and find array access of uniforms.  Put a copy of that
2212    * uniform in the pull constant buffer.
2213    *
2214    * Note that we don't move constant-indexed accesses to arrays.  No
2215    * testing has been done of the performance impact of this choice.
2216    */
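   /* For example, an array-indexed uniform source (UNIFORM file with a
    * reladdr) gets its whole array copied into pull_param, and the access
    * is rewritten as a VS_OPCODE_PULL_CONSTANT_LOAD into a temporary that
    * the instruction then reads instead.
    */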
2217   foreach_list_safe(node, &this->instructions) {
2218      vec4_instruction *inst = (vec4_instruction *)node;
2219
2220      for (int i = 0 ; i < 3; i++) {
2221	 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2222	    continue;
2223
2224	 int uniform = inst->src[i].reg;
2225
2226	 /* If this array isn't already present in the pull constant buffer,
2227	  * add it.
2228	  */
2229	 if (pull_constant_loc[uniform] == -1) {
2230	    const float **values = &prog_data->param[uniform * 4];
2231
2232	    pull_constant_loc[uniform] = prog_data->nr_pull_params;
2233
2234	    for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2235	       prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2236	    }
2237	 }
2238
	 /* Set up the annotation tracking for newly generated instructions. */
2240	 base_ir = inst->ir;
2241	 current_annotation = inst->annotation;
2242
2243	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2244
2245	 emit_pull_constant_load(inst, temp, inst->src[i],
2246				 pull_constant_loc[uniform]);
2247
2248	 inst->src[i].file = temp.file;
2249	 inst->src[i].reg = temp.reg;
2250	 inst->src[i].reg_offset = temp.reg_offset;
2251	 inst->src[i].reladdr = NULL;
2252      }
2253   }
2254
2255   /* Now there are no accesses of the UNIFORM file with a reladdr, so
2256    * no need to track them as larger-than-vec4 objects.  This will be
2257    * relied on in cutting out unused uniform vectors from push
2258    * constants.
2259    */
2260   split_uniform_registers();
2261}
2262
2263vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2264			   struct gl_shader_program *prog,
2265			   struct brw_shader *shader)
2266{
2267   this->c = c;
2268   this->p = &c->func;
2269   this->brw = p->brw;
2270   this->intel = &brw->intel;
2271   this->ctx = &intel->ctx;
2272   this->prog = prog;
2273   this->shader = shader;
2274
2275   this->mem_ctx = ralloc_context(NULL);
2276   this->failed = false;
2277
2278   this->base_ir = NULL;
2279   this->current_annotation = NULL;
2280
2282   this->vp = prog->VertexProgram;
2283   this->prog_data = &c->prog_data;
2284
2285   this->variable_ht = hash_table_ctor(0,
2286				       hash_table_pointer_hash,
2287				       hash_table_pointer_compare);
2288
2289   this->virtual_grf_def = NULL;
2290   this->virtual_grf_use = NULL;
2291   this->virtual_grf_sizes = NULL;
2292   this->virtual_grf_count = 0;
2293   this->virtual_grf_array_size = 0;
2294   this->live_intervals_valid = false;
2295
2296   this->uniforms = 0;
2301}
2302
2303vec4_visitor::~vec4_visitor()
2304{
2305   ralloc_free(this->mem_ctx);
2306   hash_table_dtor(this->variable_ht);
2307}
2308
2309
2310void
2311vec4_visitor::fail(const char *format, ...)
2312{
2313   va_list va;
2314   char *msg;
2315
2316   if (failed)
2317      return;
2318
2319   failed = true;
2320
2321   va_start(va, format);
2322   msg = ralloc_vasprintf(mem_ctx, format, va);
2323   va_end(va);
2324   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2325
2326   this->fail_msg = msg;
2327
2328   if (INTEL_DEBUG & DEBUG_VS) {
2329      fprintf(stderr, "%s",  msg);
2330   }
2331}
2332
2333} /* namespace brw */
2334