brw_vec4_visitor.cpp revision ead7ffc62a99c83c3f41a3f229cfbb9ed1826df0
1/*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_vec4.h"
25extern "C" {
26#include "main/macros.h"
27#include "program/prog_parameter.h"
28}
29
30namespace brw {
31
32src_reg::src_reg(dst_reg reg)
33{
34   init();
35
36   this->file = reg.file;
37   this->reg = reg.reg;
38   this->reg_offset = reg.reg_offset;
39   this->type = reg.type;
40   this->reladdr = reg.reladdr;
41   this->fixed_hw_reg = reg.fixed_hw_reg;
42
43   int swizzles[4];
44   int next_chan = 0;
45   int last = 0;
46
47   for (int i = 0; i < 4; i++) {
48      if (!(reg.writemask & (1 << i)))
49	 continue;
50
51      swizzles[next_chan++] = last = i;
52   }
53
54   for (; next_chan < 4; next_chan++) {
55      swizzles[next_chan] = last;
56   }
57
58   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
59				swizzles[2], swizzles[3]);
60}
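/* A worked example of the conversion above (illustrative values): a
 * dst_reg with writemask XZ enumerates enabled channels 0 and 2, so
 * swizzles[] becomes { 0, 2, 2, 2 } once the last enabled channel is
 * replicated, and the resulting src_reg reads .xzzz.
 */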
61
62dst_reg::dst_reg(src_reg reg)
63{
64   init();
65
66   this->file = reg.file;
67   this->reg = reg.reg;
68   this->reg_offset = reg.reg_offset;
69   this->type = reg.type;
70   this->writemask = WRITEMASK_XYZW;
71   this->reladdr = reg.reladdr;
72   this->fixed_hw_reg = reg.fixed_hw_reg;
73}
74
75vec4_instruction::vec4_instruction(vec4_visitor *v,
76				   enum opcode opcode, dst_reg dst,
77				   src_reg src0, src_reg src1, src_reg src2)
78{
79   this->opcode = opcode;
80   this->dst = dst;
81   this->src[0] = src0;
82   this->src[1] = src1;
83   this->src[2] = src2;
84   this->ir = v->base_ir;
85   this->annotation = v->current_annotation;
86}
87
88vec4_instruction *
89vec4_visitor::emit(vec4_instruction *inst)
90{
91   this->instructions.push_tail(inst);
92
93   return inst;
94}
95
96vec4_instruction *
97vec4_visitor::emit(enum opcode opcode, dst_reg dst,
98		   src_reg src0, src_reg src1, src_reg src2)
99{
100   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
101					     src0, src1, src2));
102}
103
104
105vec4_instruction *
106vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
107{
108   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
109}
110
111vec4_instruction *
112vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
113{
114   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
115}
116
117vec4_instruction *
118vec4_visitor::emit(enum opcode opcode)
119{
120   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
121}
122
123#define ALU1(op)							\
124   vec4_instruction *							\
125   vec4_visitor::op(dst_reg dst, src_reg src0)				\
126   {									\
127      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
128					   src0);			\
129   }
130
131#define ALU2(op)							\
132   vec4_instruction *							\
133   vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1)		\
134   {									\
135      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,	\
136					   src0, src1);			\
137   }
138
139ALU1(NOT)
140ALU1(MOV)
141ALU1(FRC)
142ALU1(RNDD)
143ALU1(RNDE)
144ALU1(RNDZ)
145ALU2(ADD)
146ALU2(MUL)
147ALU2(MACH)
148ALU2(AND)
149ALU2(OR)
150ALU2(XOR)
151ALU2(DP3)
152ALU2(DP4)
153
154/** Gen4 predicated IF. */
155vec4_instruction *
156vec4_visitor::IF(uint32_t predicate)
157{
158   vec4_instruction *inst;
159
160   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
161   inst->predicate = predicate;
162
163   return inst;
164}
165
166/** Gen6+ IF with embedded comparison. */
167vec4_instruction *
168vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
169{
170   assert(intel->gen >= 6);
171
172   vec4_instruction *inst;
173
174   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
175					src0, src1);
176   inst->conditional_mod = condition;
177
178   return inst;
179}
180
181vec4_instruction *
182vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
183{
184   vec4_instruction *inst;
185
186   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst,
187					src0, src1, src_reg());
188   inst->conditional_mod = condition;
189
190   return inst;
191}
192
193void
194vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
195{
196   static enum opcode dot_opcodes[] = {
197      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
198   };
199
200   emit(dot_opcodes[elements - 2], dst, src0, src1);
201}
202
203void
204vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
205{
206   /* The gen6 math instruction ignores the source modifiers --
207    * swizzle, abs, negate, and at least some parts of the register
208    * region description.
209    *
210    * While it would seem that this MOV could be avoided at this point
211    * in the case that the swizzle is matched up with the destination
212    * writemask, note that uniform packing and register allocation
213    * could rearrange our swizzle, so let's leave this matter up to
214    * copy propagation later.
215    */
216   src_reg temp_src = src_reg(this, glsl_type::vec4_type);
217   emit(BRW_OPCODE_MOV, dst_reg(temp_src), src);
218
219   if (dst.writemask != WRITEMASK_XYZW) {
220      /* The gen6 math instruction must be align1, so we can't do
221       * writemasks.
222       */
223      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
224
225      emit(opcode, temp_dst, temp_src);
226
227      emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
228   } else {
229      emit(opcode, dst, temp_src);
230   }
231}
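/* As a rough sketch (hypothetical registers, not literal output), the
 * gen6 path above turns "dst.xy = rcp(src)" into:
 *
 *    MOV  tmp0, src          // hide swizzle/abs/negate from math
 *    RCP  tmp1, tmp0         // align1 math, full writemask only
 *    MOV  dst.xy, tmp1       // apply the real writemask
 *
 * The extra MOVs exist only to keep source modifiers and the writemask
 * away from the math instruction; copy propagation may later remove
 * the ones that turn out to be unnecessary.
 */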
232
233void
234vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
235{
236   vec4_instruction *inst = emit(opcode, dst, src);
237   inst->base_mrf = 1;
238   inst->mlen = 1;
239}
240
241void
242vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
243{
244   switch (opcode) {
245   case SHADER_OPCODE_RCP:
246   case SHADER_OPCODE_RSQ:
247   case SHADER_OPCODE_SQRT:
248   case SHADER_OPCODE_EXP2:
249   case SHADER_OPCODE_LOG2:
250   case SHADER_OPCODE_SIN:
251   case SHADER_OPCODE_COS:
252      break;
253   default:
254      assert(!"not reached: bad math opcode");
255      return;
256   }
257
258   if (intel->gen >= 6) {
259      return emit_math1_gen6(opcode, dst, src);
260   } else {
261      return emit_math1_gen4(opcode, dst, src);
262   }
263}
264
265void
266vec4_visitor::emit_math2_gen6(enum opcode opcode,
267			      dst_reg dst, src_reg src0, src_reg src1)
268{
269   src_reg expanded;
270
271   /* The gen6 math instruction ignores the source modifiers --
272    * swizzle, abs, negate, and at least some parts of the register
273    * region description.  Move the sources to temporaries to make it
274    * generally work.
275    */
276
277   expanded = src_reg(this, glsl_type::vec4_type);
278   emit(BRW_OPCODE_MOV, dst_reg(expanded), src0);
279   src0 = expanded;
280
281   expanded = src_reg(this, glsl_type::vec4_type);
282   emit(BRW_OPCODE_MOV, dst_reg(expanded), src1);
283   src1 = expanded;
284
285   if (dst.writemask != WRITEMASK_XYZW) {
286      /* The gen6 math instruction must be align1, so we can't do
287       * writemasks.
288       */
289      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
290
291      emit(opcode, temp_dst, src0, src1);
292
293      emit(BRW_OPCODE_MOV, dst, src_reg(temp_dst));
294   } else {
295      emit(opcode, dst, src0, src1);
296   }
297}
298
299void
300vec4_visitor::emit_math2_gen4(enum opcode opcode,
301			      dst_reg dst, src_reg src0, src_reg src1)
302{
303   vec4_instruction *inst = emit(opcode, dst, src0, src1);
304   inst->base_mrf = 1;
305   inst->mlen = 2;
306}
307
308void
309vec4_visitor::emit_math(enum opcode opcode,
310			dst_reg dst, src_reg src0, src_reg src1)
311{
312   assert(opcode == SHADER_OPCODE_POW);
313
314   if (intel->gen >= 6) {
315      return emit_math2_gen6(opcode, dst, src0, src1);
316   } else {
317      return emit_math2_gen4(opcode, dst, src0, src1);
318   }
319}
320
321void
322vec4_visitor::visit_instructions(const exec_list *list)
323{
324   foreach_list(node, list) {
325      ir_instruction *ir = (ir_instruction *)node;
326
327      base_ir = ir;
328      ir->accept(this);
329   }
330}
331
332
333static int
334type_size(const struct glsl_type *type)
335{
336   unsigned int i;
337   int size;
338
339   switch (type->base_type) {
340   case GLSL_TYPE_UINT:
341   case GLSL_TYPE_INT:
342   case GLSL_TYPE_FLOAT:
343   case GLSL_TYPE_BOOL:
344      if (type->is_matrix()) {
345	 return type->matrix_columns;
346      } else {
347	 /* Regardless of size of vector, it gets a vec4. This is bad
348	  * packing for things like floats, but otherwise arrays become a
349	  * mess.  Hopefully a later pass over the code can pack scalars
350	  * down if appropriate.
351	  */
352	 return 1;
353      }
354   case GLSL_TYPE_ARRAY:
355      assert(type->length > 0);
356      return type_size(type->fields.array) * type->length;
357   case GLSL_TYPE_STRUCT:
358      size = 0;
359      for (i = 0; i < type->length; i++) {
360	 size += type_size(type->fields.structure[i].type);
361      }
362      return size;
363   case GLSL_TYPE_SAMPLER:
364      /* Samplers take up one slot in UNIFORMS[], but they're baked in
365       * at link time.
366       */
367      return 1;
368   default:
369      assert(0);
370      return 0;
371   }
372}
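/* Some example slot counts from type_size(), where each slot is one
 * vec4-sized register: float -> 1, vec3 -> 1, mat3 -> 3, mat4 -> 4,
 * float[8] -> 8, struct { vec3 a; float b; } -> 2.
 */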
373
374int
375vec4_visitor::virtual_grf_alloc(int size)
376{
377   if (virtual_grf_array_size <= virtual_grf_count) {
378      if (virtual_grf_array_size == 0)
379	 virtual_grf_array_size = 16;
380      else
381	 virtual_grf_array_size *= 2;
382      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
383				   virtual_grf_array_size);
384   }
385   virtual_grf_sizes[virtual_grf_count] = size;
386   return virtual_grf_count++;
387}
388
389src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
390{
391   init();
392
393   this->file = GRF;
394   this->reg = v->virtual_grf_alloc(type_size(type));
395
396   if (type->is_array() || type->is_record()) {
397      this->swizzle = BRW_SWIZZLE_NOOP;
398   } else {
399      this->swizzle = swizzle_for_size(type->vector_elements);
400   }
401
402   this->type = brw_type_for_base_type(type);
403}
404
405dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
406{
407   init();
408
409   this->file = GRF;
410   this->reg = v->virtual_grf_alloc(type_size(type));
411
412   if (type->is_array() || type->is_record()) {
413      this->writemask = WRITEMASK_XYZW;
414   } else {
415      this->writemask = (1 << type->vector_elements) - 1;
416   }
417
418   this->type = brw_type_for_base_type(type);
419}
420
421/* Our support for uniforms is piggy-backed on the struct
422 * gl_vertex_program, because that's where the values actually
423 * get stored, rather than in some global gl_shader_program uniform
424 * store.
425 */
426int
427vec4_visitor::setup_uniform_values(int loc, const glsl_type *type)
428{
429   unsigned int offset = 0;
430   float *values = &this->vp->Base.Parameters->ParameterValues[loc][0].f;
431
432   if (type->is_matrix()) {
433      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
434							type->vector_elements,
435							1);
436
437      for (unsigned int i = 0; i < type->matrix_columns; i++) {
438	 offset += setup_uniform_values(loc + offset, column);
439      }
440
441      return offset;
442   }
443
444   switch (type->base_type) {
445   case GLSL_TYPE_FLOAT:
446   case GLSL_TYPE_UINT:
447   case GLSL_TYPE_INT:
448   case GLSL_TYPE_BOOL:
449      for (unsigned int i = 0; i < type->vector_elements; i++) {
450	 c->prog_data.param[this->uniforms * 4 + i] = &values[i];
451      }
452
453      /* Set up pad elements to get things aligned to a vec4 boundary. */
454      for (unsigned int i = type->vector_elements; i < 4; i++) {
455	 static float zero = 0;
456
457	 c->prog_data.param[this->uniforms * 4 + i] = &zero;
458      }
459
460      /* Track the size of this uniform vector, for future packing of
461       * uniforms.
462       */
463      this->uniform_vector_size[this->uniforms] = type->vector_elements;
464      this->uniforms++;
465
466      return 1;
467
468   case GLSL_TYPE_STRUCT:
469      for (unsigned int i = 0; i < type->length; i++) {
470	 offset += setup_uniform_values(loc + offset,
471					type->fields.structure[i].type);
472      }
473      return offset;
474
475   case GLSL_TYPE_ARRAY:
476      for (unsigned int i = 0; i < type->length; i++) {
477	 offset += setup_uniform_values(loc + offset, type->fields.array);
478      }
479      return offset;
480
481   case GLSL_TYPE_SAMPLER:
482      /* The sampler takes up a slot, but we don't use any values from it. */
483      return 1;
484
485   default:
486      assert(!"not reached");
487      return 0;
488   }
489}
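/* As an illustration of the layout this builds: a vec3 uniform that
 * lands in uniform slot N gets param[N*4 + 0..2] pointed at its three
 * component values and param[N*4 + 3] pointed at the static zero pad,
 * while uniform_vector_size[N] records 3 so a later packing pass could
 * reclaim the unused channel.
 */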
490
491/* Our support for builtin uniforms is even scarier than non-builtin.
492 * It sits on top of the PROG_STATE_VAR parameters that are
493 * automatically updated from GL context state.
494 */
495void
496vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
497{
498   const ir_state_slot *const slots = ir->state_slots;
499   assert(ir->state_slots != NULL);
500
501   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
502      /* This state reference has already been setup by ir_to_mesa,
503       * but we'll get the same index back here.  We can reference
504       * ParameterValues directly, since unlike brw_fs.cpp, we never
505       * add new state references during compile.
506       */
507      int index = _mesa_add_state_reference(this->vp->Base.Parameters,
508					    (gl_state_index *)slots[i].tokens);
509      float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
510
511      this->uniform_vector_size[this->uniforms] = 0;
512      /* Add each of the unique swizzled channels of the element.
513       * This will end up matching the size of the glsl_type of this field.
514       */
515      int last_swiz = -1;
516      for (unsigned int j = 0; j < 4; j++) {
517	 int swiz = GET_SWZ(slots[i].swizzle, j);
518	 last_swiz = swiz;
519
520	 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
521	 if (swiz <= last_swiz)
522	    this->uniform_vector_size[this->uniforms]++;
523      }
524      this->uniforms++;
525   }
526}
527
528dst_reg *
529vec4_visitor::variable_storage(ir_variable *var)
530{
531   return (dst_reg *)hash_table_find(this->variable_ht, var);
532}
533
534void
535vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
536{
537   ir_expression *expr = ir->as_expression();
538
539   if (expr) {
540      src_reg op[2];
541      vec4_instruction *inst;
542
543      assert(expr->get_num_operands() <= 2);
544      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
545	 assert(expr->operands[i]->type->is_scalar());
546
547	 expr->operands[i]->accept(this);
548	 op[i] = this->result;
549      }
550
551      switch (expr->operation) {
552      case ir_unop_logic_not:
553	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], src_reg(1));
554	 inst->conditional_mod = BRW_CONDITIONAL_Z;
555	 break;
556
557      case ir_binop_logic_xor:
558	 inst = emit(BRW_OPCODE_XOR, dst_null_d(), op[0], op[1]);
559	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
560	 break;
561
562      case ir_binop_logic_or:
563	 inst = emit(BRW_OPCODE_OR, dst_null_d(), op[0], op[1]);
564	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
565	 break;
566
567      case ir_binop_logic_and:
568	 inst = emit(BRW_OPCODE_AND, dst_null_d(), op[0], op[1]);
569	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
570	 break;
571
572      case ir_unop_f2b:
573	 if (intel->gen >= 6) {
574	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0.0f));
575	 } else {
576	    inst = emit(BRW_OPCODE_MOV, dst_null_f(), op[0]);
577	 }
578	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
579	 break;
580
581      case ir_unop_i2b:
582	 if (intel->gen >= 6) {
583	    inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
584	 } else {
585	    inst = emit(BRW_OPCODE_MOV, dst_null_d(), op[0]);
586	 }
587	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
588	 break;
589
590      case ir_binop_greater:
591      case ir_binop_gequal:
592      case ir_binop_less:
593      case ir_binop_lequal:
594      case ir_binop_equal:
595      case ir_binop_all_equal:
596      case ir_binop_nequal:
597      case ir_binop_any_nequal:
598	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
599	 inst->conditional_mod =
600	    brw_conditional_for_comparison(expr->operation);
601	 break;
602
603      default:
604	 assert(!"not reached");
605	 break;
606      }
607      return;
608   }
609
610   ir->accept(this);
611
612   if (intel->gen >= 6) {
613      vec4_instruction *inst = emit(BRW_OPCODE_AND, dst_null_d(),
614			       this->result, src_reg(1));
615      inst->conditional_mod = BRW_CONDITIONAL_NZ;
616   } else {
617      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst_null_d(), this->result);
618      inst->conditional_mod = BRW_CONDITIONAL_NZ;
619   }
620}
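/* For instance (illustrative), a scalar condition like "a < b" takes
 * the expression path above and becomes a single
 *
 *    CMP.L.f0 null, a, b
 *
 * leaving the flag register set for whatever predicated instruction
 * (IF, BREAK, or a predicated MOV) the caller emits next.
 */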
621
622/**
623 * Emit a gen6 IF statement with the comparison folded into the IF
624 * instruction.
625 */
626void
627vec4_visitor::emit_if_gen6(ir_if *ir)
628{
629   ir_expression *expr = ir->condition->as_expression();
630
631   if (expr) {
632      src_reg op[2];
633      vec4_instruction *inst;
634      dst_reg temp;
635
636      assert(expr->get_num_operands() <= 2);
637      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
638	 expr->operands[i]->accept(this);
639	 op[i] = this->result;
640      }
641
642      switch (expr->operation) {
643      case ir_unop_logic_not:
644	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
645	 inst->conditional_mod = BRW_CONDITIONAL_Z;
646	 return;
647
648      case ir_binop_logic_xor:
649	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
650	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
651	 return;
652
653      case ir_binop_logic_or:
654	 temp = dst_reg(this, glsl_type::bool_type);
655	 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
656	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
657	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
658	 return;
659
660      case ir_binop_logic_and:
661	 temp = dst_reg(this, glsl_type::bool_type);
662	 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
663	 inst = emit(BRW_OPCODE_IF, dst_null_d(), src_reg(temp), src_reg(0));
664	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
665	 return;
666
667      case ir_unop_f2b:
668	 inst = emit(BRW_OPCODE_IF, dst_null_f(), op[0], src_reg(0));
669	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
670	 return;
671
672      case ir_unop_i2b:
673	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
674	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
675	 return;
676
677      case ir_binop_greater:
678      case ir_binop_gequal:
679      case ir_binop_less:
680      case ir_binop_lequal:
681      case ir_binop_equal:
682      case ir_binop_nequal:
683	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], op[1]);
684	 inst->conditional_mod =
685	    brw_conditional_for_comparison(expr->operation);
686	 return;
687
688      case ir_binop_all_equal:
689	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
690	 inst->conditional_mod = BRW_CONDITIONAL_Z;
691
692	 inst = emit(BRW_OPCODE_IF);
693	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
694	 return;
695
696      case ir_binop_any_nequal:
697	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], op[1]);
698	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
699
700	 inst = emit(BRW_OPCODE_IF);
701	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
702	 return;
703
704      case ir_unop_any:
705	 inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
706	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
707
708	 inst = emit(BRW_OPCODE_IF);
709	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
710	 return;
711
712      default:
713	 assert(!"not reached");
714	 inst = emit(BRW_OPCODE_IF, dst_null_d(), op[0], src_reg(0));
715	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
716	 return;
717      }
718      return;
719   }
720
721   ir->condition->accept(this);
722
723   vec4_instruction *inst = emit(BRW_OPCODE_IF, dst_null_d(),
724			    this->result, src_reg(0));
725   inst->conditional_mod = BRW_CONDITIONAL_NZ;
726}
727
728void
729vec4_visitor::visit(ir_variable *ir)
730{
731   dst_reg *reg = NULL;
732
733   if (variable_storage(ir))
734      return;
735
736   switch (ir->mode) {
737   case ir_var_in:
738      reg = new(mem_ctx) dst_reg(ATTR, ir->location);
739
740      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
741       * come in as floating point conversions of the integer values.
742       */
743      for (int i = ir->location; i < ir->location + type_size(ir->type); i++) {
744	 if (!c->key.gl_fixed_input_size[i])
745	    continue;
746
747	 dst_reg dst = *reg;
748	 dst.writemask = (1 << c->key.gl_fixed_input_size[i]) - 1;
749	 emit(BRW_OPCODE_MUL, dst, src_reg(dst), src_reg(1.0f / 65536.0f));
750      }
751      break;
752
753   case ir_var_out:
754      reg = new(mem_ctx) dst_reg(this, ir->type);
755
756      for (int i = 0; i < type_size(ir->type); i++) {
757	 output_reg[ir->location + i] = *reg;
758	 output_reg[ir->location + i].reg_offset = i;
759	 output_reg[ir->location + i].type = BRW_REGISTER_TYPE_F;
760      }
761      break;
762
763   case ir_var_auto:
764   case ir_var_temporary:
765      reg = new(mem_ctx) dst_reg(this, ir->type);
766      break;
767
768   case ir_var_uniform:
769      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
770
771      /* Track how big the whole uniform variable is, in case we need to put a
772       * copy of its data into pull constants for array access.
773       */
774      this->uniform_size[this->uniforms] = type_size(ir->type);
775
776      if (!strncmp(ir->name, "gl_", 3)) {
777	 setup_builtin_uniform_values(ir);
778      } else {
779	 setup_uniform_values(ir->location, ir->type);
780      }
781      break;
782
783   default:
784      assert(!"not reached");
785   }
786
787   reg->type = brw_type_for_base_type(ir->type);
788   hash_table_insert(this->variable_ht, reg, ir);
789}
790
791void
792vec4_visitor::visit(ir_loop *ir)
793{
794   dst_reg counter;
795
796   /* We don't want debugging output to print the whole body of the
797    * loop as the annotation.
798    */
799   this->base_ir = NULL;
800
801   if (ir->counter != NULL) {
802      this->base_ir = ir->counter;
803      ir->counter->accept(this);
804      counter = *(variable_storage(ir->counter));
805
806      if (ir->from != NULL) {
807	 this->base_ir = ir->from;
808	 ir->from->accept(this);
809
810	 emit(BRW_OPCODE_MOV, counter, this->result);
811      }
812   }
813
814   emit(BRW_OPCODE_DO);
815
816   if (ir->to) {
817      this->base_ir = ir->to;
818      ir->to->accept(this);
819
820      vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst_null_d(),
821				    src_reg(counter), this->result);
822      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
823
824      inst = emit(BRW_OPCODE_BREAK);
825      inst->predicate = BRW_PREDICATE_NORMAL;
826   }
827
828   visit_instructions(&ir->body_instructions);
829
830
831   if (ir->increment) {
832      this->base_ir = ir->increment;
833      ir->increment->accept(this);
834      emit(BRW_OPCODE_ADD, counter, src_reg(counter), this->result);
835   }
836
837   emit(BRW_OPCODE_WHILE);
838}
839
840void
841vec4_visitor::visit(ir_loop_jump *ir)
842{
843   switch (ir->mode) {
844   case ir_loop_jump::jump_break:
845      emit(BRW_OPCODE_BREAK);
846      break;
847   case ir_loop_jump::jump_continue:
848      emit(BRW_OPCODE_CONTINUE);
849      break;
850   }
851}
852
853
854void
855vec4_visitor::visit(ir_function_signature *ir)
856{
857   assert(0);
858   (void)ir;
859}
860
861void
862vec4_visitor::visit(ir_function *ir)
863{
864   /* Ignore function bodies other than main() -- we shouldn't see calls to
865    * them since they should all be inlined.
866    */
867   if (strcmp(ir->name, "main") == 0) {
868      const ir_function_signature *sig;
869      exec_list empty;
870
871      sig = ir->matching_signature(&empty);
872
873      assert(sig);
874
875      visit_instructions(&sig->body);
876   }
877}
878
879GLboolean
880vec4_visitor::try_emit_sat(ir_expression *ir)
881{
882   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
883   if (!sat_src)
884      return false;
885
886   sat_src->accept(this);
887   src_reg src = this->result;
888
889   this->result = src_reg(this, ir->type);
890   vec4_instruction *inst;
891   inst = emit(BRW_OPCODE_MOV, dst_reg(this->result), src);
892   inst->saturate = true;
893
894   return true;
895}
896
897void
898vec4_visitor::emit_bool_comparison(unsigned int op,
899				 dst_reg dst, src_reg src0, src_reg src1)
900{
901   /* original gen4 does destination conversion before comparison. */
902   if (intel->gen < 5)
903      dst.type = src0.type;
904
905   vec4_instruction *inst = emit(BRW_OPCODE_CMP, dst, src0, src1);
906   inst->conditional_mod = brw_conditional_for_comparison(op);
907
908   dst.type = BRW_REGISTER_TYPE_D;
909   emit(BRW_OPCODE_AND, dst, src_reg(dst), src_reg(0x1));
910}
911
912void
913vec4_visitor::visit(ir_expression *ir)
914{
915   unsigned int operand;
916   src_reg op[Elements(ir->operands)];
917   src_reg result_src;
918   dst_reg result_dst;
919   vec4_instruction *inst;
920
921   if (try_emit_sat(ir))
922      return;
923
924   for (operand = 0; operand < ir->get_num_operands(); operand++) {
925      this->result.file = BAD_FILE;
926      ir->operands[operand]->accept(this);
927      if (this->result.file == BAD_FILE) {
928	 printf("Failed to get tree for expression operand:\n");
929	 ir->operands[operand]->print();
930	 exit(1);
931      }
932      op[operand] = this->result;
933
934      /* Matrix expression operands should have been broken down to vector
935       * operations already.
936       */
937      assert(!ir->operands[operand]->type->is_matrix());
938   }
939
940   int vector_elements = ir->operands[0]->type->vector_elements;
941   if (ir->operands[1]) {
942      vector_elements = MAX2(vector_elements,
943			     ir->operands[1]->type->vector_elements);
944   }
945
946   this->result.file = BAD_FILE;
947
948   /* Storage for our result.  Ideally for an assignment we'd be using
949    * the actual storage for the result here, instead.
950    */
951   result_src = src_reg(this, ir->type);
952   /* convenience for the emit functions below. */
953   result_dst = dst_reg(result_src);
954   /* If nothing special happens, this is the result. */
955   this->result = result_src;
956   /* Limit writes to the channels that will be used by result_src later.
957    * This does limit this temp's use as a temporary for multi-instruction
958    * sequences.
959    */
960   result_dst.writemask = (1 << ir->type->vector_elements) - 1;
961
962   switch (ir->operation) {
963   case ir_unop_logic_not:
964      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
965       * the ones' complement of the whole register, not just bit 0.
966       */
967      emit(BRW_OPCODE_XOR, result_dst, op[0], src_reg(1));
968      break;
969   case ir_unop_neg:
970      op[0].negate = !op[0].negate;
971      this->result = op[0];
972      break;
973   case ir_unop_abs:
974      op[0].abs = true;
975      op[0].negate = false;
976      this->result = op[0];
977      break;
978
979   case ir_unop_sign:
980      emit(BRW_OPCODE_MOV, result_dst, src_reg(0.0f));
981
982      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
983      inst->conditional_mod = BRW_CONDITIONAL_G;
984      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1.0f));
985      inst->predicate = BRW_PREDICATE_NORMAL;
986
987      inst = emit(BRW_OPCODE_CMP, dst_null_f(), op[0], src_reg(0.0f));
988      inst->conditional_mod = BRW_CONDITIONAL_L;
989      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(-1.0f));
990      inst->predicate = BRW_PREDICATE_NORMAL;
991
992      break;
993
994   case ir_unop_rcp:
995      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
996      break;
997
998   case ir_unop_exp2:
999      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1000      break;
1001   case ir_unop_log2:
1002      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1003      break;
1004   case ir_unop_exp:
1005   case ir_unop_log:
1006      assert(!"not reached: should be handled by ir_explog_to_explog2");
1007      break;
1008   case ir_unop_sin:
1009   case ir_unop_sin_reduced:
1010      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1011      break;
1012   case ir_unop_cos:
1013   case ir_unop_cos_reduced:
1014      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1015      break;
1016
1017   case ir_unop_dFdx:
1018   case ir_unop_dFdy:
1019      assert(!"derivatives not valid in vertex shader");
1020      break;
1021
1022   case ir_unop_noise:
1023      assert(!"not reached: should be handled by lower_noise");
1024      break;
1025
1026   case ir_binop_add:
1027      emit(BRW_OPCODE_ADD, result_dst, op[0], op[1]);
1028      break;
1029   case ir_binop_sub:
1030      assert(!"not reached: should be handled by ir_sub_to_add_neg");
1031      break;
1032
1033   case ir_binop_mul:
1034      if (ir->type->is_integer()) {
1035	 /* For integer multiplication, the MUL uses the low 16 bits
1036	  * of one of the operands (src0 on gen6, src1 on gen7).  The
1037	  * MACH accumulates the contribution of the upper 16 bits
1038	  * of that operand.
1039	  *
1040	  * FINISHME: Emit just the MUL if we know an operand is small
1041	  * enough.
1042	  */
1043	 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1044
1045	 emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
1046	 emit(BRW_OPCODE_MACH, dst_null_d(), op[0], op[1]);
1047	 emit(BRW_OPCODE_MOV, result_dst, src_reg(acc));
1048      } else {
1049	 emit(BRW_OPCODE_MUL, result_dst, op[0], op[1]);
1050      }
1051      break;
1052   case ir_binop_div:
1053      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
1054   case ir_binop_mod:
1055      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
1056      break;
1057
1058   case ir_binop_less:
1059   case ir_binop_greater:
1060   case ir_binop_lequal:
1061   case ir_binop_gequal:
1062   case ir_binop_equal:
1063   case ir_binop_nequal: {
1064      dst_reg temp = result_dst;
1065      /* original gen4 does implicit conversion before comparison. */
1066      if (intel->gen < 5)
1067	 temp.type = op[0].type;
1068
1069      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1070      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
1071      emit(BRW_OPCODE_AND, result_dst, this->result, src_reg(0x1));
1072      break;
1073   }
1074
1075   case ir_binop_all_equal:
1076      /* "==" operator producing a scalar boolean. */
1077      if (ir->operands[0]->type->is_vector() ||
1078	  ir->operands[1]->type->is_vector()) {
1079	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
1080	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1081
1082	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1083	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1084	 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1085      } else {
1086	 dst_reg temp = result_dst;
1087	 /* original gen4 does implicit conversion before comparison. */
1088	 if (intel->gen < 5)
1089	    temp.type = op[0].type;
1090
1091	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1092	 inst->conditional_mod = BRW_CONDITIONAL_Z;
1093	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
1094      }
1095      break;
1096   case ir_binop_any_nequal:
1097      /* "!=" operator producing a scalar boolean. */
1098      if (ir->operands[0]->type->is_vector() ||
1099	  ir->operands[1]->type->is_vector()) {
1100	 inst = emit(BRW_OPCODE_CMP, dst_null_cmp(), op[0], op[1]);
1101	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1102
1103	 emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1104	 inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1105	 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1106      } else {
1107	 dst_reg temp = result_dst;
1108	 /* original gen4 does implicit conversion before comparison. */
1109	 if (intel->gen < 5)
1110	    temp.type = op[0].type;
1111
1112	 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
1113	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1114	 emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(0x1));
1115      }
1116      break;
1117
1118   case ir_unop_any:
1119      inst = emit(BRW_OPCODE_CMP, dst_null_d(), op[0], src_reg(0));
1120      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1121
1122      emit(BRW_OPCODE_MOV, result_dst, src_reg(0));
1123
1124      inst = emit(BRW_OPCODE_MOV, result_dst, src_reg(1));
1125      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1126      break;
1127
1128   case ir_binop_logic_xor:
1129      emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
1130      break;
1131
1132   case ir_binop_logic_or:
1133      emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
1134      break;
1135
1136   case ir_binop_logic_and:
1137      emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
1138      break;
1139
1140   case ir_binop_dot:
1141      assert(ir->operands[0]->type->is_vector());
1142      assert(ir->operands[0]->type == ir->operands[1]->type);
1143      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1144      break;
1145
1146   case ir_unop_sqrt:
1147      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1148      break;
1149   case ir_unop_rsq:
1150      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1151      break;
1152   case ir_unop_i2f:
1153   case ir_unop_i2u:
1154   case ir_unop_u2i:
1155   case ir_unop_u2f:
1156   case ir_unop_b2f:
1157   case ir_unop_b2i:
1158   case ir_unop_f2i:
1159      emit(BRW_OPCODE_MOV, result_dst, op[0]);
1160      break;
1161   case ir_unop_f2b:
1162   case ir_unop_i2b: {
1163      dst_reg temp = result_dst;
1164      /* original gen4 does implicit conversion before comparison. */
1165      if (intel->gen < 5)
1166	 temp.type = op[0].type;
1167
1168      inst = emit(BRW_OPCODE_CMP, temp, op[0], src_reg(0.0f));
1169      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1170      inst = emit(BRW_OPCODE_AND, result_dst, result_src, src_reg(1));
1171      break;
1172   }
1173
1174   case ir_unop_trunc:
1175      emit(BRW_OPCODE_RNDZ, result_dst, op[0]);
1176      break;
1177   case ir_unop_ceil:
1178      op[0].negate = !op[0].negate;
1179      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
1180      this->result.negate = true;
1181      break;
1182   case ir_unop_floor:
1183      inst = emit(BRW_OPCODE_RNDD, result_dst, op[0]);
1184      break;
1185   case ir_unop_fract:
1186      inst = emit(BRW_OPCODE_FRC, result_dst, op[0]);
1187      break;
1188   case ir_unop_round_even:
1189      emit(BRW_OPCODE_RNDE, result_dst, op[0]);
1190      break;
1191
1192   case ir_binop_min:
1193      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
1194      inst->conditional_mod = BRW_CONDITIONAL_L;
1195
1196      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1197      inst->predicate = BRW_PREDICATE_NORMAL;
1198      break;
1199   case ir_binop_max:
1200      inst = emit(BRW_OPCODE_CMP, result_dst, op[0], op[1]);
1201      inst->conditional_mod = BRW_CONDITIONAL_G;
1202
1203      inst = emit(BRW_OPCODE_SEL, result_dst, op[0], op[1]);
1204      inst->predicate = BRW_PREDICATE_NORMAL;
1205      break;
1206
1207   case ir_binop_pow:
1208      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1209      break;
1210
1211   case ir_unop_bit_not:
1212      inst = emit(BRW_OPCODE_NOT, result_dst, op[0]);
1213      break;
1214   case ir_binop_bit_and:
1215      inst = emit(BRW_OPCODE_AND, result_dst, op[0], op[1]);
1216      break;
1217   case ir_binop_bit_xor:
1218      inst = emit(BRW_OPCODE_XOR, result_dst, op[0], op[1]);
1219      break;
1220   case ir_binop_bit_or:
1221      inst = emit(BRW_OPCODE_OR, result_dst, op[0], op[1]);
1222      break;
1223
1224   case ir_binop_lshift:
1225   case ir_binop_rshift:
1226      assert(!"GLSL 1.30 features unsupported");
1227      break;
1228
1229   case ir_quadop_vector:
1230      assert(!"not reached: should be handled by lower_quadop_vector");
1231      break;
1232   }
1233}
1234
1235
1236void
1237vec4_visitor::visit(ir_swizzle *ir)
1238{
1239   src_reg src;
1240   int i = 0;
1241   int swizzle[4];
1242
1243   /* Note that this is only swizzles in expressions, not those on the left
1244    * hand side of an assignment, which do write masking.  See ir_assignment
1245    * for that.
1246    */
1247
1248   ir->val->accept(this);
1249   src = this->result;
1250   assert(src.file != BAD_FILE);
1251
1252   for (i = 0; i < ir->type->vector_elements; i++) {
1253      switch (i) {
1254      case 0:
1255	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1256	 break;
1257      case 1:
1258	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1259	 break;
1260      case 2:
1261	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1262	 break;
1263      case 3:
1264	 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1265	 break;
1266      }
1267   }
1268   for (; i < 4; i++) {
1269      /* Replicate the last channel out. */
1270      swizzle[i] = swizzle[ir->type->vector_elements - 1];
1271   }
1272
1273   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1274
1275   this->result = src;
1276}
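/* Swizzle composition example (hypothetical values): if the value being
 * swizzled already reads .yzwx and the IR applies .zy, the loop above
 * picks BRW_GET_SWZ(yzwx, Z) = W and BRW_GET_SWZ(yzwx, Y) = Z, then
 * replicates the last channel, so the resulting src_reg reads .wzzz.
 */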
1277
1278void
1279vec4_visitor::visit(ir_dereference_variable *ir)
1280{
1281   const struct glsl_type *type = ir->type;
1282   dst_reg *reg = variable_storage(ir->var);
1283
1284   if (!reg) {
1285      fail("Failed to find variable storage for %s\n", ir->var->name);
1286      this->result = src_reg(brw_null_reg());
1287      return;
1288   }
1289
1290   this->result = src_reg(*reg);
1291
1292   if (type->is_scalar() || type->is_vector() || type->is_matrix())
1293      this->result.swizzle = swizzle_for_size(type->vector_elements);
1294}
1295
1296void
1297vec4_visitor::visit(ir_dereference_array *ir)
1298{
1299   ir_constant *constant_index;
1300   src_reg src;
1301   int element_size = type_size(ir->type);
1302
1303   constant_index = ir->array_index->constant_expression_value();
1304
1305   ir->array->accept(this);
1306   src = this->result;
1307
1308   if (constant_index) {
1309      src.reg_offset += constant_index->value.i[0] * element_size;
1310   } else {
1311      /* Variable index array dereference.  It eats the "vec4" of the
1312       * base of the array and an index that offsets the Mesa register
1313       * index.
1314       */
1315      ir->array_index->accept(this);
1316
1317      src_reg index_reg;
1318
1319      if (element_size == 1) {
1320	 index_reg = this->result;
1321      } else {
1322	 index_reg = src_reg(this, glsl_type::int_type);
1323
1324	 emit(BRW_OPCODE_MUL, dst_reg(index_reg),
1325	      this->result, src_reg(element_size));
1326      }
1327
1328      if (src.reladdr) {
1329	 src_reg temp = src_reg(this, glsl_type::int_type);
1330
1331	 emit(BRW_OPCODE_ADD, dst_reg(temp), *src.reladdr, index_reg);
1332
1333	 index_reg = temp;
1334      }
1335
1336      src.reladdr = ralloc(mem_ctx, src_reg);
1337      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1338   }
1339
1340   /* If the type is smaller than a vec4, replicate the last channel out. */
1341   if (ir->type->is_scalar() || ir->type->is_vector())
1342      src.swizzle = swizzle_for_size(ir->type->vector_elements);
1343   else
1344      src.swizzle = BRW_SWIZZLE_NOOP;
1345   src.type = brw_type_for_base_type(ir->type);
1346
1347   this->result = src;
1348}
1349
1350void
1351vec4_visitor::visit(ir_dereference_record *ir)
1352{
1353   unsigned int i;
1354   const glsl_type *struct_type = ir->record->type;
1355   int offset = 0;
1356
1357   ir->record->accept(this);
1358
1359   for (i = 0; i < struct_type->length; i++) {
1360      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1361	 break;
1362      offset += type_size(struct_type->fields.structure[i].type);
1363   }
1364
1365   /* If the type is smaller than a vec4, replicate the last channel out. */
1366   if (ir->type->is_scalar() || ir->type->is_vector())
1367      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1368   else
1369      this->result.swizzle = BRW_SWIZZLE_NOOP;
1370   this->result.type = brw_type_for_base_type(ir->type);
1371
1372   this->result.reg_offset += offset;
1373}
1374
1375/**
1376 * We want to be careful in assignment setup to hit the actual storage
1377 * instead of potentially using a temporary like we might with the
1378 * ir_dereference handler.
1379 */
1380static dst_reg
1381get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1382{
1383   /* The LHS must be a dereference.  If the LHS is a variable indexed array
1384    * access of a vector, it must be separated into a series conditional moves
1385    * before reaching this point (see ir_vec_index_to_cond_assign).
1386    */
1387   assert(ir->as_dereference());
1388   ir_dereference_array *deref_array = ir->as_dereference_array();
1389   if (deref_array) {
1390      assert(!deref_array->array->type->is_vector());
1391   }
1392
1393   /* Use the rvalue deref handler for the most part.  We'll ignore
1394    * swizzles in it and write swizzles using writemask, though.
1395    */
1396   ir->accept(v);
1397   return dst_reg(v->result);
1398}
1399
1400void
1401vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1402			      const struct glsl_type *type, bool predicated)
1403{
1404   if (type->base_type == GLSL_TYPE_STRUCT) {
1405      for (unsigned int i = 0; i < type->length; i++) {
1406	 emit_block_move(dst, src, type->fields.structure[i].type, predicated);
1407      }
1408      return;
1409   }
1410
1411   if (type->is_array()) {
1412      for (unsigned int i = 0; i < type->length; i++) {
1413	 emit_block_move(dst, src, type->fields.array, predicated);
1414      }
1415      return;
1416   }
1417
1418   if (type->is_matrix()) {
1419      const struct glsl_type *vec_type;
1420
1421      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1422					 type->vector_elements, 1);
1423
1424      for (int i = 0; i < type->matrix_columns; i++) {
1425	 emit_block_move(dst, src, vec_type, predicated);
1426      }
1427      return;
1428   }
1429
1430   assert(type->is_scalar() || type->is_vector());
1431
1432   dst->type = brw_type_for_base_type(type);
1433   src->type = dst->type;
1434
1435   dst->writemask = (1 << type->vector_elements) - 1;
1436
1437   /* Do we need to worry about swizzling a swizzle? */
1438   assert(src->swizzle == BRW_SWIZZLE_NOOP);
1439   src->swizzle = swizzle_for_size(type->vector_elements);
1440
1441   vec4_instruction *inst = emit(BRW_OPCODE_MOV, *dst, *src);
1442   if (predicated)
1443      inst->predicate = BRW_PREDICATE_NORMAL;
1444
1445   dst->reg_offset++;
1446   src->reg_offset++;
1447}
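/* For example, block-moving a struct { vec3 v; float f; } this way
 * expands into two MOVs, one per vec4 slot, with the writemask and
 * swizzle trimmed to .xyz for the first and .x for the second
 * (illustrative layout); each MOV is predicated when the move is
 * conditional.
 */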
1448
1449
1450/* If the RHS processing resulted in an instruction generating a
1451 * temporary value, and it would be easy to rewrite the instruction to
1452 * generate its result right into the LHS instead, do so.  This ends
1453 * up reliably removing instructions where it can be tricky to do so
1454 * later without real UD chain information.
1455 */
1456bool
1457vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1458				     dst_reg dst,
1459				     src_reg src,
1460				     vec4_instruction *pre_rhs_inst,
1461				     vec4_instruction *last_rhs_inst)
1462{
1463   /* This could be supported, but it would take more smarts. */
1464   if (ir->condition)
1465      return false;
1466
1467   if (pre_rhs_inst == last_rhs_inst)
1468      return false; /* No instructions generated to work with. */
1469
1470   /* Make sure the last instruction generated our source reg. */
1471   if (src.file != GRF ||
1472       src.file != last_rhs_inst->dst.file ||
1473       src.reg != last_rhs_inst->dst.reg ||
1474       src.reg_offset != last_rhs_inst->dst.reg_offset ||
1475       src.reladdr ||
1476       src.abs ||
1477       src.negate ||
1478       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1479      return false;
1480
1481   /* Check that the last instruction fully initialized the channels
1482    * we want to use, in the order we want to use them.  We could
1483    * potentially reswizzle the operands of many instructions so that
1484    * we could handle out of order channels, but don't yet.
1485    */
1486   for (int i = 0; i < 4; i++) {
1487      if (dst.writemask & (1 << i)) {
1488	 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1489	    return false;
1490
1491	 if (BRW_GET_SWZ(src.swizzle, i) != i)
1492	    return false;
1493      }
1494   }
1495
1496   /* Success!  Rewrite the instruction. */
1497   last_rhs_inst->dst.file = dst.file;
1498   last_rhs_inst->dst.reg = dst.reg;
1499   last_rhs_inst->dst.reg_offset = dst.reg_offset;
1500   last_rhs_inst->dst.reladdr = dst.reladdr;
1501   last_rhs_inst->dst.writemask &= dst.writemask;
1502
1503   return true;
1504}
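/* A sketch of the rewrite this enables (hypothetical registers):
 *
 *    ADD  tmp, b, c                ADD  a.xy, b, c
 *    MOV  a.xy, tmp       ==>      (the MOV is never emitted)
 *
 * When the last RHS instruction wrote the temporary that the
 * assignment would otherwise just copy, its destination is retargeted
 * at the LHS and the copy goes away.
 */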
1505
1506void
1507vec4_visitor::visit(ir_assignment *ir)
1508{
1509   dst_reg dst = get_assignment_lhs(ir->lhs, this);
1510
1511   if (!ir->lhs->type->is_scalar() &&
1512       !ir->lhs->type->is_vector()) {
1513      ir->rhs->accept(this);
1514      src_reg src = this->result;
1515
1516      if (ir->condition) {
1517	 emit_bool_to_cond_code(ir->condition);
1518      }
1519
1520      emit_block_move(&dst, &src, ir->rhs->type, ir->condition != NULL);
1521      return;
1522   }
1523
1524   /* Now we're down to just a scalar/vector with writemasks. */
1525   int i;
1526
1527   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1528   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1529
1530   ir->rhs->accept(this);
1531
1532   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1533
1534   src_reg src = this->result;
1535
1536   int swizzles[4];
1537   int first_enabled_chan = 0;
1538   int src_chan = 0;
1539
1540   assert(ir->lhs->type->is_vector() ||
1541	  ir->lhs->type->is_scalar());
1542   dst.writemask = ir->write_mask;
1543
1544   for (int i = 0; i < 4; i++) {
1545      if (dst.writemask & (1 << i)) {
1546	 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1547	 break;
1548      }
1549   }
1550
1551   /* Swizzle a small RHS vector into the channels being written.
1552    *
1553    * glsl ir treats write_mask as dictating how many channels are
1554    * present on the RHS while in our instructions we need to make
1555    * those channels appear in the slots of the vec4 they're written to.
1556    */
1557   for (int i = 0; i < 4; i++) {
1558      if (dst.writemask & (1 << i))
1559	 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1560      else
1561	 swizzles[i] = first_enabled_chan;
1562   }
1563   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1564			      swizzles[2], swizzles[3]);
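   /* For instance (hypothetical values): a vec2 RHS, which reads .xyyy,
    * assigned with write mask .yw becomes .yxyy here, so RHS.x lands in
    * the destination's y channel and RHS.y in its w channel; channels
    * that aren't written just get a harmless filler swizzle.
    */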
1565
1566   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1567      return;
1568   }
1569
1570   if (ir->condition) {
1571      emit_bool_to_cond_code(ir->condition);
1572   }
1573
1574   for (i = 0; i < type_size(ir->lhs->type); i++) {
1575      vec4_instruction *inst = emit(BRW_OPCODE_MOV, dst, src);
1576
1577      if (ir->condition)
1578	 inst->predicate = BRW_PREDICATE_NORMAL;
1579
1580      dst.reg_offset++;
1581      src.reg_offset++;
1582   }
1583}
1584
1585void
1586vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1587{
1588   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1589      foreach_list(node, &ir->components) {
1590	 ir_constant *field_value = (ir_constant *)node;
1591
1592	 emit_constant_values(dst, field_value);
1593      }
1594      return;
1595   }
1596
1597   if (ir->type->is_array()) {
1598      for (unsigned int i = 0; i < ir->type->length; i++) {
1599	 emit_constant_values(dst, ir->array_elements[i]);
1600      }
1601      return;
1602   }
1603
1604   if (ir->type->is_matrix()) {
1605      for (int i = 0; i < ir->type->matrix_columns; i++) {
1606	 for (int j = 0; j < ir->type->vector_elements; j++) {
1607	    dst->writemask = 1 << j;
1608	    dst->type = BRW_REGISTER_TYPE_F;
1609
1610	    emit(BRW_OPCODE_MOV, *dst,
1611		 src_reg(ir->value.f[i * ir->type->vector_elements + j]));
1612	 }
1613	 dst->reg_offset++;
1614      }
1615      return;
1616   }
1617
1618   for (int i = 0; i < ir->type->vector_elements; i++) {
1619      dst->writemask = 1 << i;
1620      dst->type = brw_type_for_base_type(ir->type);
1621
1622      switch (ir->type->base_type) {
1623      case GLSL_TYPE_FLOAT:
1624	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.f[i]));
1625	 break;
1626      case GLSL_TYPE_INT:
1627	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.i[i]));
1628	 break;
1629      case GLSL_TYPE_UINT:
1630	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.u[i]));
1631	 break;
1632      case GLSL_TYPE_BOOL:
1633	 emit(BRW_OPCODE_MOV, *dst, src_reg(ir->value.b[i]));
1634	 break;
1635      default:
1636	 assert(!"Non-float/uint/int/bool constant");
1637	 break;
1638      }
1639   }
1640   dst->reg_offset++;
1641}
1642
1643void
1644vec4_visitor::visit(ir_constant *ir)
1645{
1646   dst_reg dst = dst_reg(this, ir->type);
1647   this->result = src_reg(dst);
1648
1649   emit_constant_values(&dst, ir);
1650}
1651
1652void
1653vec4_visitor::visit(ir_call *ir)
1654{
1655   assert(!"not reached");
1656}
1657
1658void
1659vec4_visitor::visit(ir_texture *ir)
1660{
1661   /* FINISHME: Implement vertex texturing.
1662    *
1663    * With 0 vertex samplers available, the linker will reject
1664    * programs that do vertex texturing, but after our visitor has
1665    * run.
1666    */
1667}
1668
1669void
1670vec4_visitor::visit(ir_return *ir)
1671{
1672   assert(!"not reached");
1673}
1674
1675void
1676vec4_visitor::visit(ir_discard *ir)
1677{
1678   assert(!"not reached");
1679}
1680
1681void
1682vec4_visitor::visit(ir_if *ir)
1683{
1684   /* Don't point the annotation at the if statement, because then it plus
1685    * the then and else blocks get printed.
1686    */
1687   this->base_ir = ir->condition;
1688
1689   if (intel->gen == 6) {
1690      emit_if_gen6(ir);
1691   } else {
1692      emit_bool_to_cond_code(ir->condition);
1693      vec4_instruction *inst = emit(BRW_OPCODE_IF);
1694      inst->predicate = BRW_PREDICATE_NORMAL;
1695   }
1696
1697   visit_instructions(&ir->then_instructions);
1698
1699   if (!ir->else_instructions.is_empty()) {
1700      this->base_ir = ir->condition;
1701      emit(BRW_OPCODE_ELSE);
1702
1703      visit_instructions(&ir->else_instructions);
1704   }
1705
1706   this->base_ir = ir->condition;
1707   emit(BRW_OPCODE_ENDIF);
1708}
1709
1710int
1711vec4_visitor::emit_vue_header_gen4(int header_mrf)
1712{
1713   /* Get the position */
1714   src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
1715
1716   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1717   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1718
1719   current_annotation = "NDC";
1720   dst_reg ndc_w = ndc;
1721   ndc_w.writemask = WRITEMASK_W;
1722   src_reg pos_w = pos;
1723   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1724   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1725
1726   dst_reg ndc_xyz = ndc;
1727   ndc_xyz.writemask = WRITEMASK_XYZ;
1728
1729   emit(BRW_OPCODE_MUL, ndc_xyz, pos, src_reg(ndc_w));
1730
1731   if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1732       c->key.nr_userclip || brw->has_negative_rhw_bug) {
1733      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1734      GLuint i;
1735
1736      emit(BRW_OPCODE_MOV, header1, 0u);
1737
1738      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1739	 assert(!"finishme: psiz");
1740	 src_reg psiz;
1741
1742	 header1.writemask = WRITEMASK_W;
1743	 emit(BRW_OPCODE_MUL, header1, psiz, 1u << 11);
1744	 emit(BRW_OPCODE_AND, header1, src_reg(header1), 0x7ff << 8);
1745      }
1746
1747      for (i = 0; i < c->key.nr_userclip; i++) {
1748	 vec4_instruction *inst;
1749
1750	 inst = emit(BRW_OPCODE_DP4, dst_reg(brw_null_reg()),
1751		     pos, src_reg(c->userplane[i]));
1752	 inst->conditional_mod = BRW_CONDITIONAL_L;
1753
1754	 inst = emit(BRW_OPCODE_OR, header1, src_reg(header1), 1u << i);
1755	 inst->predicate = BRW_PREDICATE_NORMAL;
1756      }
1757
1758      /* i965 clipping workaround:
1759       * 1) Test for -ve rhw
1760       * 2) If set,
1761       *      set ndc = (0,0,0,0)
1762       *      set ucp[6] = 1
1763       *
1764       * Later, clipping will detect ucp[6] and ensure the primitive is
1765       * clipped against all fixed planes.
1766       */
1767      if (brw->has_negative_rhw_bug) {
1768#if 0
1769	 /* FINISHME */
1770	 brw_CMP(p,
1771		 vec8(brw_null_reg()),
1772		 BRW_CONDITIONAL_L,
1773		 brw_swizzle1(ndc, 3),
1774		 brw_imm_f(0));
1775
1776	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1777	 brw_MOV(p, ndc, brw_imm_f(0));
1778	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1779#endif
1780      }
1781
1782      header1.writemask = WRITEMASK_XYZW;
1783      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(header1));
1784   } else {
1785      emit(BRW_OPCODE_MOV, retype(brw_message_reg(header_mrf++),
1786				  BRW_REGISTER_TYPE_UD), 0u);
1787   }
1788
1789   if (intel->gen == 5) {
1790      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1791       * dword 0-3 (m1) of the header is indices, point width, clip flags.
1792       * dword 4-7 (m2) is the ndc position (set above)
1793       * dword 8-11 (m3) of the vertex header is the 4D space position
1794       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1795       * m6 is a pad so that the vertex element data is aligned
1796       * m7 is the first vertex data we fill.
1797       */
1798      current_annotation = "NDC";
1799      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
1800
1801      current_annotation = "gl_Position";
1802      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
1803
1804      /* user clip distance. */
1805      header_mrf += 2;
1806
1807      /* Pad so that vertex element data is aligned. */
1808      header_mrf++;
1809   } else {
1810      /* There are 8 dwords in VUE header pre-Ironlake:
1811       * dword 0-3 (m1) is indices, point width, clip flags.
1812       * dword 4-7 (m2) is ndc position (set above)
1813       *
1814       * dword 8-11 (m3) is the first vertex data.
1815       */
1816      current_annotation = "NDC";
1817      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), src_reg(ndc));
1818
1819      current_annotation = "gl_Position";
1820      emit(BRW_OPCODE_MOV, brw_message_reg(header_mrf++), pos);
1821   }
1822
1823   return header_mrf;
1824}
1825
1826int
1827vec4_visitor::emit_vue_header_gen6(int header_mrf)
1828{
1829   struct brw_reg reg;
1830
1831   /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1832    * dword 0-3 (m2) of the header is indices, point width, clip flags.
1833    * dword 4-7 (m3) is the 4D space position
1834    * dword 8-15 (m4,m5) of the vertex header is the user clip distance if
1835    * enabled.
1836    *
1837    * m4 or 6 is the first vertex element data we fill.
1838    */
1839
1840   current_annotation = "indices, point width, clip flags";
1841   reg = brw_message_reg(header_mrf++);
1842   emit(BRW_OPCODE_MOV, retype(reg, BRW_REGISTER_TYPE_D), src_reg(0));
1843   if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1844      emit(BRW_OPCODE_MOV, brw_writemask(reg, WRITEMASK_W),
1845	   src_reg(output_reg[VERT_RESULT_PSIZ]));
1846   }
1847
1848   current_annotation = "gl_Position";
1849   emit(BRW_OPCODE_MOV,
1850	brw_message_reg(header_mrf++), src_reg(output_reg[VERT_RESULT_HPOS]));
1851
1852   current_annotation = "user clip distances";
1853   if (c->key.nr_userclip) {
1854      for (int i = 0; i < c->key.nr_userclip; i++) {
1855	 struct brw_reg m;
1856	 if (i < 4)
1857	    m = brw_message_reg(header_mrf);
1858	 else
1859	    m = brw_message_reg(header_mrf + 1);
1860
1861	 emit(DP4(dst_reg(brw_writemask(m, 1 << (i & 3))),
1862		  src_reg(output_reg[VERT_RESULT_HPOS]),
1863		  src_reg(c->userplane[i])));
1864      }
1865      header_mrf += 2;
1866   }
1867
1868   current_annotation = NULL;
1869
1870   return header_mrf;
1871}
1872
1873static int
1874align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1875{
1876   struct intel_context *intel = &brw->intel;
1877
1878   if (intel->gen >= 6) {
1879      /* URB data written (does not include the message header reg) must
1880       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
1881       * section 5.4.3.2.2: URB_INTERLEAVED.
1882       *
1883       * URB entries are allocated on a multiple of 1024 bits, so an
1884       * extra 128 bits written here to make the end align to 256 is
1885       * no problem.
1886       */
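      /* For example, a write of one header reg plus five data regs has
       * mlen 6; since 6 % 2 != 1, mlen is bumped to 7 so that the data
       * portion covers an even number of registers.
       */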
1887      if ((mlen % 2) != 1)
1888	 mlen++;
1889   }
1890
1891   return mlen;
1892}
1893
1894/**
1895 * Generates the VUE payload plus the 1 or 2 URB write instructions to
1896 * complete the VS thread.
1897 *
1898 * The VUE layout is documented in Volume 2a.
1899 */
1900void
1901vec4_visitor::emit_urb_writes()
1902{
1903   /* MRF 0 is reserved for the debugger, so start with message header
1904    * in MRF 1.
1905    */
1906   int base_mrf = 1;
1907   int mrf = base_mrf;
1908   int urb_entry_size;
1909   uint64_t outputs_remaining = c->prog_data.outputs_written;
1910   /* In the process of generating our URB write message contents, we
1911    * may need to unspill a register or load from an array.  Those
1912    * reads would use MRFs 14-15.
1913    */
1914   int max_usable_mrf = 13;
1915
1916   /* FINISHME: edgeflag */
1917
1918   /* First mrf is the g0-based message header containing URB handles and such,
1919    * which is implied in VS_OPCODE_URB_WRITE.
1920    */
1921   mrf++;
1922
1923   if (intel->gen >= 6) {
1924      mrf = emit_vue_header_gen6(mrf);
1925   } else {
1926      mrf = emit_vue_header_gen4(mrf);
1927   }
1928
1929   /* Set up the VUE data for the first URB write */
1930   int attr;
1931   for (attr = 0; attr < VERT_RESULT_MAX; attr++) {
1932      if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1933	 continue;
1934
1935      outputs_remaining &= ~BITFIELD64_BIT(attr);
1936
1937      /* This is set up in the VUE header. */
1938      if (attr == VERT_RESULT_HPOS)
1939	 continue;
1940
1941      /* This is loaded into the VUE header, and thus doesn't occupy
1942       * an attribute slot.
1943       */
1944      if (attr == VERT_RESULT_PSIZ)
1945	 continue;
1946
1947      vec4_instruction *inst = emit(BRW_OPCODE_MOV, brw_message_reg(mrf++),
1948				    src_reg(output_reg[attr]));
1949
1950      if ((attr == VERT_RESULT_COL0 ||
1951	   attr == VERT_RESULT_COL1 ||
1952	   attr == VERT_RESULT_BFC0 ||
1953	   attr == VERT_RESULT_BFC1) &&
1954	  c->key.clamp_vertex_color) {
1955	 inst->saturate = true;
1956      }
1957
1958      /* If this was the last usable MRF (13), we can't fit anything
1959       * more into this URB WRITE.  With base_mrf of 1 and one header
1960       * MRF, that leaves an even number of URB write data registers,
1961       * which meets gen6's requirements for length alignment.
1962       */
1963      if (mrf > max_usable_mrf) {
1964	 attr++;
1965	 break;
1966      }
1967   }
1968
1969   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
1970   inst->base_mrf = base_mrf;
1971   inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1972   inst->eot = !outputs_remaining;
1973
1974   urb_entry_size = mrf - base_mrf;
1975
1976   /* Optional second URB write */
1977   if (outputs_remaining) {
1978      mrf = base_mrf + 1;
1979
1980      for (; attr < VERT_RESULT_MAX; attr++) {
1981	 if (!(c->prog_data.outputs_written & BITFIELD64_BIT(attr)))
1982	    continue;
1983
1984	 assert(mrf < max_usable_mrf);
1985
1986	 emit(BRW_OPCODE_MOV, brw_message_reg(mrf++), src_reg(output_reg[attr]));
1987      }
1988
1989      inst = emit(VS_OPCODE_URB_WRITE);
1990      inst->base_mrf = base_mrf;
1991      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
1992      inst->eot = true;
1993      /* URB destination offset.  In the previous write, we got MRFs
1994       * 2-13 minus the one header MRF, so 12 regs.  URB offset is in
1995       * URB row increments, and each of our MRFs is half of one of
1996       * those, since we're doing interleaved writes.
1997       */
1998      inst->offset = (max_usable_mrf - base_mrf) / 2;
1999
2000      urb_entry_size += mrf - base_mrf;
2001   }
2002
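   /* urb_entry_size was counted in vec4 (16-byte) registers above; the
    * hardware wants it in URB rows, which appear to be 8 registers
    * (1024 bits) on gen6 and 4 registers (512 bits) on earlier parts.
    */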
2003   if (intel->gen == 6)
2004      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 8) / 8;
2005   else
2006      c->prog_data.urb_entry_size = ALIGN(urb_entry_size, 4) / 4;
2007}
2008
2009src_reg
2010vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2011				 src_reg *reladdr, int reg_offset)
2012{
2013   /* Because we store the values to scratch interleaved like our
2014    * vertex data, we need to scale the vec4 index by 2.
2015    */
2016   int message_header_scale = 2;
2017
2018   /* Pre-gen6, the message header uses byte offsets instead of vec4
2019    * (16-byte) offset units.
2020    */
2021   if (intel->gen < 6)
2022      message_header_scale *= 16;
2023
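   /* For example, with reg_offset 3 this yields an offset of 6 (vec4
    * units) on gen6 and later, or 96 (bytes) on earlier generations.
    */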
2024   if (reladdr) {
2025      src_reg index = src_reg(this, glsl_type::int_type);
2026
2027      vec4_instruction *add = emit(BRW_OPCODE_ADD,
2028				   dst_reg(index),
2029				   *reladdr,
2030				   src_reg(reg_offset));
2031      /* Move our new instruction from the tail to its correct place. */
2032      add->remove();
2033      inst->insert_before(add);
2034
2035      vec4_instruction *mul = emit(BRW_OPCODE_MUL, dst_reg(index),
2036				   index, src_reg(message_header_scale));
2037      mul->remove();
2038      inst->insert_before(mul);
2039
2040      return index;
2041   } else {
2042      return src_reg(reg_offset * message_header_scale);
2043   }
2044}
2045
2046src_reg
2047vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2048				       src_reg *reladdr, int reg_offset)
2049{
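   /* Unlike get_scratch_offset(), no interleave factor of 2 is applied
    * here: pull constant offsets are in vec4 units on gen6 and later,
    * and in bytes before that.
    */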
2050   if (reladdr) {
2051      src_reg index = src_reg(this, glsl_type::int_type);
2052
2053      vec4_instruction *add = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD,
2054							    dst_reg(index),
2055							    *reladdr,
2056							    src_reg(reg_offset));
2057      add->ir = inst->ir;
2058      add->annotation = inst->annotation;
2059      inst->insert_before(add);
2060
2061      /* Pre-gen6, the message header uses byte offsets instead of vec4
2062       * (16-byte) offset units.
2063       */
2064      if (intel->gen < 6) {
2065	 vec4_instruction *mul = new(mem_ctx) vec4_instruction(this,
2066							       BRW_OPCODE_MUL,
2067							       dst_reg(index),
2068							       index,
2069							       src_reg(16));
2070	 mul->ir = inst->ir;
2071	 mul->annotation = inst->annotation;
2072	 inst->insert_before(mul);
2073      }
2074
2075      return index;
2076   } else {
2077      int message_header_scale = intel->gen < 6 ? 16 : 1;
2078      return src_reg(reg_offset * message_header_scale);
2079   }
2080}
2081
2082/**
2083 * Emits an instruction before @inst to load the value named by @orig_src
2084 * from scratch space at @base_offset to @temp.
2085 */
2086void
2087vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2088				dst_reg temp, src_reg orig_src,
2089				int base_offset)
2090{
2091   int reg_offset = base_offset + orig_src.reg_offset;
2092   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2093
2094   vec4_instruction *scratch_read_inst = emit(VS_OPCODE_SCRATCH_READ,
2095					      temp, index);
2096
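   /* emit_urb_writes() keeps MRFs 14-15 free of URB data so that these
    * unspill/array reads have somewhere to build their message.
    */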
2097   scratch_read_inst->base_mrf = 14;
2098   scratch_read_inst->mlen = 1;
2099   /* Move our instruction from the tail to its correct place. */
2100   scratch_read_inst->remove();
2101   inst->insert_before(scratch_read_inst);
2102}
2103
2104/**
2105 * Emits an instruction after @inst to store the value to be written
2106 * to @orig_dst to scratch space at @base_offset, from @temp.
2107 */
2108void
2109vec4_visitor::emit_scratch_write(vec4_instruction *inst,
2110				 src_reg temp, dst_reg orig_dst,
2111				 int base_offset)
2112{
2113   int reg_offset = base_offset + orig_dst.reg_offset;
2114   src_reg index = get_scratch_offset(inst, orig_dst.reladdr, reg_offset);
2115
2116   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2117				       orig_dst.writemask));
2118   vec4_instruction *scratch_write_inst = emit(VS_OPCODE_SCRATCH_WRITE,
2119					       dst, temp, index);
2120   scratch_write_inst->base_mrf = 13;
2121   scratch_write_inst->mlen = 2;
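   /* If the original instruction was predicated, predicate the scratch
    * write the same way so the store only happens when the original
    * write would have.
    */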
2122   scratch_write_inst->predicate = inst->predicate;
2123   /* Move our instruction from the tail to its correct place. */
2124   scratch_write_inst->remove();
2125   inst->insert_after(scratch_write_inst);
2126}
2127
2128/**
2129 * We can't generally support array access in GRF space, because a
2130 * single instruction's destination can only span 2 contiguous
2131 * registers.  So, we send all GRF arrays that get variable index
2132 * access to scratch space.
2133 */
2134void
2135vec4_visitor::move_grf_array_access_to_scratch()
2136{
2137   int scratch_loc[this->virtual_grf_count];
2138
2139   for (int i = 0; i < this->virtual_grf_count; i++) {
2140      scratch_loc[i] = -1;
2141   }
2142
2143   /* First, calculate the set of virtual GRFs that need to be punted
2144    * to scratch due to having any array access on them, and where in
2145    * scratch.
2146    */
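   /* Each vec4 register gets 8 * 4 = 32 bytes of scratch: like the URB
    * data, scratch storage is interleaved (see get_scratch_offset), so
    * one register's slot holds a 16-byte vec4 for each of the two
    * interleaved vertices.
    */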
2147   foreach_list(node, &this->instructions) {
2148      vec4_instruction *inst = (vec4_instruction *)node;
2149
2150      if (inst->dst.file == GRF && inst->dst.reladdr &&
2151	  scratch_loc[inst->dst.reg] == -1) {
2152	 scratch_loc[inst->dst.reg] = c->last_scratch;
2153	 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg] * 8 * 4;
2154      }
2155
2156      for (int i = 0 ; i < 3; i++) {
2157	 src_reg *src = &inst->src[i];
2158
2159	 if (src->file == GRF && src->reladdr &&
2160	     scratch_loc[src->reg] == -1) {
2161	    scratch_loc[src->reg] = c->last_scratch;
2162	    c->last_scratch += this->virtual_grf_sizes[src->reg] * 8 * 4;
2163	 }
2164      }
2165   }
2166
2167   /* Now, for anything that will be accessed through scratch, rewrite
2168    * it to load/store.  Note that this is a _safe list walk, because
2169    * we may generate a new scratch_write instruction after the one
2170    * we're processing.
2171    */
2172   foreach_list_safe(node, &this->instructions) {
2173      vec4_instruction *inst = (vec4_instruction *)node;
2174
2175      /* Set up the annotation tracking for newly generated instructions. */
2176      base_ir = inst->ir;
2177      current_annotation = inst->annotation;
2178
2179      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2180	 src_reg temp = src_reg(this, glsl_type::vec4_type);
2181
2182	 emit_scratch_write(inst, temp, inst->dst, scratch_loc[inst->dst.reg]);
2183
2184	 inst->dst.file = temp.file;
2185	 inst->dst.reg = temp.reg;
2186	 inst->dst.reg_offset = temp.reg_offset;
2187	 inst->dst.reladdr = NULL;
2188      }
2189
2190      for (int i = 0 ; i < 3; i++) {
2191	 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2192	    continue;
2193
2194	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2195
2196	 emit_scratch_read(inst, temp, inst->src[i],
2197			   scratch_loc[inst->src[i].reg]);
2198
2199	 inst->src[i].file = temp.file;
2200	 inst->src[i].reg = temp.reg;
2201	 inst->src[i].reg_offset = temp.reg_offset;
2202	 inst->src[i].reladdr = NULL;
2203      }
2204   }
2205}
2206
2207/**
2208 * Emits an instruction before @inst to load the value named by @orig_src
2209 * from the pull constant buffer (surface) at @base_offset to @temp.
2210 */
2211void
2212vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2213				      dst_reg temp, src_reg orig_src,
2214				      int base_offset)
2215{
2216   int reg_offset = base_offset + orig_src.reg_offset;
2217   src_reg index = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2218   vec4_instruction *load;
2219
2220   load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2221					temp, index);
2222   load->annotation = inst->annotation;
2223   load->ir = inst->ir;
2224   load->base_mrf = 14;
2225   load->mlen = 1;
2226   inst->insert_before(load);
2227}
2228
2229/**
2230 * Implements array access of uniforms by inserting a
2231 * PULL_CONSTANT_LOAD instruction.
2232 *
2233 * Unlike temporary GRF array access (where we don't support it due to
2234 * the difficulty of doing relative addressing on instruction
2235 * destinations), we could potentially do array access of uniforms
2236 * that were loaded in GRF space as push constants.  In real-world
2237 * usage we've seen, though, the arrays being used are always larger
2238 * than we could load as push constants, so just always move all
2239 * uniform array access out to a pull constant buffer.
2240 */
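/* For example (illustrative GLSL, not taken from any particular shader):
 *
 *    uniform vec4 palette[32];
 *    ...
 *    gl_FrontColor = palette[index];
 *
 * with a non-constant index would have palette copied to pull_param[]
 * and the access replaced by a pull constant load.
 */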
2241void
2242vec4_visitor::move_uniform_array_access_to_pull_constants()
2243{
2244   int pull_constant_loc[this->uniforms];
2245
2246   for (int i = 0; i < this->uniforms; i++) {
2247      pull_constant_loc[i] = -1;
2248   }
2249
2250   /* Walk through and find array access of uniforms.  Put a copy of that
2251    * uniform in the pull constant buffer.
2252    *
2253    * Note that we don't move constant-indexed accesses to arrays.  No
2254    * testing has been done of the performance impact of this choice.
2255    */
2256   foreach_list_safe(node, &this->instructions) {
2257      vec4_instruction *inst = (vec4_instruction *)node;
2258
2259      for (int i = 0 ; i < 3; i++) {
2260	 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2261	    continue;
2262
2263	 int uniform = inst->src[i].reg;
2264
2265	 /* If this array isn't already present in the pull constant buffer,
2266	  * add it.
2267	  */
2268	 if (pull_constant_loc[uniform] == -1) {
2269	    const float **values = &prog_data->param[uniform * 4];
2270
2271	    pull_constant_loc[uniform] = prog_data->nr_pull_params;
2272
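	    /* param[] holds one pointer per float, i.e. four entries per
	     * uniform vec4, so this copies the whole array
	     * (uniform_size[uniform] vec4s) into the pull parameter list.
	     */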
2273	    for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2274	       prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2275	    }
2276	 }
2277
2278	 /* Set up the annotation tracking for new generated instructions. */
2279	 base_ir = inst->ir;
2280	 current_annotation = inst->annotation;
2281
2282	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2283
2284	 emit_pull_constant_load(inst, temp, inst->src[i],
2285				 pull_constant_loc[uniform]);
2286
2287	 inst->src[i].file = temp.file;
2288	 inst->src[i].reg = temp.reg;
2289	 inst->src[i].reg_offset = temp.reg_offset;
2290	 inst->src[i].reladdr = NULL;
2291      }
2292   }
2293
2294   /* Now there are no accesses of the UNIFORM file with a reladdr, so
2295    * no need to track them as larger-than-vec4 objects.  This will be
2296    * relied on in cutting out unused uniform vectors from push
2297    * constants.
2298    */
2299   split_uniform_registers();
2300}
2301
2302vec4_visitor::vec4_visitor(struct brw_vs_compile *c,
2303			   struct gl_shader_program *prog,
2304			   struct brw_shader *shader)
2305{
2306   this->c = c;
2307   this->p = &c->func;
2308   this->brw = p->brw;
2309   this->intel = &brw->intel;
2310   this->ctx = &intel->ctx;
2311   this->prog = prog;
2312   this->shader = shader;
2313
2314   this->mem_ctx = ralloc_context(NULL);
2315   this->failed = false;
2316
2317   this->base_ir = NULL;
2318   this->current_annotation = NULL;
2319
2321   this->vp = prog->VertexProgram;
2322   this->prog_data = &c->prog_data;
2323
2324   this->variable_ht = hash_table_ctor(0,
2325				       hash_table_pointer_hash,
2326				       hash_table_pointer_compare);
2327
2328   this->virtual_grf_def = NULL;
2329   this->virtual_grf_use = NULL;
2330   this->virtual_grf_sizes = NULL;
2331   this->virtual_grf_count = 0;
2332   this->virtual_grf_array_size = 0;
2333   this->live_intervals_valid = false;
2334
2335   this->uniforms = 0;
2340}
2341
2342vec4_visitor::~vec4_visitor()
2343{
2344   ralloc_free(this->mem_ctx);
2345   hash_table_dtor(this->variable_ht);
2346}
2347
2348
2349void
2350vec4_visitor::fail(const char *format, ...)
2351{
2352   va_list va;
2353   char *msg;
2354
2355   if (failed)
2356      return;
2357
2358   failed = true;
2359
2360   va_start(va, format);
2361   msg = ralloc_vasprintf(mem_ctx, format, va);
2362   va_end(va);
2363   msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2364
2365   this->fail_msg = msg;
2366
2367   if (INTEL_DEBUG & DEBUG_VS) {
2368      fprintf(stderr, "%s",  msg);
2369   }
2370}
2371
2372} /* namespace brw */
2373