/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "brw_program.h"

namespace brw {

vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->predicate = BRW_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->ir = NULL;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

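/**
 * Append an already-constructed instruction to the end of the current
 * instruction list, tagging it with the visitor's current base IR node and
 * annotation for later debug output.
 */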
vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

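/**
 * Insert @new_inst into @block immediately before @inst, copying @inst's IR
 * pointer and annotation so debug output stays attached to the original
 * statement.
 */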
vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
                       BRW_OPCODE_##op, dst, src0, src1);               \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->gen >= 6);                                        \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gen6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(devinfo->gen == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of each destination channel to the result of the
 * comparison, leaves the upper bits undefined, and updates the flag
 * register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}

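/**
 * Copy @src into a fresh temporary if it carries abs/negate source
 * modifiers, so callers that cannot handle modifiers get a clean register.
 */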
src_reg
vec4_visitor::resolve_source_modifiers(const src_reg &src)
{
   if (!src.abs && !src.negate)
      return src;

   dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
   resolved.type = src.type;
   emit(MOV(resolved, src));

   return src_reg(resolved);
}

src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
      return src;

   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gen6.
    *
    * For gen7, keep the operand as-is, except if immediate, which gen7 still
    * can't use.
    */

   if (devinfo->gen == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

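/**
 * Emit a math instruction, working around per-generation restrictions:
 * operands are legalized with fix_math_operand(), Gen6 gets an extra MOV
 * because align1 MATH cannot honor writemasks, and pre-Gen6 math is a send
 * that needs an MRF base and message length.
 */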
vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gen6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_type::vec4_type);
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->gen < 6) {
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

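/**
 * Implement GLSL's packHalf2x16(): convert the .x and .y float components of
 * @src0 to half-float with F32TO16, then shift/OR them into a single UD per
 * channel of @dst.
 */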
void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->gen < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

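/**
 * Implement GLSL's unpackHalf2x16(): split the low and high words of the UD
 * source into separate channels, then convert them to float with F16TO32.
 */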
void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->gen < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}

void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_B;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}

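/**
 * Implement GLSL's packUnorm4x8(): saturate to [0, 1], scale by 255, round
 * to even, convert to unsigned integers and pack the four bytes into one UD.
 */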
void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_type::vec4_type);
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_type::uvec4_type);
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

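/**
 * Implement GLSL's packSnorm4x8(): clamp to [-1, 1], scale by 127, round to
 * even, convert to signed integers and pack the four bytes into one UD.
 */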
void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

   dst_reg min(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_type::ivec4_type);
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
      if (type->is_matrix()) {
         const glsl_type *col_type = type->column_type();
         unsigned col_slots =
            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
type_size_vec4(const struct glsl_type *type)
{
   return type_size_xvec4(type, true);
}

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located in location "x" and location "x+1", not "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
 * remap_vs_attrs() will take into account both the location and also whether
 * the type fits in one or two vec4 slots.
 */
extern "C" int
type_size_dvec4(const struct glsl_type *type)
{
   return type_size_xvec4(type, false);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type));

   if (type->is_array() || type->is_record()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type) * size);

   this->swizzle = BRW_SWIZZLE_NOOP;

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type));

   if (type->is_array() || type->is_record()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

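/**
 * Emit a MIN/MAX as a conditional SEL: BRW_CONDITIONAL_L selects the smaller
 * operand (MIN) and BRW_CONDITIONAL_GE the larger one (MAX).
 */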
vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}

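/**
 * Emit a linear interpolation of x and y by a: use the LRP instruction on
 * Gen6+ and fall back to the expanded x*(1-a) + y*a sequence on older parts.
 */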
vec4_instruction *
vec4_visitor::emit_lrp(const dst_reg &dst,
                       const src_reg &x, const src_reg &y, const src_reg &a)
{
   if (devinfo->gen >= 6) {
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
     return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
                     fix_3src_operand(x)));
   } else {
      /* Earlier generations don't support three source operations, so we
       * need to emit x*(1-a) + y*a.
       */
      dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
      dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
      dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
      y_times_a.writemask           = dst.writemask;
      one_minus_a.writemask         = dst.writemask;
      x_times_one_minus_a.writemask = dst.writemask;

      emit(MUL(y_times_a, y, a));
      emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
      emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
      return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
   }
}

/**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL, in which case the instruction will be appended
 * to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->gen >= 9) {
      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
      src_reg header(this, glsl_type::uvec4_type, 2);

      pull = new(mem_ctx)
         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                          dst_reg(header));

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
                                 offset_reg.type);
      pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           dst,
                                           surf_index,
                                           header);
      pull->mlen = 2;
      pull->header_size = 1;
   } else if (devinfo->gen >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

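/**
 * Return a register containing the value of @src taken from a single live
 * channel: FIND_LIVE_CHANNEL picks the channel index and BROADCAST copies
 * that channel's value to all channels of the result.
 */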
src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_type::uint_type);
   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                              src.type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}

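/**
 * Fetch the MCS (multisample control surface) data for a texel with a
 * TXF_MCS message, so it can be fed to a subsequent compressed multisample
 * fetch.
 */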
src_reg
vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
                             src_reg coordinate, src_reg surface)
{
   vec4_instruction *inst =
      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                    dst_reg(this, glsl_type::uvec4_type));
   inst->base_mrf = 2;
   inst->src[1] = surface;
   inst->src[2] = surface;

   int param_base;

   if (devinfo->gen >= 9) {
      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
      vec4_instruction *header_inst = new(mem_ctx)
         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                          dst_reg(MRF, inst->base_mrf));

      emit(header_inst);

      inst->mlen = 2;
      inst->header_size = 1;
      param_base = inst->base_mrf + 1;
   } else {
      inst->mlen = 1;
      param_base = inst->base_mrf;
   }

   /* Parameters are: u, v, r, lod; lod will always be zero due to API
    * restrictions.
    */
   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
   int zero_mask = 0xf & ~coord_mask;

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
            coordinate));

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
            brw_imm_d(0)));

   emit(inst);
   return src_reg(inst->dst);
}

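/**
 * Return true if the sampler index might not fit in the 4-bit sampler field
 * of the message descriptor (a non-immediate index, or an index >= 16) and
 * therefore has to go through the message header.  Hardware before Haswell
 * and Gen8 never takes that path.
 */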
bool
vec4_visitor::is_high_sampler(src_reg sampler)
{
   if (devinfo->gen < 8 && !devinfo->is_haswell)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

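/**
 * Build and emit the vec4 sampler message for an IR texture operation:
 * choose the hardware opcode, decide whether a message header is needed,
 * load the coordinate and the remaining parameters into consecutive MRFs,
 * and apply the post-processing fixups (cube array size, Gen6 gather
 * workaround, textureQueryLevels swizzle).
 */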
void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparator,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           uint32_t surface,
                           src_reg surface_reg,
                           src_reg sampler_reg)
{
   /* The sampler can only meaningfully compute LOD for fragment shader
    * messages. For all other stages, we change the opcode to TXL and hardcode
    * the LOD to 0.
    *
    * textureQueryLevels() is implemented in terms of TXS so we need to pass a
    * valid LOD argument.
    */
   if (op == ir_tex || op == ir_query_levels) {
      assert(lod.file == BAD_FILE);
      lod = brw_imm_f(0.0f);
   }

   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
                             SHADER_OPCODE_TXF_CMS); break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   case ir_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway.  For now, just always return false.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparator.file != BAD_FILE;

   inst->src[1] = surface_reg;
   inst->src[2] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }
      /* Load the shadow comparator */
      if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                          WRITEMASK_X),
                  shadow_comparator));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            mrf = param_base + 1;
            if (shadow_comparator.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
            /* MCS data is stored in the first two channels of 'mcs', but we
             * need to get it into the .y and .z channels of the second vec4
             * of params.
             */
            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
            emit(MOV(dst_reg(MRF, param_base + 1,
                             glsl_type::uint_type, WRITEMASK_YZ),
                     mcs));
         } else if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of 'mcs', but we need to get it
             * into the .y channel of the second vec4 of params, so replicate
             * .x across the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparator.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparator.type, WRITEMASK_Z),
                           shadow_comparator));
               }
            }
         } else /* devinfo->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         if (shadow_comparator.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
                     shadow_comparator));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* Fix up the number of layers (z) for cube arrays: the hardware returns
    * faces * layers, but the spec requires layers.
    */
   if (op == ir_txs && devinfo->gen < 7) {
      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                  src_reg(inst->dst), brw_imm_d(1));
   }

   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
   }

   if (op == ir_query_levels) {
      /* # levels is in .w */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}

/**
 * Apply workarounds for Gen6 gather with UINT/SINT
 */
void
vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;
   dst_reg dst_f = dst;
   dst_f.type = BRW_REGISTER_TYPE_F;

   /* Convert from UNORM to UINT */
   emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
   emit(MOV(dst, src_reg(dst_f)));

   if (wa & WA_SIGN) {
      /* Reinterpret the UINT value as a signed INT value by
       * shifting the sign bit into place, then shifting back
       * preserving sign.
       */
      emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
      emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
   }
}

void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}

void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}

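/**
 * Compute the NDC (normalized device coordinate) output (x/w, y/w, z/w, 1/w)
 * from the written gl_Position, for the pre-Gen6 VUE header.
 */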
void
vec4_visitor::emit_ndc_computation()
{
   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
      return;

   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}

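/**
 * Fill in the first VUE header slot: point size, clip-distance flags and the
 * layer/viewport indices.  The pre-Gen6 and Gen6+ layouts differ, and the
 * pre-Gen6 path also applies the negative-rhw clipping workaround.
 */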
void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, brw_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = brw_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}

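/**
 * Copy the shader's output register for (varying, component) into the URB
 * payload register @reg with the appropriate swizzle and writemask, or
 * return NULL if the shader never wrote that output.
 */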
vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
{
   assert(varying < VARYING_SLOT_MAX);

   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
      reg.writemask =
         brw_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = BRW_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case VARYING_SLOT_EDGE:
      /* This is present when doing unfilled polygons.  We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f).  This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

static int
align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
{
   if (devinfo->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert ((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (devinfo->gen < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while(!complete);
}


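/**
 * Compute the scratch message offset for a vec4 at @reg_offset (in vec4
 * units), emitting the reladdr address arithmetic before @inst for indirect
 * accesses and scaling into the units the scratch read/write messages expect.
 */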
1430src_reg
1431vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1432				 src_reg *reladdr, int reg_offset)
1433{
1434   /* Because we store the values to scratch interleaved like our
1435    * vertex data, we need to scale the vec4 index by 2.
1436    */
1437   int message_header_scale = 2;
1438
1439   /* Pre-gen6, the message header uses byte offsets instead of vec4
1440    * (16-byte) offset units.
1441    */
1442   if (devinfo->gen < 6)
1443      message_header_scale *= 16;
1444
1445   if (reladdr) {
1446      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1447       * to multiply the reladdr by 2. Notice that the reg_offset part
1448       * is in units of 16 bytes and is used to select the low/high 16-byte
1449       * chunk of a full dvec4, so we don't want to multiply that part.
1450       */
1451      src_reg index = src_reg(this, glsl_type::int_type);
1452      if (type_sz(inst->dst.type) < 8) {
1453         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1454                                      brw_imm_d(reg_offset)));
1455         emit_before(block, inst, MUL(dst_reg(index), index,
1456                                      brw_imm_d(message_header_scale)));
1457      } else {
1458         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1459                                      brw_imm_d(message_header_scale * 2)));
1460         emit_before(block, inst, ADD(dst_reg(index), index,
1461                                      brw_imm_d(reg_offset * message_header_scale)));
1462      }
1463      return index;
1464   } else {
1465      return brw_imm_d(reg_offset * message_header_scale);
1466   }
1467}
1468
1469/**
1470 * Emits an instruction before @inst to load the value named by @orig_src
1471 * from scratch space at @base_offset to @temp.
1472 *
1473 * @base_offset is measured in 32-byte units (the size of a register).
1474 */
1475void
1476vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1477				dst_reg temp, src_reg orig_src,
1478				int base_offset)
1479{
1480   assert(orig_src.offset % REG_SIZE == 0);
1481   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1482   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1483                                      reg_offset);
1484
1485   if (type_sz(orig_src.type) < 8) {
1486      emit_before(block, inst, SCRATCH_READ(temp, index));
1487   } else {
1488      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1489      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1490      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1491      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1492      vec4_instruction *last_read =
1493         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1494      emit_before(block, inst, last_read);
1495      shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1496   }
1497}
1498
1499/**
1500 * Emits an instruction after @inst to store the value to be written
1501 * to @orig_dst to scratch space at @base_offset, from @temp.
1502 *
1503 * @base_offset is measured in 32-byte units (the size of a register).
1504 */
1505void
1506vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1507                                 int base_offset)
1508{
1509   assert(inst->dst.offset % REG_SIZE == 0);
1510   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1511   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1512                                      reg_offset);
1513
1514   /* Create a temporary register to store *inst's result in.
1515    *
1516    * We have to be careful in MOVing from our temporary result register in
1517    * the scratch write.  If we swizzle from channels of the temporary that
1518    * weren't initialized, it will confuse live interval analysis, which will
1519    * make spilling fail to make progress.
1520    */
1521   bool is_64bit = type_sz(inst->dst.type) == 8;
1522   const glsl_type *alloc_type =
1523      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1524   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1525                                       inst->dst.type),
1526                                brw_swizzle_for_mask(inst->dst.writemask));
1527
1528   if (!is_64bit) {
1529      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1530				          inst->dst.writemask));
1531      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1532      if (inst->opcode != BRW_OPCODE_SEL)
1533         write->predicate = inst->predicate;
1534      write->ir = inst->ir;
1535      write->annotation = inst->annotation;
1536      inst->insert_after(block, write);
1537   } else {
1538      dst_reg shuffled = dst_reg(this, alloc_type);
1539      vec4_instruction *last =
1540         shuffle_64bit_data(shuffled, temp, true, block, inst);
1541      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1542
1543      uint8_t mask = 0;
1544      if (inst->dst.writemask & WRITEMASK_X)
1545         mask |= WRITEMASK_XY;
1546      if (inst->dst.writemask & WRITEMASK_Y)
1547         mask |= WRITEMASK_ZW;
1548      if (mask) {
1549         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1550
1551         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1552         if (inst->opcode != BRW_OPCODE_SEL)
1553            write->predicate = inst->predicate;
1554         write->ir = inst->ir;
1555         write->annotation = inst->annotation;
1556         last->insert_after(block, write);
1557      }
1558
      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
         glsl_type::dvec4_type : glsl_type::vec4_type);
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));
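   /* A scratch_loc of -1 means the VGRF has no scratch slot assigned yet;
    * otherwise it holds the register offset of that VGRF's slot in scratch.
    */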

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch space they will live.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

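         /* The reladdr chain may itself index into other VGRF arrays
          * (e.g. a[b[i]]); any VGRF in the chain that is accessed with a
          * variable index gets a scratch slot as well.
          */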
         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for newly generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst, we can safely do the scratch write for dst itself.
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset, src_reg indirect)
{
   assert(orig_src.offset % 16 == 0);
   const unsigned index = prog_data->base.binding_table.pull_constants_start;

   /* For 64-bit loads we need to emit two 32-bit load messages, and we also
    * need to shuffle the 32-bit data result into proper 64-bit data. To do
    * that we emit the 32-bit loads into a temporary and shuffle the result
    * into the original destination.
    */
   dst_reg orig_temp = temp;
   bool is_64bit = type_sz(orig_src.type) == 8;
   if (is_64bit) {
      assert(type_sz(temp.type) == 8);
      dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
      temp = retype(temp_df, BRW_REGISTER_TYPE_F);
   }

   src_reg src = orig_src;
   for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
      int reg_offset = base_offset + src.offset / 16;

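      /* Pick the message offset: computed at run time for indirect accesses,
       * moved into a GRF on Gen8+ so we can send-from-GRF, or encoded as an
       * immediate on older generations.
       */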
      src_reg offset;
      if (indirect.file != BAD_FILE) {
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, ADD(dst_reg(offset), indirect,
                                      brw_imm_ud(reg_offset * 16)));
      } else if (devinfo->gen >= 8) {
         /* Store the offset in a GRF so we can send-from-GRF. */
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, MOV(dst_reg(offset),
                                      brw_imm_ud(reg_offset * 16)));
      } else {
         offset = brw_imm_d(reg_offset * 16);
      }

      emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
                                  brw_imm_ud(index),
                                  offset,
                                  block, inst);

      src = byte_offset(src, 16);
   }

   brw_mark_surface_used(&prog_data->base, index);

   if (is_64bit) {
      temp = retype(temp, BRW_REGISTER_TYPE_DF);
      shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
   }
}

/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (which we don't support, due to the
 * difficulty of doing relative addressing on instruction destinations), we
 * could potentially do array access of uniforms that were loaded in GRF
 * space as push constants.  In real-world usage we've seen, though, the
 * arrays being used are always larger than we could load as push constants,
 * so just always move all uniform array access out to a pull constant
 * buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   /* The Vulkan driver doesn't support pull constants other than UBOs, so
    * everything has to be pushed regardless.
    */
   if (stage_prog_data->pull_param == NULL) {
      split_uniform_registers();
      return;
   }

   int pull_constant_loc[this->uniforms];
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));

   /* First, walk through the instructions and determine which things need to
    * be pulled.  We mark something as needing to be pulled by setting
    * pull_constant_loc to 0.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

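      /* src[2] is the immediate size in bytes of the region the indirect
       * access may touch, so mark every 16-byte uniform vec4 it can reach.
       */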
      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
         pull_constant_loc[uniform_nr + j] = 0;
   }

   /* Next, we walk the list of uniforms and assign real pull constant
    * locations and set their corresponding entries in pull_param.
    */
   for (int j = 0; j < this->uniforms; j++) {
      if (pull_constant_loc[j] < 0)
         continue;

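      /* pull_param entries are scalar, so each uniform vec4 occupies four
       * consecutive entries; record the location in vec4 units.
       */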
      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;

      for (int i = 0; i < 4; i++) {
         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
            = stage_prog_data->param[j * 4 + i];
      }
   }

   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
    * instructions to actual uniform pulls.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);

      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
                              pull_constant_loc[uniform_nr], inst->src[1]);
      inst->remove(block);
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}

void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                           void *log_data,
                           const struct brw_sampler_prog_key_data *key_tex,
                           struct brw_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           void *mem_ctx,
                           bool no_spills,
                           int shader_time_index)
   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     need_all_constants_in_pull_buffer(false),
     no_spills(no_spills),
     shader_time_index(shader_time_index),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->live_intervals = NULL;

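   /* On Gen7+ there is no dedicated MRF file; MRF writes are emulated in the
    * top of the GRF space, so the usable GRF range ends where that region
    * begins.
    */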
   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;
}

vec4_visitor::~vec4_visitor()
{
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */