1/*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_nir.h"
25#include "brw_vec4.h"
26#include "brw_vec4_builder.h"
27#include "brw_vec4_surface_builder.h"
28#include "brw_program.h"
29
30using namespace brw;
31using namespace brw::surface_access;
32
33namespace brw {
34
35void
36vec4_visitor::emit_nir_code()
37{
38   if (nir->num_uniforms > 0)
39      nir_setup_uniforms();
40
41   nir_setup_system_values();
42
43   /* get the main function and emit it */
44   nir_foreach_function(function, nir) {
45      assert(strcmp(function->name, "main") == 0);
46      assert(function->impl);
47      nir_emit_impl(function->impl);
48   }
49}
50
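/* Allocate the backing register for a system value the first time an
 * intrinsic that reads it is seen, so later loads can simply MOV from the
 * cached register.
 */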
51void
52vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
53{
54   dst_reg *reg;
55
56   switch (instr->intrinsic) {
57   case nir_intrinsic_load_vertex_id:
58      unreachable("should be lowered by lower_vertex_id().");
59
60   case nir_intrinsic_load_vertex_id_zero_base:
61      reg = &nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
62      if (reg->file == BAD_FILE)
63         *reg = *make_reg_for_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
64      break;
65
66   case nir_intrinsic_load_base_vertex:
67      reg = &nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
68      if (reg->file == BAD_FILE)
69         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_VERTEX);
70      break;
71
72   case nir_intrinsic_load_instance_id:
73      reg = &nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
74      if (reg->file == BAD_FILE)
75         *reg = *make_reg_for_system_value(SYSTEM_VALUE_INSTANCE_ID);
76      break;
77
78   case nir_intrinsic_load_base_instance:
79      reg = &nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
80      if (reg->file == BAD_FILE)
81         *reg = *make_reg_for_system_value(SYSTEM_VALUE_BASE_INSTANCE);
82      break;
83
84   case nir_intrinsic_load_draw_id:
85      reg = &nir_system_values[SYSTEM_VALUE_DRAW_ID];
86      if (reg->file == BAD_FILE)
87         *reg = *make_reg_for_system_value(SYSTEM_VALUE_DRAW_ID);
88      break;
89
90   default:
91      break;
92   }
93}
94
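/* Block walker used by nir_setup_system_values() below: scan every
 * instruction and register any system-value intrinsics found.
 */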
95static bool
96setup_system_values_block(nir_block *block, vec4_visitor *v)
97{
98   nir_foreach_instr(instr, block) {
99      if (instr->type != nir_instr_type_intrinsic)
100         continue;
101
102      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
103      v->nir_setup_system_value_intrinsic(intrin);
104   }
105
106   return true;
107}
108
109void
110vec4_visitor::nir_setup_system_values()
111{
112   nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
113   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
114      nir_system_values[i] = dst_reg();
115   }
116
117   nir_foreach_function(function, nir) {
118      assert(strcmp(function->name, "main") == 0);
119      assert(function->impl);
120      nir_foreach_block(block, function->impl) {
121         setup_system_values_block(block, this);
122      }
123   }
124}
125
126void
127vec4_visitor::nir_setup_uniforms()
128{
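   /* nir->num_uniforms is in bytes at this point; the vec4 backend counts
    * uniforms in vec4 (16-byte) slots.
    */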
129   uniforms = nir->num_uniforms / 16;
130}
131
132void
133vec4_visitor::nir_emit_impl(nir_function_impl *impl)
134{
135   nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc);
136   for (unsigned i = 0; i < impl->reg_alloc; i++) {
137      nir_locals[i] = dst_reg();
138   }
139
140   foreach_list_typed(nir_register, reg, node, &impl->registers) {
141      unsigned array_elems =
142         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
143      const unsigned num_regs = array_elems * DIV_ROUND_UP(reg->bit_size, 32);
144      nir_locals[reg->index] = dst_reg(VGRF, alloc.allocate(num_regs));
145
146      if (reg->bit_size == 64)
147         nir_locals[reg->index].type = BRW_REGISTER_TYPE_DF;
148   }
149
150   nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
151
152   nir_emit_cf_list(&impl->body);
153}
154
155void
156vec4_visitor::nir_emit_cf_list(exec_list *list)
157{
158   exec_list_validate(list);
159   foreach_list_typed(nir_cf_node, node, node, list) {
160      switch (node->type) {
161      case nir_cf_node_if:
162         nir_emit_if(nir_cf_node_as_if(node));
163         break;
164
165      case nir_cf_node_loop:
166         nir_emit_loop(nir_cf_node_as_loop(node));
167         break;
168
169      case nir_cf_node_block:
170         nir_emit_block(nir_cf_node_as_block(node));
171         break;
172
173      default:
174         unreachable("Invalid CFG node block");
175      }
176   }
177}
178
179void
180vec4_visitor::nir_emit_if(nir_if *if_stmt)
181{
182   /* First, put the condition in f0 */
183   src_reg condition = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1);
184   vec4_instruction *inst = emit(MOV(dst_null_d(), condition));
185   inst->conditional_mod = BRW_CONDITIONAL_NZ;
186
187   /* We can just predicate based on the X channel, as the condition only
188    * goes on its own line */
189   emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X));
190
191   nir_emit_cf_list(&if_stmt->then_list);
192
193   /* note: if the else is empty, dead CF elimination will remove it */
194   emit(BRW_OPCODE_ELSE);
195
196   nir_emit_cf_list(&if_stmt->else_list);
197
198   emit(BRW_OPCODE_ENDIF);
199}
200
201void
202vec4_visitor::nir_emit_loop(nir_loop *loop)
203{
204   emit(BRW_OPCODE_DO);
205
206   nir_emit_cf_list(&loop->body);
207
208   emit(BRW_OPCODE_WHILE);
209}
210
211void
212vec4_visitor::nir_emit_block(nir_block *block)
213{
214   nir_foreach_instr(instr, block) {
215      nir_emit_instr(instr);
216   }
217}
218
219void
220vec4_visitor::nir_emit_instr(nir_instr *instr)
221{
222   base_ir = instr;
223
224   switch (instr->type) {
225   case nir_instr_type_load_const:
226      nir_emit_load_const(nir_instr_as_load_const(instr));
227      break;
228
229   case nir_instr_type_intrinsic:
230      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
231      break;
232
233   case nir_instr_type_alu:
234      nir_emit_alu(nir_instr_as_alu(instr));
235      break;
236
237   case nir_instr_type_jump:
238      nir_emit_jump(nir_instr_as_jump(instr));
239      break;
240
241   case nir_instr_type_tex:
242      nir_emit_texture(nir_instr_as_tex(instr));
243      break;
244
245   case nir_instr_type_ssa_undef:
246      nir_emit_undef(nir_instr_as_ssa_undef(instr));
247      break;
248
249   default:
250      fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n");
251      break;
252   }
253}
254
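/* Translate a NIR register reference into the vec4 VGRF backing it, applying
 * the constant array offset and, for indirect addressing, attaching the
 * reladdr source.
 */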
255static dst_reg
256dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg,
257                    unsigned base_offset, nir_src *indirect)
258{
259   dst_reg reg;
260
261   reg = v->nir_locals[nir_reg->index];
262   if (nir_reg->bit_size == 64)
263      reg.type = BRW_REGISTER_TYPE_DF;
264   reg = offset(reg, 8, base_offset);
265   if (indirect) {
266      reg.reladdr =
267         new(v->mem_ctx) src_reg(v->get_nir_src(*indirect,
268                                                BRW_REGISTER_TYPE_D,
269                                                1));
270   }
271   return reg;
272}
273
274dst_reg
275vec4_visitor::get_nir_dest(const nir_dest &dest)
276{
277   if (dest.is_ssa) {
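      /* SSA definitions get a fresh VGRF; 64-bit values take two registers
       * per vec4 slot.
       */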
278      dst_reg dst =
279         dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(dest.ssa.bit_size, 32)));
280      if (dest.ssa.bit_size == 64)
281         dst.type = BRW_REGISTER_TYPE_DF;
282      nir_ssa_values[dest.ssa.index] = dst;
283      return dst;
284   } else {
285      return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
286                                 dest.reg.indirect);
287   }
288}
289
290dst_reg
291vec4_visitor::get_nir_dest(const nir_dest &dest, enum brw_reg_type type)
292{
293   return retype(get_nir_dest(dest), type);
294}
295
296dst_reg
297vec4_visitor::get_nir_dest(const nir_dest &dest, nir_alu_type type)
298{
299   return get_nir_dest(dest, brw_type_for_nir_type(type));
300}
301
302src_reg
303vec4_visitor::get_nir_src(const nir_src &src, enum brw_reg_type type,
304                          unsigned num_components)
305{
306   dst_reg reg;
307
308   if (src.is_ssa) {
309      assert(src.ssa != NULL);
310      reg = nir_ssa_values[src.ssa->index];
311   }
312   else {
313      reg = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
314                                src.reg.indirect);
315   }
316
317   reg = retype(reg, type);
318
319   src_reg reg_as_src = src_reg(reg);
320   reg_as_src.swizzle = brw_swizzle_for_size(num_components);
321   return reg_as_src;
322}
323
324src_reg
325vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type,
326                          unsigned num_components)
327{
328   return get_nir_src(src, brw_type_for_nir_type(type), num_components);
329}
330
331src_reg
332vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
333{
334   /* if type is not specified, default to signed int */
335   return get_nir_src(src, nir_type_int32, num_components);
336}
337
338src_reg
339vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
340{
341   nir_src *offset_src = nir_get_io_offset_src(instr);
342   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
343
344   if (const_value) {
345      /* The only constant offset we should find is 0.  brw_nir.c's
346       * add_const_offset_to_base() will fold other constant offsets
347       * into instr->const_index[0].
348       */
349      assert(const_value->u32[0] == 0);
350      return src_reg();
351   }
352
353   return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
354}
355
356void
357vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
358{
359   dst_reg reg;
360
361   if (instr->def.bit_size == 64) {
362      reg = dst_reg(VGRF, alloc.allocate(2));
363      reg.type = BRW_REGISTER_TYPE_DF;
364   } else {
365      reg = dst_reg(VGRF, alloc.allocate(1));
366      reg.type = BRW_REGISTER_TYPE_D;
367   }
368
369   unsigned remaining = brw_writemask_for_size(instr->def.num_components);
370
371   /* @FIXME: consider emitting vector operations to save some MOVs in
372    * cases where the components are representable in 8 bits.
373    * For now, we emit a MOV for each distinct value.
374    */
375   for (unsigned i = 0; i < instr->def.num_components; i++) {
376      unsigned writemask = 1 << i;
377
378      if ((remaining & writemask) == 0)
379         continue;
380
381      for (unsigned j = i; j < instr->def.num_components; j++) {
382         if ((instr->def.bit_size == 32 &&
383              instr->value.u32[i] == instr->value.u32[j]) ||
384             (instr->def.bit_size == 64 &&
385              instr->value.f64[i] == instr->value.f64[j])) {
386            writemask |= 1 << j;
387         }
388      }
389
390      reg.writemask = writemask;
391      if (instr->def.bit_size == 64) {
392         emit(MOV(reg, setup_imm_df(instr->value.f64[i])));
393      } else {
394         emit(MOV(reg, brw_imm_d(instr->value.i32[i])));
395      }
396
397      remaining &= ~writemask;
398   }
399
400   /* Set final writemask */
401   reg.writemask = brw_writemask_for_size(instr->def.num_components);
402
403   nir_ssa_values[instr->def.index] = reg;
404}
405
406void
407vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
408{
409   dst_reg dest;
410   src_reg src;
411
412   switch (instr->intrinsic) {
413
414   case nir_intrinsic_load_input: {
415      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
416
417      /* We set EmitNoIndirectInput for VS */
418      assert(const_offset);
419
420      dest = get_nir_dest(instr->dest);
421      dest.writemask = brw_writemask_for_size(instr->num_components);
422
423      src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0],
424                    glsl_type::uvec4_type);
425      src = retype(src, dest.type);
426
427      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
428      if (is_64bit) {
429         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
430         src.swizzle = BRW_SWIZZLE_XYZW;
431         shuffle_64bit_data(tmp, src, false);
432         emit(MOV(dest, src_reg(tmp)));
433      } else {
434         /* Swizzle source based on component layout qualifier */
435         src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
436         emit(MOV(dest, src));
437      }
438      break;
439   }
440
441   case nir_intrinsic_store_output: {
442      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
443      assert(const_offset);
444
445      int varying = instr->const_index[0] + const_offset->u32[0];
446
447      bool is_64bit = nir_src_bit_size(instr->src[0]) == 64;
448      if (is_64bit) {
449         src_reg data;
450         src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_DF,
451                           instr->num_components);
452         data = src_reg(this, glsl_type::dvec4_type);
453         shuffle_64bit_data(dst_reg(data), src, true);
454         src = retype(data, BRW_REGISTER_TYPE_F);
455      } else {
456         src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
457                           instr->num_components);
458      }
459
      unsigned c = nir_intrinsic_component(instr);

      unsigned num_components = instr->num_components;
      if (is_64bit)
         num_components *= 2;

      output_reg[varying][c] = dst_reg(src);
      output_num_components[varying][c] = MIN2(4, num_components);
470
471      if (is_64bit && num_components > 4) {
472         assert(num_components <= 8);
473         output_reg[varying + 1][c] = byte_offset(dst_reg(src), REG_SIZE);
474         output_num_components[varying + 1][c] = num_components - 4;
475      }
476      break;
477   }
478
479   case nir_intrinsic_get_buffer_size: {
480      nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
481      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
482
483      const unsigned index =
484         prog_data->base.binding_table.ssbo_start + ssbo_index;
485      dst_reg result_dst = get_nir_dest(instr->dest);
486      vec4_instruction *inst = new(mem_ctx)
487         vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst);
488
489      inst->base_mrf = 2;
490      inst->mlen = 1; /* always at least one */
491      inst->src[1] = brw_imm_ud(index);
492
493      /* MRF for the first parameter */
494      src_reg lod = brw_imm_d(0);
495      int param_base = inst->base_mrf;
496      int writemask = WRITEMASK_X;
497      emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod));
498
499      emit(inst);
500
501      brw_mark_surface_used(&prog_data->base, index);
502      break;
503   }
504
505   case nir_intrinsic_store_ssbo: {
506      assert(devinfo->gen >= 7);
507
508      /* Block index */
509      src_reg surf_index;
510      nir_const_value *const_uniform_block =
511         nir_src_as_const_value(instr->src[1]);
512      if (const_uniform_block) {
513         unsigned index = prog_data->base.binding_table.ssbo_start +
514                          const_uniform_block->u32[0];
515         surf_index = brw_imm_ud(index);
516         brw_mark_surface_used(&prog_data->base, index);
517      } else {
518         surf_index = src_reg(this, glsl_type::uint_type);
519         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1),
520                  brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
521         surf_index = emit_uniformize(surf_index);
522
523         brw_mark_surface_used(&prog_data->base,
524                               prog_data->base.binding_table.ssbo_start +
525                               nir->info->num_ssbos - 1);
526      }
527
528      /* Offset */
529      src_reg offset_reg;
530      nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
531      if (const_offset) {
532         offset_reg = brw_imm_ud(const_offset->u32[0]);
533      } else {
534         offset_reg = get_nir_src(instr->src[2], 1);
535      }
536
537      /* Value */
538      src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4);
539
540      /* Writemask */
541      unsigned write_mask = instr->const_index[0];
542
543      /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
544       * writes will use SIMD8 mode. In order to hide this and keep symmetry across
545       * typed and untyped messages and across hardware platforms, the
546       * current implementation of the untyped messages will transparently convert
547       * the SIMD4x2 payload into an equivalent SIMD8 payload by transposing it
548       * and enabling only channel X on the SEND instruction.
549       *
       * The above works well for full vector writes, but not for partial writes
551       * where we want to write some channels and not others, like when we have
552       * code such as v.xyw = vec3(1,2,4). Because the untyped write messages are
553       * quite restrictive with regards to the channel enables we can configure in
554       * the message descriptor (not all combinations are allowed) we cannot simply
555       * implement these scenarios with a single message while keeping the
       * aforementioned symmetry in the implementation. For now we have decided that
557       * it is better to keep the symmetry to reduce complexity, so in situations
558       * such as the one described we end up emitting two untyped write messages
559       * (one for xy and another for w).
560       *
561       * The code below packs consecutive channels into a single write message,
562       * detects gaps in the vector write and if needed, sends a second message
563       * with the remaining channels. If in the future we decide that we want to
564       * emit a single message at the expense of losing the symmetry in the
565       * implementation we can:
566       *
567       * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8
568       *    message payload. In this mode we can write up to 8 offsets and dwords
569       *    to the red channel only (for the two vec4s in the SIMD4x2 execution)
570       *    and select which of the 8 channels carry data to write by setting the
571       *    appropriate writemask in the dst register of the SEND instruction.
       *    It would require writing a new generator opcode specifically for
573       *    IvyBridge since we would need to prepare a SIMD8 payload that could
574       *    use any channel, not just X.
575       *
576       * 2) For Haswell+: Simply send a single write message but set the writemask
577       *    on the dst of the SEND instruction to select the channels we want to
       *    write. It would require modifying the current messages to receive
579       *    and honor the writemask provided.
580       */
581      const vec4_builder bld = vec4_builder(this).at_end()
582                               .annotate(current_annotation, base_ir);
583
584      unsigned type_slots = nir_src_bit_size(instr->src[0]) / 32;
585      if (type_slots == 2) {
586         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
587         shuffle_64bit_data(tmp, retype(val_reg, tmp.type), true);
588         val_reg = src_reg(retype(tmp, BRW_REGISTER_TYPE_F));
589      }
590
591      uint8_t swizzle[4] = { 0, 0, 0, 0};
592      int num_channels = 0;
593      unsigned skipped_channels = 0;
594      int num_components = instr->num_components;
595      for (int i = 0; i < num_components; i++) {
596         /* Read components Z/W of a dvec from the appropriate place. We will
597          * also have to adjust the swizzle (we do that with the '% 4' below)
598          */
599         if (i == 2 && type_slots == 2)
600            val_reg = byte_offset(val_reg, REG_SIZE);
601
602         /* Check if this channel needs to be written. If so, record the
603          * channel we need to take the data from in the swizzle array
604          */
605         int component_mask = 1 << i;
606         int write_test = write_mask & component_mask;
607         if (write_test) {
            /* If we are writing doubles we have to write 2 channels worth
             * of data (64 bits) for each double component.
610             */
611            swizzle[num_channels++] = (i * type_slots) % 4;
612            if (type_slots == 2)
613               swizzle[num_channels++] = (i * type_slots + 1) % 4;
614         }
615
616         /* If we don't have to write this channel it means we have a gap in the
617          * vector, so write the channels we accumulated until now, if any. Do
618          * the same if this was the last component in the vector, if we have
619          * enough channels for a full vec4 write or if we have processed
620          * components XY of a dvec (since components ZW are not in the same
621          * SIMD register)
622          */
623         if (!write_test || i == num_components - 1 || num_channels == 4 ||
624             (i == 1 && type_slots == 2)) {
625            if (num_channels > 0) {
626               /* We have channels to write, so update the offset we need to
627                * write at to skip the channels we skipped, if any.
628                */
629               if (skipped_channels > 0) {
630                  if (offset_reg.file == IMM) {
631                     offset_reg.ud += 4 * skipped_channels;
632                  } else {
633                     emit(ADD(dst_reg(offset_reg), offset_reg,
634                              brw_imm_ud(4 * skipped_channels)));
635                  }
636               }
637
638               /* Swizzle the data register so we take the data from the channels
639                * we need to write and send the write message. This will write
640                * num_channels consecutive dwords starting at offset.
641                */
642               val_reg.swizzle =
643                  BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
644               emit_untyped_write(bld, surf_index, offset_reg, val_reg,
645                                  1 /* dims */, num_channels /* size */,
646                                  BRW_PREDICATE_NONE);
647
648               /* If we have to do a second write we will have to update the
649                * offset so that we jump over the channels we have just written
650                * now.
651                */
652               skipped_channels = num_channels;
653
654               /* Restart the count for the next write message */
655               num_channels = 0;
656            }
657
658            /* If we didn't write the channel, increase skipped count */
659            if (!write_test)
660               skipped_channels += type_slots;
661         }
662      }
663
664      break;
665   }
666
667   case nir_intrinsic_load_ssbo: {
668      assert(devinfo->gen >= 7);
669
670      nir_const_value *const_uniform_block =
671         nir_src_as_const_value(instr->src[0]);
672
673      src_reg surf_index;
674      if (const_uniform_block) {
675         unsigned index = prog_data->base.binding_table.ssbo_start +
676                          const_uniform_block->u32[0];
677         surf_index = brw_imm_ud(index);
678
679         brw_mark_surface_used(&prog_data->base, index);
680      } else {
681         surf_index = src_reg(this, glsl_type::uint_type);
682         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1),
683                  brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
684         surf_index = emit_uniformize(surf_index);
685
         /* Assume this may touch any SSBO. It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
688          */
689         brw_mark_surface_used(&prog_data->base,
690                               prog_data->base.binding_table.ssbo_start +
691                               nir->info->num_ssbos - 1);
692      }
693
694      src_reg offset_reg;
695      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
696      if (const_offset) {
697         offset_reg = brw_imm_ud(const_offset->u32[0]);
698      } else {
699         offset_reg = get_nir_src(instr->src[1], 1);
700      }
701
702      /* Read the vector */
703      const vec4_builder bld = vec4_builder(this).at_end()
704         .annotate(current_annotation, base_ir);
705
706      src_reg read_result;
707      dst_reg dest = get_nir_dest(instr->dest);
708      if (type_sz(dest.type) < 8) {
709         read_result = emit_untyped_read(bld, surf_index, offset_reg,
710                                         1 /* dims */, 4 /* size*/,
711                                         BRW_PREDICATE_NONE);
712      } else {
713         src_reg shuffled = src_reg(this, glsl_type::dvec4_type);
714
715         src_reg temp;
716         temp = emit_untyped_read(bld, surf_index, offset_reg,
717                                  1 /* dims */, 4 /* size*/,
718                                  BRW_PREDICATE_NONE);
719         emit(MOV(dst_reg(retype(shuffled, temp.type)), temp));
720
721         if (offset_reg.file == IMM)
722            offset_reg.ud += 16;
723         else
724            emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16)));
725
726         temp = emit_untyped_read(bld, surf_index, offset_reg,
727                                  1 /* dims */, 4 /* size*/,
728                                  BRW_PREDICATE_NONE);
729         emit(MOV(dst_reg(retype(byte_offset(shuffled, REG_SIZE), temp.type)),
730                  temp));
731
732         read_result = src_reg(this, glsl_type::dvec4_type);
733         shuffle_64bit_data(dst_reg(read_result), shuffled, false);
734      }
735
736      read_result.type = dest.type;
737      read_result.swizzle = brw_swizzle_for_size(instr->num_components);
738      emit(MOV(dest, read_result));
739      break;
740   }
741
742   case nir_intrinsic_ssbo_atomic_add:
743      nir_emit_ssbo_atomic(BRW_AOP_ADD, instr);
744      break;
745   case nir_intrinsic_ssbo_atomic_imin:
746      nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr);
747      break;
748   case nir_intrinsic_ssbo_atomic_umin:
749      nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr);
750      break;
751   case nir_intrinsic_ssbo_atomic_imax:
752      nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr);
753      break;
754   case nir_intrinsic_ssbo_atomic_umax:
755      nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr);
756      break;
757   case nir_intrinsic_ssbo_atomic_and:
758      nir_emit_ssbo_atomic(BRW_AOP_AND, instr);
759      break;
760   case nir_intrinsic_ssbo_atomic_or:
761      nir_emit_ssbo_atomic(BRW_AOP_OR, instr);
762      break;
763   case nir_intrinsic_ssbo_atomic_xor:
764      nir_emit_ssbo_atomic(BRW_AOP_XOR, instr);
765      break;
766   case nir_intrinsic_ssbo_atomic_exchange:
767      nir_emit_ssbo_atomic(BRW_AOP_MOV, instr);
768      break;
769   case nir_intrinsic_ssbo_atomic_comp_swap:
770      nir_emit_ssbo_atomic(BRW_AOP_CMPWR, instr);
771      break;
772
773   case nir_intrinsic_load_vertex_id:
774      unreachable("should be lowered by lower_vertex_id()");
775
776   case nir_intrinsic_load_vertex_id_zero_base:
777   case nir_intrinsic_load_base_vertex:
778   case nir_intrinsic_load_instance_id:
779   case nir_intrinsic_load_base_instance:
780   case nir_intrinsic_load_draw_id:
781   case nir_intrinsic_load_invocation_id: {
782      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
783      src_reg val = src_reg(nir_system_values[sv]);
784      assert(val.file != BAD_FILE);
785      dest = get_nir_dest(instr->dest, val.type);
786      emit(MOV(dest, val));
787      break;
788   }
789
790   case nir_intrinsic_load_uniform: {
791      /* Offsets are in bytes but they should always be multiples of 4 */
792      assert(nir_intrinsic_base(instr) % 4 == 0);
793
794      dest = get_nir_dest(instr->dest);
795
796      src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16));
797      src.type = dest.type;
798
799      /* Uniforms don't actually have to be vec4 aligned.  In the case that
800       * it isn't, we have to use a swizzle to shift things around.  They
801       * do still have the std140 alignment requirement that vec2's have to
802       * be vec2-aligned and vec3's and vec4's have to be vec4-aligned.
803       *
804       * The swizzle also works in the indirect case as the generator adds
805       * the swizzle to the offset for us.
806       */
807      unsigned shift = (nir_intrinsic_base(instr) % 16) / 4;
808      assert(shift + instr->num_components <= 4);
809
810      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
811      if (const_offset) {
812         /* Offsets are in bytes but they should always be multiples of 4 */
813         assert(const_offset->u32[0] % 4 == 0);
814
815         unsigned offset = const_offset->u32[0] + shift * 4;
816         src.offset = ROUND_DOWN_TO(offset, 16);
817         shift = (offset % 16) / 4;
818         src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
819
820         emit(MOV(dest, src));
821      } else {
822         src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
823
824         src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
825
826         /* MOV_INDIRECT is going to stomp the whole thing anyway */
827         dest.writemask = WRITEMASK_XYZW;
828
829         emit(SHADER_OPCODE_MOV_INDIRECT, dest, src,
830              indirect, brw_imm_ud(instr->const_index[1]));
831      }
832      break;
833   }
834
835   case nir_intrinsic_atomic_counter_read:
836   case nir_intrinsic_atomic_counter_inc:
837   case nir_intrinsic_atomic_counter_dec: {
838      unsigned surf_index = prog_data->base.binding_table.abo_start +
839         (unsigned) instr->const_index[0];
840      const vec4_builder bld =
841         vec4_builder(this).at_end().annotate(current_annotation, base_ir);
842
      /* Get some metadata from the atomic counter intrinsic. */
844      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
845
846      /* Get the arguments of the atomic intrinsic. */
847      src_reg offset = get_nir_src(instr->src[0], nir_type_int32,
848                                   instr->num_components);
849      const src_reg surface = brw_imm_ud(surf_index);
850      const src_reg src0 = (info->num_srcs >= 2
851                           ? get_nir_src(instr->src[1]) : src_reg());
852      const src_reg src1 = (info->num_srcs >= 3
853                           ? get_nir_src(instr->src[2]) : src_reg());
854
855      src_reg tmp;
856
857      dest = get_nir_dest(instr->dest);
858
859      if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
860         tmp = emit_untyped_read(bld, surface, offset, 1, 1);
861      } else {
862         tmp = emit_untyped_atomic(bld, surface, offset,
863                                   src0, src1,
864                                   1, 1,
865                                   get_atomic_counter_op(instr->intrinsic));
866      }
867
868      bld.MOV(retype(dest, tmp.type), tmp);
869      brw_mark_surface_used(stage_prog_data, surf_index);
870      break;
871   }
872
873   case nir_intrinsic_load_ubo: {
874      nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]);
875      src_reg surf_index;
876
877      dest = get_nir_dest(instr->dest);
878
879      if (const_block_index) {
880         /* The block index is a constant, so just emit the binding table entry
881          * as an immediate.
882          */
883         const unsigned index = prog_data->base.binding_table.ubo_start +
884                                const_block_index->u32[0];
885         surf_index = brw_imm_ud(index);
886         brw_mark_surface_used(&prog_data->base, index);
887      } else {
888         /* The block index is not a constant. Evaluate the index expression
889          * per-channel and add the base UBO index; we have to select a value
890          * from any live channel.
891          */
892         surf_index = src_reg(this, glsl_type::uint_type);
893         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32,
894                                                   instr->num_components),
895                  brw_imm_ud(prog_data->base.binding_table.ubo_start)));
896         surf_index = emit_uniformize(surf_index);
897
898         /* Assume this may touch any UBO. It would be nice to provide
899          * a tighter bound, but the array information is already lowered away.
900          */
901         brw_mark_surface_used(&prog_data->base,
902                               prog_data->base.binding_table.ubo_start +
903                               nir->info->num_ubos - 1);
904      }
905
906      src_reg offset_reg;
907      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
908      if (const_offset) {
909         offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
910      } else {
911         offset_reg = get_nir_src(instr->src[1], nir_type_uint32, 1);
912      }
913
914      src_reg packed_consts;
915      if (nir_dest_bit_size(instr->dest) == 32) {
916         packed_consts = src_reg(this, glsl_type::vec4_type);
917         emit_pull_constant_load_reg(dst_reg(packed_consts),
918                                     surf_index,
919                                     offset_reg,
920                                     NULL, NULL /* before_block/inst */);
921      } else {
922         src_reg temp = src_reg(this, glsl_type::dvec4_type);
923         src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F);
924
925         emit_pull_constant_load_reg(dst_reg(temp_float),
926                                     surf_index, offset_reg, NULL, NULL);
927         if (offset_reg.file == IMM)
928            offset_reg.ud += 16;
929         else
930            emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u)));
931         emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
932                                     surf_index, offset_reg, NULL, NULL);
933
934         packed_consts = src_reg(this, glsl_type::dvec4_type);
935         shuffle_64bit_data(dst_reg(packed_consts), temp, false);
936      }
937
938      packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
939      if (const_offset) {
940         unsigned type_size = type_sz(dest.type);
941         packed_consts.swizzle +=
942            BRW_SWIZZLE4(const_offset->u32[0] % 16 / type_size,
943                         const_offset->u32[0] % 16 / type_size,
944                         const_offset->u32[0] % 16 / type_size,
945                         const_offset->u32[0] % 16 / type_size);
946      }
947
948      emit(MOV(dest, retype(packed_consts, dest.type)));
949
950      break;
951   }
952
953   case nir_intrinsic_memory_barrier: {
954      const vec4_builder bld =
955         vec4_builder(this).at_end().annotate(current_annotation, base_ir);
956      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
957      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
958         ->size_written = 2 * REG_SIZE;
959      break;
960   }
961
962   case nir_intrinsic_shader_clock: {
963      /* We cannot do anything if there is an event, so ignore it for now */
964      const src_reg shader_clock = get_timestamp();
965      const enum brw_reg_type type = brw_type_for_base_type(glsl_type::uvec2_type);
966
967      dest = get_nir_dest(instr->dest, type);
968      emit(MOV(dest, shader_clock));
969      break;
970   }
971
972   default:
973      unreachable("Unknown intrinsic");
974   }
975}
976
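/* Common implementation for all SSBO atomics: resolve the surface index,
 * gather the offset and data operands, and emit a single untyped atomic
 * message.
 */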
977void
978vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
979{
980   dst_reg dest;
981   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
982      dest = get_nir_dest(instr->dest);
983
984   src_reg surface;
985   nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
986   if (const_surface) {
987      unsigned surf_index = prog_data->base.binding_table.ssbo_start +
988                            const_surface->u32[0];
989      surface = brw_imm_ud(surf_index);
990      brw_mark_surface_used(&prog_data->base, surf_index);
991   } else {
992      surface = src_reg(this, glsl_type::uint_type);
993      emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]),
994               brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
995
      /* Assume this may touch any SSBO. This is the same as we do for other
       * UBO/SSBO accesses with a non-constant surface index.
998       */
999      brw_mark_surface_used(&prog_data->base,
1000                            prog_data->base.binding_table.ssbo_start +
1001                            nir->info->num_ssbos - 1);
1002   }
1003
1004   src_reg offset = get_nir_src(instr->src[1], 1);
1005   src_reg data1 = get_nir_src(instr->src[2], 1);
1006   src_reg data2;
1007   if (op == BRW_AOP_CMPWR)
1008      data2 = get_nir_src(instr->src[3], 1);
1009
   /* Emit the actual atomic operation */
1011   const vec4_builder bld =
1012      vec4_builder(this).at_end().annotate(current_annotation, base_ir);
1013
1014   src_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
1015                                               data1, data2,
1016                                               1 /* dims */, 1 /* rsize */,
1017                                               op,
1018                                               BRW_PREDICATE_NONE);
1019   dest.type = atomic_result.type;
1020   bld.MOV(dest, atomic_result);
1021}
1022
1023static unsigned
1024brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
1025{
1026   return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1027}
1028
1029static enum brw_conditional_mod
1030brw_conditional_for_nir_comparison(nir_op op)
1031{
1032   switch (op) {
1033   case nir_op_flt:
1034   case nir_op_ilt:
1035   case nir_op_ult:
1036      return BRW_CONDITIONAL_L;
1037
1038   case nir_op_fge:
1039   case nir_op_ige:
1040   case nir_op_uge:
1041      return BRW_CONDITIONAL_GE;
1042
1043   case nir_op_feq:
1044   case nir_op_ieq:
1045   case nir_op_ball_fequal2:
1046   case nir_op_ball_iequal2:
1047   case nir_op_ball_fequal3:
1048   case nir_op_ball_iequal3:
1049   case nir_op_ball_fequal4:
1050   case nir_op_ball_iequal4:
1051      return BRW_CONDITIONAL_Z;
1052
1053   case nir_op_fne:
1054   case nir_op_ine:
1055   case nir_op_bany_fnequal2:
1056   case nir_op_bany_inequal2:
1057   case nir_op_bany_fnequal3:
1058   case nir_op_bany_inequal3:
1059   case nir_op_bany_fnequal4:
1060   case nir_op_bany_inequal4:
1061      return BRW_CONDITIONAL_NZ;
1062
1063   default:
1064      unreachable("not reached: bad operation for comparison");
1065   }
1066}
1067
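/* If the boolean source of this instruction comes straight from a vector
 * any()/all() comparison, emit that comparison as a CMP and return the
 * matching ANY4H/ALL4H predicate, so the caller can predicate on the flag
 * directly instead of consuming the boolean result.
 */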
1068bool
1069vec4_visitor::optimize_predicate(nir_alu_instr *instr,
1070                                 enum brw_predicate *predicate)
1071{
1072   if (!instr->src[0].src.is_ssa ||
1073       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
1074      return false;
1075
1076   nir_alu_instr *cmp_instr =
1077      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
1078
1079   switch (cmp_instr->op) {
1080   case nir_op_bany_fnequal2:
1081   case nir_op_bany_inequal2:
1082   case nir_op_bany_fnequal3:
1083   case nir_op_bany_inequal3:
1084   case nir_op_bany_fnequal4:
1085   case nir_op_bany_inequal4:
1086      *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1087      break;
1088   case nir_op_ball_fequal2:
1089   case nir_op_ball_iequal2:
1090   case nir_op_ball_fequal3:
1091   case nir_op_ball_iequal3:
1092   case nir_op_ball_fequal4:
1093   case nir_op_ball_iequal4:
1094      *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1095      break;
1096   default:
1097      return false;
1098   }
1099
1100   unsigned size_swizzle =
1101      brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]);
1102
1103   src_reg op[2];
1104   assert(nir_op_infos[cmp_instr->op].num_inputs == 2);
1105   for (unsigned i = 0; i < 2; i++) {
1106      nir_alu_type type = nir_op_infos[cmp_instr->op].input_types[i];
1107      unsigned bit_size = nir_src_bit_size(cmp_instr->src[i].src);
1108      type = (nir_alu_type) (((unsigned) type) | bit_size);
1109      op[i] = get_nir_src(cmp_instr->src[i].src, type, 4);
1110      unsigned base_swizzle =
1111         brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle);
1112      op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle);
1113      op[i].abs = cmp_instr->src[i].abs;
1114      op[i].negate = cmp_instr->src[i].negate;
1115   }
1116
1117   emit(CMP(dst_null_d(), op[0], op[1],
1118            brw_conditional_for_nir_comparison(cmp_instr->op)));
1119
1120   return true;
1121}
1122
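/* Implement findMSB() in terms of LZD: count leading zeros and convert the
 * MSB-side count into the LSB-relative bit index GLSL expects.
 */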
1123static void
1124emit_find_msb_using_lzd(const vec4_builder &bld,
1125                        const dst_reg &dst,
1126                        const src_reg &src,
1127                        bool is_signed)
1128{
1129   vec4_instruction *inst;
1130   src_reg temp = src;
1131
1132   if (is_signed) {
1133      /* LZD of an absolute value source almost always does the right
       * thing.  There are a few problem values:
1135       *
1136       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
1137       *   0.  However, findMSB(int(0x80000000)) == 30.
1138       *
1139       * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
1140       *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1141       *
1142       *    For a value of zero or negative one, -1 will be returned.
1143       *
1144       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
1145       *   findMSB(-(1<<x)) should return x-1.
1146       *
1147       * For all negative number cases, including 0x80000000 and
1148       * 0xffffffff, the correct value is obtained from LZD if instead of
1149       * negating the (already negative) value the logical-not is used.  A
       * conditional logical-not can be achieved in two instructions.
1151       */
1152      temp = src_reg(bld.vgrf(BRW_REGISTER_TYPE_D));
1153
1154      bld.ASR(dst_reg(temp), src, brw_imm_d(31));
1155      bld.XOR(dst_reg(temp), temp, src);
1156   }
1157
1158   bld.LZD(retype(dst, BRW_REGISTER_TYPE_UD),
1159           retype(temp, BRW_REGISTER_TYPE_UD));
1160
1161   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
1162    * from the LSB side. Subtract the result from 31 to convert the MSB count
1163    * into an LSB count.  If no bits are set, LZD will return 32.  31-32 = -1,
1164    * which is exactly what findMSB() is supposed to return.
1165    */
1166   inst = bld.ADD(dst, retype(src_reg(dst), BRW_REGISTER_TYPE_D),
1167                  brw_imm_d(31));
1168   inst->src[0].negate = true;
1169}
1170
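/* Convert a DF source to the given 32-bit type, going through a dvec4
 * temporary and VEC4_OPCODE_FROM_DOUBLE.
 */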
1171void
1172vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src,
1173                                          bool saturate,
1174                                          brw_reg_type single_type)
1175{
1176   /* BDW PRM vol 15 - workarounds:
1177    * DF->f format conversion for Align16 has wrong emask calculation when
1178    * source is immediate.
1179    */
1180   if (devinfo->gen == 8 && single_type == BRW_REGISTER_TYPE_F &&
1181       src.file == BRW_IMMEDIATE_VALUE) {
1182      vec4_instruction *inst = emit(MOV(dst, brw_imm_f(src.df)));
1183      inst->saturate = saturate;
1184      return;
1185   }
1186
1187   dst_reg temp = dst_reg(this, glsl_type::dvec4_type);
1188   emit(MOV(temp, src));
1189
1190   dst_reg temp2 = dst_reg(this, glsl_type::dvec4_type);
1191   temp2 = retype(temp2, single_type);
1192   emit(VEC4_OPCODE_FROM_DOUBLE, temp2, src_reg(temp))
1193      ->size_written = 2 * REG_SIZE;
1194
1195   vec4_instruction *inst = emit(MOV(dst, src_reg(temp2)));
1196   inst->saturate = saturate;
1197}
1198
1199void
1200vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src,
1201                                        bool saturate,
1202                                        brw_reg_type single_type)
1203{
1204   dst_reg tmp_dst = dst_reg(src_reg(this, glsl_type::dvec4_type));
1205   src_reg tmp_src = retype(src_reg(this, glsl_type::vec4_type), single_type);
1206   emit(MOV(dst_reg(tmp_src), retype(src, single_type)));
1207   emit(VEC4_OPCODE_TO_DOUBLE, tmp_dst, tmp_src);
1208   vec4_instruction *inst = emit(MOV(dst, src_reg(tmp_dst)));
1209   inst->saturate = saturate;
1210}
1211
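/* Build a source register holding an immediate double, working around the
 * lack of direct DF immediate support on gen7 and gen7.5.
 */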
1212src_reg
1213vec4_visitor::setup_imm_df(double v)
1214{
1215   assert(devinfo->gen >= 7);
1216
1217   if (devinfo->gen >= 8)
1218      return brw_imm_df(v);
1219
   /* gen7.5 does not support DF immediates directly, but the DIM
    * instruction allows us to set a 64-bit immediate value.
1222    */
1223   if (devinfo->is_haswell) {
1224      dst_reg dst = retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_DF);
1225      emit(DIM(dst, brw_imm_df(v)))->force_writemask_all = true;
1226      return swizzle(src_reg(retype(dst, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX);
1227   }
1228
1229   /* gen7 does not support DF immediates */
1230   union {
1231      double d;
1232      struct {
1233         uint32_t i1;
1234         uint32_t i2;
1235      };
1236   } di;
1237
1238   di.d = v;
1239
   /* Write the low 32 bits of the constant to the X:UD channel and the
    * high 32 bits to the Y:UD channel to build the constant in a VGRF.
1242    * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes
1243    * two SIMD8 registers in SIMD4x2 execution. Finally, return a swizzle
1244    * XXXX so any access to the VGRF only reads the constant data in these
1245    * channels.
1246    */
1247   const dst_reg tmp =
1248      retype(dst_reg(VGRF, alloc.allocate(2)), BRW_REGISTER_TYPE_UD);
1249   for (int n = 0; n < 2; n++) {
1250      emit(MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), brw_imm_ud(di.i1)))
1251         ->force_writemask_all = true;
1252      emit(MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), brw_imm_ud(di.i2)))
1253         ->force_writemask_all = true;
1254   }
1255
1256   return swizzle(src_reg(retype(tmp, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX);
1257}
1258
1259void
1260vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
1261{
1262   vec4_instruction *inst;
1263
1264   nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type |
1265                                           nir_dest_bit_size(instr->dest.dest));
1266   dst_reg dst = get_nir_dest(instr->dest.dest, dst_type);
1267   dst.writemask = instr->dest.write_mask;
1268
1269   src_reg op[4];
1270   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1271      nir_alu_type src_type = (nir_alu_type)
1272         (nir_op_infos[instr->op].input_types[i] |
1273          nir_src_bit_size(instr->src[i].src));
1274      op[i] = get_nir_src(instr->src[i].src, src_type, 4);
1275      op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle);
1276      op[i].abs = instr->src[i].abs;
1277      op[i].negate = instr->src[i].negate;
1278   }
1279
1280   switch (instr->op) {
1281   case nir_op_imov:
1282   case nir_op_fmov:
1283      inst = emit(MOV(dst, op[0]));
1284      inst->saturate = instr->dest.saturate;
1285      break;
1286
1287   case nir_op_vec2:
1288   case nir_op_vec3:
1289   case nir_op_vec4:
1290      unreachable("not reached: should be handled by lower_vec_to_movs()");
1291
1292   case nir_op_i2f:
1293   case nir_op_u2f:
1294      inst = emit(MOV(dst, op[0]));
1295      inst->saturate = instr->dest.saturate;
1296      break;
1297
1298   case nir_op_f2i:
1299   case nir_op_f2u:
1300      inst = emit(MOV(dst, op[0]));
1301      break;
1302
1303   case nir_op_d2f:
1304      emit_conversion_from_double(dst, op[0], instr->dest.saturate,
1305                                  BRW_REGISTER_TYPE_F);
1306      break;
1307
1308   case nir_op_f2d:
1309      emit_conversion_to_double(dst, op[0], instr->dest.saturate,
1310                                BRW_REGISTER_TYPE_F);
1311      break;
1312
1313   case nir_op_d2i:
1314   case nir_op_d2u:
1315      emit_conversion_from_double(dst, op[0], instr->dest.saturate,
1316                                  instr->op == nir_op_d2i ? BRW_REGISTER_TYPE_D :
1317                                                            BRW_REGISTER_TYPE_UD);
1318      break;
1319
1320   case nir_op_i2d:
1321   case nir_op_u2d:
1322      emit_conversion_to_double(dst, op[0], instr->dest.saturate,
1323                                instr->op == nir_op_i2d ? BRW_REGISTER_TYPE_D :
1324                                                          BRW_REGISTER_TYPE_UD);
1325      break;
1326
1327   case nir_op_iadd:
1328      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1329   case nir_op_fadd:
1330      inst = emit(ADD(dst, op[0], op[1]));
1331      inst->saturate = instr->dest.saturate;
1332      break;
1333
1334   case nir_op_fmul:
1335      inst = emit(MUL(dst, op[0], op[1]));
1336      inst->saturate = instr->dest.saturate;
1337      break;
1338
1339   case nir_op_imul: {
1340      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1341      if (devinfo->gen < 8) {
1342         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
1343         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
1344
1345         /* For integer multiplication, the MUL uses the low 16 bits of one of
1346          * the operands (src0 through SNB, src1 on IVB and later). The MACH
1347          * accumulates in the contribution of the upper 16 bits of that
1348          * operand. If we can determine that one of the args is in the low
1349          * 16 bits, though, we can just emit a single MUL.
1350          */
1351         if (value0 && value0->u32[0] < (1 << 16)) {
1352            if (devinfo->gen < 7)
1353               emit(MUL(dst, op[0], op[1]));
1354            else
1355               emit(MUL(dst, op[1], op[0]));
1356         } else if (value1 && value1->u32[0] < (1 << 16)) {
1357            if (devinfo->gen < 7)
1358               emit(MUL(dst, op[1], op[0]));
1359            else
1360               emit(MUL(dst, op[0], op[1]));
1361         } else {
1362            struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
1363
1364            emit(MUL(acc, op[0], op[1]));
1365            emit(MACH(dst_null_d(), op[0], op[1]));
1366            emit(MOV(dst, src_reg(acc)));
1367         }
1368      } else {
1369	 emit(MUL(dst, op[0], op[1]));
1370      }
1371      break;
1372   }
1373
1374   case nir_op_imul_high:
1375   case nir_op_umul_high: {
1376      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1377      struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
1378
1379      if (devinfo->gen >= 8)
1380         emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW)));
1381      else
1382         emit(MUL(acc, op[0], op[1]));
1383
1384      emit(MACH(dst, op[0], op[1]));
1385      break;
1386   }
1387
1388   case nir_op_frcp:
1389      inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]);
1390      inst->saturate = instr->dest.saturate;
1391      break;
1392
1393   case nir_op_fexp2:
1394      inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]);
1395      inst->saturate = instr->dest.saturate;
1396      break;
1397
1398   case nir_op_flog2:
1399      inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]);
1400      inst->saturate = instr->dest.saturate;
1401      break;
1402
1403   case nir_op_fsin:
1404      inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
1405      inst->saturate = instr->dest.saturate;
1406      break;
1407
1408   case nir_op_fcos:
1409      inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
1410      inst->saturate = instr->dest.saturate;
1411      break;
1412
1413   case nir_op_idiv:
1414   case nir_op_udiv:
1415      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1416      emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
1417      break;
1418
1419   case nir_op_umod:
1420   case nir_op_irem:
1421      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1422       * appears that our hardware just does the right thing for signed
1423       * remainder.
1424       */
1425      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1426      emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
1427      break;
1428
1429   case nir_op_imod: {
      /* Get a regular C-style remainder.  If a % b != 0, set the predicate. */
1431      inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
1432
1433      /* Math instructions don't support conditional mod */
1434      inst = emit(MOV(dst_null_d(), src_reg(dst)));
1435      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1436
1437      /* Now, we need to determine if signs of the sources are different.
1438       * When we XOR the sources, the top bit is 0 if they are the same and 1
1439       * if they are different.  We can then use a conditional modifier to
1440       * turn that into a predicate.  This leads us to an XOR.l instruction.
1441       *
1442       * Technically, according to the PRM, you're not allowed to use .l on a
       * XOR instruction.  However, empirical experiments and Curro's reading
1444       * of the simulator source both indicate that it's safe.
1445       */
1446      src_reg tmp = src_reg(this, glsl_type::ivec4_type);
1447      inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
1448      inst->predicate = BRW_PREDICATE_NORMAL;
1449      inst->conditional_mod = BRW_CONDITIONAL_L;
1450
1451      /* If the result of the initial remainder operation is non-zero and the
1452       * two sources have different signs, add in a copy of op[1] to get the
1453       * final integer modulus value.
1454       */
1455      inst = emit(ADD(dst, src_reg(dst), op[1]));
1456      inst->predicate = BRW_PREDICATE_NORMAL;
1457      break;
1458   }
1459
1460   case nir_op_ldexp:
1461      unreachable("not reached: should be handled by ldexp_to_arith()");
1462
1463   case nir_op_fsqrt:
1464      inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]);
1465      inst->saturate = instr->dest.saturate;
1466      break;
1467
1468   case nir_op_frsq:
1469      inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]);
1470      inst->saturate = instr->dest.saturate;
1471      break;
1472
1473   case nir_op_fpow:
1474      inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]);
1475      inst->saturate = instr->dest.saturate;
1476      break;
1477
1478   case nir_op_uadd_carry: {
1479      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1480      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1481
1482      emit(ADDC(dst_null_ud(), op[0], op[1]));
1483      emit(MOV(dst, src_reg(acc)));
1484      break;
1485   }
1486
1487   case nir_op_usub_borrow: {
1488      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1489      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1490
1491      emit(SUBB(dst_null_ud(), op[0], op[1]));
1492      emit(MOV(dst, src_reg(acc)));
1493      break;
1494   }
1495
1496   case nir_op_ftrunc:
1497      inst = emit(RNDZ(dst, op[0]));
1498      inst->saturate = instr->dest.saturate;
1499      break;
1500
1501   case nir_op_fceil: {
1502      src_reg tmp = src_reg(this, glsl_type::float_type);
1503      tmp.swizzle =
1504         brw_swizzle_for_size(instr->src[0].src.is_ssa ?
1505                              instr->src[0].src.ssa->num_components :
1506                              instr->src[0].src.reg.reg->num_components);
1507
1508      op[0].negate = !op[0].negate;
1509      emit(RNDD(dst_reg(tmp), op[0]));
1510      tmp.negate = true;
1511      inst = emit(MOV(dst, tmp));
1512      inst->saturate = instr->dest.saturate;
1513      break;
1514   }
1515
1516   case nir_op_ffloor:
1517      inst = emit(RNDD(dst, op[0]));
1518      inst->saturate = instr->dest.saturate;
1519      break;
1520
1521   case nir_op_ffract:
1522      inst = emit(FRC(dst, op[0]));
1523      inst->saturate = instr->dest.saturate;
1524      break;
1525
1526   case nir_op_fround_even:
1527      inst = emit(RNDE(dst, op[0]));
1528      inst->saturate = instr->dest.saturate;
1529      break;
1530
1531   case nir_op_fquantize2f16: {
1532      /* See also vec4_visitor::emit_pack_half_2x16() */
1533      src_reg tmp16 = src_reg(this, glsl_type::uvec4_type);
1534      src_reg tmp32 = src_reg(this, glsl_type::vec4_type);
1535      src_reg zero = src_reg(this, glsl_type::vec4_type);
1536
1537      /* Check for denormal */
1538      src_reg abs_src0 = op[0];
1539      abs_src0.abs = true;
1540      emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1541               BRW_CONDITIONAL_L));
1542      /* Get the appropriately signed zero */
1543      emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD),
1544               retype(op[0], BRW_REGISTER_TYPE_UD),
1545               brw_imm_ud(0x80000000)));
1546      /* Do the actual F32 -> F16 -> F32 conversion */
1547      emit(F32TO16(dst_reg(tmp16), op[0]));
1548      emit(F16TO32(dst_reg(tmp32), tmp16));
      /* Select the signed zero when the denormal check above fired,
       * otherwise the converted value.
       */
1550      inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32);
1551      inst->predicate = BRW_PREDICATE_NORMAL;
1552      inst->saturate = instr->dest.saturate;
1553      break;
1554   }
1555
1556   case nir_op_imin:
1557   case nir_op_umin:
1558      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1559   case nir_op_fmin:
1560      inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]);
1561      inst->saturate = instr->dest.saturate;
1562      break;
1563
1564   case nir_op_imax:
1565   case nir_op_umax:
1566      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1567   case nir_op_fmax:
1568      inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]);
1569      inst->saturate = instr->dest.saturate;
1570      break;
1571
1572   case nir_op_fddx:
1573   case nir_op_fddx_coarse:
1574   case nir_op_fddx_fine:
1575   case nir_op_fddy:
1576   case nir_op_fddy_coarse:
1577   case nir_op_fddy_fine:
1578      unreachable("derivatives are not valid in vertex shaders");
1579
1580   case nir_op_ilt:
1581   case nir_op_ult:
1582   case nir_op_ige:
1583   case nir_op_uge:
1584   case nir_op_ieq:
1585   case nir_op_ine:
1586      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1587      /* Fallthrough */
1588   case nir_op_flt:
1589   case nir_op_fge:
1590   case nir_op_feq:
1591   case nir_op_fne: {
1592      enum brw_conditional_mod conditional_mod =
1593         brw_conditional_for_nir_comparison(instr->op);
1594
1595      if (nir_src_bit_size(instr->src[0].src) < 64) {
1596         emit(CMP(dst, op[0], op[1], conditional_mod));
1597      } else {
         /* Produce a 32-bit boolean result from the DF comparison by selecting
          * only the low 32 bits of each DF produced.  Do this in a temporary
          * so we can then move from there to the result using align16 again,
          * honoring the original writemask.
1602          */
1603         dst_reg temp = dst_reg(this, glsl_type::dvec4_type);
1604         emit(CMP(temp, op[0], op[1], conditional_mod));
1605         dst_reg result = dst_reg(this, glsl_type::bvec4_type);
1606         emit(VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp));
1607         emit(MOV(dst, src_reg(result)));
1608      }
1609      break;
1610   }
1611
1612   case nir_op_ball_iequal2:
1613   case nir_op_ball_iequal3:
1614   case nir_op_ball_iequal4:
1615      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1616      /* Fallthrough */
1617   case nir_op_ball_fequal2:
1618   case nir_op_ball_fequal3:
1619   case nir_op_ball_fequal4: {
1620      unsigned swiz =
1621         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
1622
1623      emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
1624               brw_conditional_for_nir_comparison(instr->op)));
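      /* Initialize the result to false (0), then overwrite it with true (~0)
       * only where all four flag channels from the CMP are set.
       */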
1625      emit(MOV(dst, brw_imm_d(0)));
1626      inst = emit(MOV(dst, brw_imm_d(~0)));
1627      inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1628      break;
1629   }
1630
1631   case nir_op_bany_inequal2:
1632   case nir_op_bany_inequal3:
1633   case nir_op_bany_inequal4:
1634      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1635      /* Fallthrough */
1636   case nir_op_bany_fnequal2:
1637   case nir_op_bany_fnequal3:
1638   case nir_op_bany_fnequal4: {
1639      unsigned swiz =
1640         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
1641
1642      emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
1643               brw_conditional_for_nir_comparison(instr->op)));
1644
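      /* Initialize the result to false (0), then overwrite it with true (~0)
       * where any of the four flag channels from the CMP is set.
       */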
1645      emit(MOV(dst, brw_imm_d(0)));
1646      inst = emit(MOV(dst, brw_imm_d(~0)));
1647      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1648      break;
1649   }
1650
1651   case nir_op_inot:
1652      assert(nir_dest_bit_size(instr->dest.dest) < 64);
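      /* On Gen8+ the negate source modifier on logic instructions is treated
       * as a bitwise NOT rather than arithmetic negation, so resolve any
       * source modifiers into a temporary first.  The same applies to the
       * ixor/ior/iand cases below.
       */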
1653      if (devinfo->gen >= 8) {
1654         op[0] = resolve_source_modifiers(op[0]);
1655      }
1656      emit(NOT(dst, op[0]));
1657      break;
1658
1659   case nir_op_ixor:
1660      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1661      if (devinfo->gen >= 8) {
1662         op[0] = resolve_source_modifiers(op[0]);
1663         op[1] = resolve_source_modifiers(op[1]);
1664      }
1665      emit(XOR(dst, op[0], op[1]));
1666      break;
1667
1668   case nir_op_ior:
1669      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1670      if (devinfo->gen >= 8) {
1671         op[0] = resolve_source_modifiers(op[0]);
1672         op[1] = resolve_source_modifiers(op[1]);
1673      }
1674      emit(OR(dst, op[0], op[1]));
1675      break;
1676
1677   case nir_op_iand:
1678      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1679      if (devinfo->gen >= 8) {
1680         op[0] = resolve_source_modifiers(op[0]);
1681         op[1] = resolve_source_modifiers(op[1]);
1682      }
1683      emit(AND(dst, op[0], op[1]));
1684      break;
1685
1686   case nir_op_b2i:
1687   case nir_op_b2f:
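      /* Booleans are stored as 0/~0 (0/-1); negating gives 0/1, and the
       * type-converting MOV turns that into 0.0f/1.0f for b2f.
       */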
1688      emit(MOV(dst, negate(op[0])));
1689      break;
1690
1691   case nir_op_f2b:
1692      emit(CMP(dst, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
1693      break;
1694
1695   case nir_op_d2b: {
      /* We use a MOV with conditional_mod to check whether the provided value
       * is 0.0.  Applying a source modifier flushes denormalized numbers to
       * zero, which is what we want, so we set abs on the source operand;
       * source modifiers do not affect the result of a comparison against 0.0.
1700       */
1701      src_reg value = op[0];
1702      value.abs = true;
1703      vec4_instruction *inst = emit(MOV(dst_null_df(), value));
1704      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1705
1706      src_reg one = src_reg(this, glsl_type::ivec4_type);
1707      emit(MOV(dst_reg(one), brw_imm_d(~0)));
1708      inst = emit(BRW_OPCODE_SEL, dst, one, brw_imm_d(0));
1709      inst->predicate = BRW_PREDICATE_NORMAL;
1710      break;
1711   }
1712
1713   case nir_op_i2b:
1714      emit(CMP(dst, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
1715      break;
1716
1717   case nir_op_fnoise1_1:
1718   case nir_op_fnoise1_2:
1719   case nir_op_fnoise1_3:
1720   case nir_op_fnoise1_4:
1721   case nir_op_fnoise2_1:
1722   case nir_op_fnoise2_2:
1723   case nir_op_fnoise2_3:
1724   case nir_op_fnoise2_4:
1725   case nir_op_fnoise3_1:
1726   case nir_op_fnoise3_2:
1727   case nir_op_fnoise3_3:
1728   case nir_op_fnoise3_4:
1729   case nir_op_fnoise4_1:
1730   case nir_op_fnoise4_2:
1731   case nir_op_fnoise4_3:
1732   case nir_op_fnoise4_4:
1733      unreachable("not reached: should be handled by lower_noise");
1734
1735   case nir_op_unpack_half_2x16_split_x:
1736   case nir_op_unpack_half_2x16_split_y:
1737   case nir_op_pack_half_2x16_split:
1738      unreachable("not reached: should not occur in vertex shader");
1739
1740   case nir_op_unpack_snorm_2x16:
1741   case nir_op_unpack_unorm_2x16:
1742   case nir_op_pack_snorm_2x16:
1743   case nir_op_pack_unorm_2x16:
1744      unreachable("not reached: should be handled by lower_packing_builtins");
1745
1746   case nir_op_pack_uvec4_to_uint:
1747      unreachable("not reached");
1748
1749   case nir_op_pack_uvec2_to_uint: {
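      /* Pack the low 16 bits of the .x and .y components into one 32-bit
       * value: dst = (src.y << 16) | (src.x & 0xffff).
       */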
1750      dst_reg tmp1 = dst_reg(this, glsl_type::uint_type);
1751      tmp1.writemask = WRITEMASK_X;
1752      op[0].swizzle = BRW_SWIZZLE_YYYY;
1753      emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u))));
1754
1755      dst_reg tmp2 = dst_reg(this, glsl_type::uint_type);
1756      tmp2.writemask = WRITEMASK_X;
1757      op[0].swizzle = BRW_SWIZZLE_XXXX;
1758      emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu))));
1759
1760      emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
1761      break;
1762   }
1763
1764   case nir_op_pack_double_2x32_split: {
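      /* Assemble a double from two 32-bit halves: op[0] supplies the low
       * dword and op[1] the high dword of each component.
       */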
1765      dst_reg result = dst_reg(this, glsl_type::dvec4_type);
1766      dst_reg tmp = dst_reg(this, glsl_type::uvec4_type);
1767      emit(MOV(tmp, retype(op[0], BRW_REGISTER_TYPE_UD)));
1768      emit(VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp));
1769      emit(MOV(tmp, retype(op[1], BRW_REGISTER_TYPE_UD)));
1770      emit(VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp));
1771      emit(MOV(dst, src_reg(result)));
1772      break;
1773   }
1774
1775   case nir_op_unpack_double_2x32_split_x:
1776   case nir_op_unpack_double_2x32_split_y: {
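      /* Extract either the low (split_x) or the high (split_y) 32 bits of
       * each double-precision component.
       */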
1777      enum opcode oper = (instr->op == nir_op_unpack_double_2x32_split_x) ?
1778         VEC4_OPCODE_PICK_LOW_32BIT : VEC4_OPCODE_PICK_HIGH_32BIT;
1779      dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
1780      emit(MOV(tmp, op[0]));
1781      dst_reg tmp2 = dst_reg(this, glsl_type::uvec4_type);
1782      emit(oper, tmp2, src_reg(tmp));
1783      emit(MOV(dst, src_reg(tmp2)));
1784      break;
1785   }
1786
1787   case nir_op_unpack_half_2x16:
      /* NIR does not guarantee a correct swizzle outside the boundaries of a
       * vector, and emit_unpack_half_2x16 uses its source operand in an
       * operation with WRITEMASK_Y even though our source operand has only
       * one component, so it would read incorrect data and cause regressions
       * in Piglit.  Replicate the swizzle of the first component across the
       * remaining components to avoid this.  The vec4_visitor IR code path
       * does not need this because the operand already has the correct
       * swizzle.
       */
1796      op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle);
1797      emit_unpack_half_2x16(dst, op[0]);
1798      break;
1799
1800   case nir_op_pack_half_2x16:
1801      emit_pack_half_2x16(dst, op[0]);
1802      break;
1803
1804   case nir_op_unpack_unorm_4x8:
1805      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1806      emit_unpack_unorm_4x8(dst, op[0]);
1807      break;
1808
1809   case nir_op_pack_unorm_4x8:
1810      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1811      emit_pack_unorm_4x8(dst, op[0]);
1812      break;
1813
1814   case nir_op_unpack_snorm_4x8:
1815      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1816      emit_unpack_snorm_4x8(dst, op[0]);
1817      break;
1818
1819   case nir_op_pack_snorm_4x8:
1820      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1821      emit_pack_snorm_4x8(dst, op[0]);
1822      break;
1823
1824   case nir_op_bitfield_reverse:
1825      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1826      emit(BFREV(dst, op[0]));
1827      break;
1828
1829   case nir_op_bit_count:
1830      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1831      emit(CBIT(dst, op[0]));
1832      break;
1833
1834   case nir_op_ufind_msb:
1835      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1836      emit_find_msb_using_lzd(vec4_builder(this).at_end(), dst, op[0], false);
1837      break;
1838
1839   case nir_op_ifind_msb: {
1840      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1841      vec4_builder bld = vec4_builder(this).at_end();
1842      src_reg src(dst);
1843
1844      if (devinfo->gen < 7) {
1845         emit_find_msb_using_lzd(bld, dst, op[0], true);
1846      } else {
1847         emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0]));
1848
1849         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1850          * count from the LSB side. If FBH didn't return an error
1851          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1852          * count into an LSB count.
1853          */
1854         bld.CMP(dst_null_d(), src, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1855
1856         inst = bld.ADD(dst, src, brw_imm_d(31));
1857         inst->predicate = BRW_PREDICATE_NORMAL;
1858         inst->src[0].negate = true;
1859      }
1860      break;
1861   }
1862
1863   case nir_op_find_lsb: {
1864      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1865      vec4_builder bld = vec4_builder(this).at_end();
1866
1867      if (devinfo->gen < 7) {
1868         dst_reg temp = bld.vgrf(BRW_REGISTER_TYPE_D);
1869
1870         /* (x & -x) generates a value that consists of only the LSB of x.
1871          * For all powers of 2, findMSB(y) == findLSB(y).
1872          */
1873         src_reg src = src_reg(retype(op[0], BRW_REGISTER_TYPE_D));
1874         src_reg negated_src = src;
1875
1876         /* One must be negated, and the other must be non-negated.  It
1877          * doesn't matter which is which.
1878          */
1879         negated_src.negate = true;
1880         src.negate = false;
1881
1882         bld.AND(temp, src, negated_src);
1883         emit_find_msb_using_lzd(bld, dst, src_reg(temp), false);
1884      } else {
1885         bld.FBL(dst, op[0]);
1886      }
1887      break;
1888   }
1889
1890   case nir_op_ubitfield_extract:
1891   case nir_op_ibitfield_extract:
1892      unreachable("should have been lowered");
1893   case nir_op_ubfe:
1894   case nir_op_ibfe:
1895      assert(nir_dest_bit_size(instr->dest.dest) < 64);
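      /* NIR's bfe sources are (value, offset, bits); the hardware BFE
       * expects (bits, offset, value), hence the reversed operand order.
       */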
1896      op[0] = fix_3src_operand(op[0]);
1897      op[1] = fix_3src_operand(op[1]);
1898      op[2] = fix_3src_operand(op[2]);
1899
1900      emit(BFE(dst, op[2], op[1], op[0]));
1901      break;
1902
1903   case nir_op_bfm:
1904      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1905      emit(BFI1(dst, op[0], op[1]));
1906      break;
1907
1908   case nir_op_bfi:
1909      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1910      op[0] = fix_3src_operand(op[0]);
1911      op[1] = fix_3src_operand(op[1]);
1912      op[2] = fix_3src_operand(op[2]);
1913
1914      emit(BFI2(dst, op[0], op[1], op[2]));
1915      break;
1916
1917   case nir_op_bitfield_insert:
1918      unreachable("not reached: should have been lowered");
1919
1920   case nir_op_fsign:
1921      if (type_sz(op[0].type) < 8) {
1922         /* AND(val, 0x80000000) gives the sign bit.
1923          *
          * A predicated OR then merges in 1.0 (0x3f800000) if val is not
          * zero.
1926          */
1927         emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
1928
1929         op[0].type = BRW_REGISTER_TYPE_UD;
1930         dst.type = BRW_REGISTER_TYPE_UD;
1931         emit(AND(dst, op[0], brw_imm_ud(0x80000000u)));
1932
1933         inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u)));
1934         inst->predicate = BRW_PREDICATE_NORMAL;
1935         dst.type = BRW_REGISTER_TYPE_F;
1936
1937         if (instr->dest.saturate) {
1938            inst = emit(MOV(dst, src_reg(dst)));
1939            inst->saturate = true;
1940         }
1941      } else {
1942         /* For doubles we do the same but we need to consider:
1943          *
1944          * - We use a MOV with conditional_mod instead of a CMP so that we can
1945          *   skip loading a 0.0 immediate. We use a source modifier on the
1946          *   source of the MOV so that we flush denormalized values to 0.
1947          *   Since we want to compare against 0, this won't alter the result.
1948          * - We need to extract the high 32-bit of each DF where the sign
1949          *   is stored.
1950          * - We need to produce a DF result.
1951          */
1952
1953         /* Check for zero */
1954         src_reg value = op[0];
1955         value.abs = true;
1956         inst = emit(MOV(dst_null_df(), value));
1957         inst->conditional_mod = BRW_CONDITIONAL_NZ;
1958
1959         /* AND each high 32-bit channel with 0x80000000u */
1960         dst_reg tmp = dst_reg(this, glsl_type::uvec4_type);
1961         emit(VEC4_OPCODE_PICK_HIGH_32BIT, tmp, op[0]);
1962         emit(AND(tmp, src_reg(tmp), brw_imm_ud(0x80000000u)));
1963
         /* OR the bit pattern of 1.0 (0x3f800000) into each channel,
          * predicated to skip the cases where the channel's value was 0.
1966          */
1967         inst = emit(OR(tmp, src_reg(tmp), brw_imm_ud(0x3f800000u)));
1968         inst->predicate = BRW_PREDICATE_NORMAL;
1969
1970         /* Now convert the result from float to double */
1971         emit_conversion_to_double(dst, src_reg(tmp), instr->dest.saturate,
1972                                   BRW_REGISTER_TYPE_F);
1973      }
1974      break;
1975
1976   case nir_op_isign:
1977      /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1978       *               -> non-negative val generates 0x00000000.
1979       *  Predicated OR sets 1 if val is positive.
1980       */
1981      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1982      emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G));
1983      emit(ASR(dst, op[0], brw_imm_d(31)));
1984      inst = emit(OR(dst, src_reg(dst), brw_imm_d(1)));
1985      inst->predicate = BRW_PREDICATE_NORMAL;
1986      break;
1987
1988   case nir_op_ishl:
1989      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1990      emit(SHL(dst, op[0], op[1]));
1991      break;
1992
1993   case nir_op_ishr:
1994      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1995      emit(ASR(dst, op[0], op[1]));
1996      break;
1997
1998   case nir_op_ushr:
1999      assert(nir_dest_bit_size(instr->dest.dest) < 64);
2000      emit(SHR(dst, op[0], op[1]));
2001      break;
2002
2003   case nir_op_ffma:
2004      if (type_sz(dst.type) == 8) {
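         /* 64-bit operands don't use the 3-source MAD path here; expand
          * fma(a, b, c) into a MUL followed by an ADD.
          */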
2005         dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
2006         emit(MUL(mul_dst, op[1], op[0]));
2007         inst = emit(ADD(dst, src_reg(mul_dst), op[2]));
2008         inst->saturate = instr->dest.saturate;
2009      } else {
2010         op[0] = fix_3src_operand(op[0]);
2011         op[1] = fix_3src_operand(op[1]);
2012         op[2] = fix_3src_operand(op[2]);
2013
2014         inst = emit(MAD(dst, op[2], op[1], op[0]));
2015         inst->saturate = instr->dest.saturate;
2016      }
2017      break;
2018
2019   case nir_op_flrp:
2020      inst = emit_lrp(dst, op[0], op[1], op[2]);
2021      inst->saturate = instr->dest.saturate;
2022      break;
2023
2024   case nir_op_bcsel:
2025      enum brw_predicate predicate;
2026      if (!optimize_predicate(instr, &predicate)) {
2027         emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
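         /* When the destination writes a single channel, use the ALIGN16
          * replicate predicate for that channel so the SEL is controlled by
          * the matching flag bit; otherwise fall back to the normal
          * per-channel predicate.
          */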
2028         switch (dst.writemask) {
2029         case WRITEMASK_X:
2030            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
2031            break;
2032         case WRITEMASK_Y:
2033            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
2034            break;
2035         case WRITEMASK_Z:
2036            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
2037            break;
2038         case WRITEMASK_W:
2039            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
2040            break;
2041         default:
2042            predicate = BRW_PREDICATE_NORMAL;
2043            break;
2044         }
2045      }
2046      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
2047      inst->predicate = predicate;
2048      break;
2049
2050   case nir_op_fdot_replicated2:
2051      inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]);
2052      inst->saturate = instr->dest.saturate;
2053      break;
2054
2055   case nir_op_fdot_replicated3:
2056      inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]);
2057      inst->saturate = instr->dest.saturate;
2058      break;
2059
2060   case nir_op_fdot_replicated4:
2061      inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]);
2062      inst->saturate = instr->dest.saturate;
2063      break;
2064
2065   case nir_op_fdph_replicated:
2066      inst = emit(BRW_OPCODE_DPH, dst, op[0], op[1]);
2067      inst->saturate = instr->dest.saturate;
2068      break;
2069
2070   case nir_op_iabs:
2071   case nir_op_ineg:
2072      assert(nir_dest_bit_size(instr->dest.dest) < 64);
2073   case nir_op_fabs:
2074   case nir_op_fneg:
2075   case nir_op_fsat:
2076      unreachable("not reached: should be lowered by lower_source mods");
2077
2078   case nir_op_fdiv:
2079      unreachable("not reached: should be lowered by DIV_TO_MUL_RCP in the compiler");
2080
2081   case nir_op_fmod:
2082      unreachable("not reached: should be lowered by MOD_TO_FLOOR in the compiler");
2083
2084   case nir_op_fsub:
2085   case nir_op_isub:
2086      unreachable("not reached: should be handled by ir_sub_to_add_neg");
2087
2088   default:
2089      unreachable("Unimplemented ALU operation");
2090   }
2091
2092   /* If we need to do a boolean resolve, replace the result with -(x & 1)
2093    * to sign extend the low bit to 0/~0
2094    */
2095   if (devinfo->gen <= 5 &&
2096       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) ==
2097       BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
2098      dst_reg masked = dst_reg(this, glsl_type::int_type);
2099      masked.writemask = dst.writemask;
2100      emit(AND(masked, src_reg(dst), brw_imm_d(1)));
2101      src_reg masked_neg = src_reg(masked);
2102      masked_neg.negate = true;
2103      emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg));
2104   }
2105}
2106
2107void
2108vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
2109{
2110   switch (instr->type) {
2111   case nir_jump_break:
2112      emit(BRW_OPCODE_BREAK);
2113      break;
2114
2115   case nir_jump_continue:
2116      emit(BRW_OPCODE_CONTINUE);
2117      break;
2118
2119   case nir_jump_return:
2120      /* fall through */
2121   default:
2122      unreachable("unknown jump");
2123   }
2124}
2125
2126enum ir_texture_opcode
2127ir_texture_opcode_for_nir_texop(nir_texop texop)
2128{
2129   enum ir_texture_opcode op;
2130
2131   switch (texop) {
2132   case nir_texop_lod: op = ir_lod; break;
2133   case nir_texop_query_levels: op = ir_query_levels; break;
2134   case nir_texop_texture_samples: op = ir_texture_samples; break;
2135   case nir_texop_tex: op = ir_tex; break;
2136   case nir_texop_tg4: op = ir_tg4; break;
2137   case nir_texop_txb: op = ir_txb; break;
2138   case nir_texop_txd: op = ir_txd; break;
2139   case nir_texop_txf: op = ir_txf; break;
2140   case nir_texop_txf_ms: op = ir_txf_ms; break;
2141   case nir_texop_txl: op = ir_txl; break;
2142   case nir_texop_txs: op = ir_txs; break;
2143   case nir_texop_samples_identical: op = ir_samples_identical; break;
2144   default:
2145      unreachable("unknown texture opcode");
2146   }
2147
2148   return op;
2149}

const glsl_type *
2151glsl_type_for_nir_alu_type(nir_alu_type alu_type,
2152                           unsigned components)
2153{
2154   return glsl_type::get_instance(brw_glsl_base_type_for_nir_type(alu_type),
2155                                  components, 1);
2156}
2157
2158void
2159vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
2160{
2161   unsigned texture = instr->texture_index;
2162   unsigned sampler = instr->sampler_index;
2163   src_reg texture_reg = brw_imm_ud(texture);
2164   src_reg sampler_reg = brw_imm_ud(sampler);
2165   src_reg coordinate;
2166   const glsl_type *coord_type = NULL;
2167   src_reg shadow_comparator;
2168   src_reg offset_value;
2169   src_reg lod, lod2;
2170   src_reg sample_index;
2171   src_reg mcs;
2172
2173   const glsl_type *dest_type =
2174      glsl_type_for_nir_alu_type(instr->dest_type,
2175                                 nir_tex_instr_dest_size(instr));
2176   dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
2177
2178   /* The hardware requires a LOD for buffer textures */
2179   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
2180      lod = brw_imm_d(0);
2181
2182   /* Load the texture operation sources */
2183   uint32_t constant_offset = 0;
2184   for (unsigned i = 0; i < instr->num_srcs; i++) {
2185      switch (instr->src[i].src_type) {
2186      case nir_tex_src_comparator:
2187         shadow_comparator = get_nir_src(instr->src[i].src,
2188                                         BRW_REGISTER_TYPE_F, 1);
2189         break;
2190
2191      case nir_tex_src_coord: {
2192         unsigned src_size = nir_tex_instr_src_size(instr, i);
2193
2194         switch (instr->op) {
2195         case nir_texop_txf:
2196         case nir_texop_txf_ms:
2197         case nir_texop_samples_identical:
2198            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D,
2199                                     src_size);
2200            coord_type = glsl_type::ivec(src_size);
2201            break;
2202
2203         default:
2204            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
2205                                     src_size);
2206            coord_type = glsl_type::vec(src_size);
2207            break;
2208         }
2209         break;
2210      }
2211
2212      case nir_tex_src_ddx:
2213         lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
2214                           nir_tex_instr_src_size(instr, i));
2215         break;
2216
2217      case nir_tex_src_ddy:
2218         lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
2219                           nir_tex_instr_src_size(instr, i));
2220         break;
2221
2222      case nir_tex_src_lod:
2223         switch (instr->op) {
2224         case nir_texop_txs:
2225         case nir_texop_txf:
2226            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
2227            break;
2228
2229         default:
2230            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1);
2231            break;
2232         }
2233         break;
2234
2235      case nir_tex_src_ms_index: {
2236         sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
2237         break;
2238      }
2239
2240      case nir_tex_src_offset: {
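         /* If the offset is a compile-time constant that brw_texture_offset()
          * can pack into the message header, use constant_offset; otherwise
          * pass the offset in a register.
          */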
2241         nir_const_value *const_offset =
2242            nir_src_as_const_value(instr->src[i].src);
2243         if (!const_offset ||
2244             !brw_texture_offset(const_offset->i32,
2245                                 nir_tex_instr_src_size(instr, i),
2246                                 &constant_offset)) {
2247            offset_value =
2248               get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
2249         }
2250         break;
2251      }
2252
2253      case nir_tex_src_texture_offset: {
2254         /* The highest texture which may be used by this operation is
2255          * the last element of the array. Mark it here, because the generator
2256          * doesn't have enough information to determine the bound.
2257          */
2258         uint32_t array_size = instr->texture_array_size;
2259         uint32_t max_used = texture + array_size - 1;
2260         if (instr->op == nir_texop_tg4) {
2261            max_used += prog_data->base.binding_table.gather_texture_start;
2262         } else {
2263            max_used += prog_data->base.binding_table.texture_start;
2264         }
2265
2266         brw_mark_surface_used(&prog_data->base, max_used);
2267
2268         /* Emit code to evaluate the actual indexing expression */
2269         src_reg src = get_nir_src(instr->src[i].src, 1);
2270         src_reg temp(this, glsl_type::uint_type);
2271         emit(ADD(dst_reg(temp), src, brw_imm_ud(texture)));
2272         texture_reg = emit_uniformize(temp);
2273         break;
2274      }
2275
2276      case nir_tex_src_sampler_offset: {
2277         /* Emit code to evaluate the actual indexing expression */
2278         src_reg src = get_nir_src(instr->src[i].src, 1);
2279         src_reg temp(this, glsl_type::uint_type);
2280         emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler)));
2281         sampler_reg = emit_uniformize(temp);
2282         break;
2283      }
2284
2285      case nir_tex_src_projector:
2286         unreachable("Should be lowered by do_lower_texture_projection");
2287
2288      case nir_tex_src_bias:
2289         unreachable("LOD bias is not valid for vertex shaders.\n");
2290
2291      default:
2292         unreachable("unknown texture source");
2293      }
2294   }
2295
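   /* txf_ms and samples_identical need the MCS data for compressed
    * multisample surfaces on Gen7+; other surfaces just use 0.
    */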
2296   if (instr->op == nir_texop_txf_ms ||
2297       instr->op == nir_texop_samples_identical) {
2298      assert(coord_type != NULL);
2299      if (devinfo->gen >= 7 &&
2300          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
2301         mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg);
2302      } else {
2303         mcs = brw_imm_ud(0u);
2304      }
2305   }
2306
2307   /* Stuff the channel select bits in the top of the texture offset */
2308   if (instr->op == nir_texop_tg4) {
2309      if (instr->component == 1 &&
2310          (key_tex->gather_channel_quirk_mask & (1 << texture))) {
2311         /* gather4 sampler is broken for green channel on RG32F --
2312          * we must ask for blue instead.
2313          */
2314         constant_offset |= 2 << 16;
2315      } else {
2316         constant_offset |= instr->component << 16;
2317      }
2318   }
2319
2320   ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op);
2321
2322   emit_texture(op, dest, dest_type, coordinate, instr->coord_components,
2323                shadow_comparator,
2324                lod, lod2, sample_index,
2325                constant_offset, offset_value, mcs,
2326                texture, texture_reg, sampler_reg);
2327}
2328
2329void
2330vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr)
2331{
2332   nir_ssa_values[instr->def.index] =
2333      dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32)));
2334}
2335
2336/* SIMD4x2 64bit data is stored in register space like this:
2337 *
2338 * r0.0:DF  x0 y0 z0 w0
2339 * r1.0:DF  x1 y1 z1 w1
2340 *
2341 * When we need to write data such as this to memory using 32-bit write
2342 * messages we need to shuffle it in this fashion:
2343 *
2344 * r0.0:DF  x0 y0 x1 y1 (to be written at base offset)
 * r1.0:DF  z0 w0 z1 w1 (to be written at base offset + 16)
2346 *
2347 * We need to do the inverse operation when we read using 32-bit messages,
 * which we can do by applying the exact same shuffling to the 64-bit data we
 * read; the only difference is that, because the data for each vertex is
 * positioned differently, we need to apply different channel enables.
2351 *
2352 * This function takes 64bit data and shuffles it as explained above.
2353 *
 * The @for_write parameter specifies whether the shuffling is being done on
 * proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit
 * write message (for_write = true), or whether we are doing the inverse
 * operation and have just read 64-bit data using 32-bit messages that we
 * need to shuffle into valid SIMD4x2 64-bit data (for_write = false).
2359 *
2360 * If @block and @ref are non-NULL, then the shuffling is done after @ref,
2361 * otherwise the instructions are emitted normally at the end. The function
2362 * returns the last instruction inserted.
2363 *
2364 * Notice that @src and @dst cannot be the same register.
2365 */
2366vec4_instruction *
2367vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
2368                                 bblock_t *block, vec4_instruction *ref)
2369{
2370   assert(type_sz(src.type) == 8);
2371   assert(type_sz(dst.type) == 8);
2372   assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
2373   assert(!ref == !block);
2374
2375   const vec4_builder bld = !ref ? vec4_builder(this).at_end() :
2376                                   vec4_builder(this).at(block, ref->next);
2377
2378   /* Resolve swizzle in src */
2379   vec4_instruction *inst;
2380   if (src.swizzle != BRW_SWIZZLE_XYZW) {
2381      dst_reg data = dst_reg(this, glsl_type::dvec4_type);
2382      inst = bld.MOV(data, src);
2383      src = src_reg(data);
2384   }
2385
2386   /* dst+0.XY = src+0.XY */
2387   inst = bld.group(4, 0).MOV(writemask(dst, WRITEMASK_XY), src);
2388
2389   /* dst+0.ZW = src+1.XY */
2390   inst = bld.group(4, for_write ? 1 : 0)
2391             .MOV(writemask(dst, WRITEMASK_ZW),
2392                  swizzle(byte_offset(src, REG_SIZE), BRW_SWIZZLE_XYXY));
2393
2394   /* dst+1.XY = src+0.ZW */
2395   inst = bld.group(4, for_write ? 0 : 1)
2396            .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY),
2397                 swizzle(src, BRW_SWIZZLE_ZWZW));
2398
2399   /* dst+1.ZW = src+1.ZW */
2400   inst = bld.group(4, 1)
2401             .MOV(writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW),
2402                 byte_offset(src, REG_SIZE));
2403
2404   return inst;
2405}
2406
2407}
2408