1#! /usr/bin/env python
2#
3# Copyright (C) 2014 Connor Abbott
4#
5# Permission is hereby granted, free of charge, to any person obtaining a
6# copy of this software and associated documentation files (the "Software"),
7# to deal in the Software without restriction, including without limitation
8# the rights to use, copy, modify, merge, publish, distribute, sublicense,
9# and/or sell copies of the Software, and to permit persons to whom the
10# Software is furnished to do so, subject to the following conditions:
11#
12# The above copyright notice and this permission notice (including the next
13# paragraph) shall be included in all copies or substantial portions of the
14# Software.
15#
16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22# IN THE SOFTWARE.
23#
24# Authors:
25#    Connor Abbott (cwabbott0@gmail.com)
26
27
28# Class that represents all the information we have about the opcode
29# NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
   """Everything we know about a single opcode.

   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - output_size and every entry of input_sizes is an integer from 0 to 4;
        when output_size is non-zero no input size may be 0
      - all types are strings that get nir_type_ prepended to them
      - input_sizes and input_types are non-empty lists of equal length with
        one entry per input
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from  NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.

      Every argument is validated with assert, so a malformed opcode
      description raises AssertionError.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_types, list)
      # Reject empty lists explicitly instead of tripping an IndexError while
      # peeking at element 0.
      assert input_sizes and input_types
      # Validate every entry, not just the first one, so that a bad type
      # further down the list cannot slip through.
      assert all(isinstance(size, int) for size in input_sizes)
      assert all(isinstance(ty, str) for ty in input_types)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 4
      for size in input_sizes:
         assert 0 <= size <= 4
         # A sized output may not be combined with a size-0 input.
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
89
# Shorthand names for the type strings handed to Opcode.  The unsized names
# come first, followed by the explicitly sized 32-bit and 64-bit ones.
tfloat, tint, tbool, tuint = "float", "int", "bool32", "uint"
tfloat32, tint32, tuint32 = "float32", "int32", "uint32"
tuint64, tfloat64 = "uint64", "float64"

# Algebraic property tokens.  Each one ends in a space so that several of
# them can be joined with "+" into one space-separated string.
commutative = "commutative "
associative = "associative "

# Global table of every registered opcode, keyed by opcode name.
opcodes = dict()
106
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
   """Build an Opcode from the arguments and file it in the global table.

   The arguments are forwarded unchanged to the Opcode constructor.  A name
   may only be registered once; registering it again trips an assertion.
   """
   assert name not in opcodes
   description = Opcode(name, output_size, output_type, input_sizes,
                        input_types, algebraic_properties, const_expr)
   opcodes[name] = description
112
def unop_convert(name, out_type, in_type, const_expr):
   """Register a one-input opcode whose source and result types may differ.

   Both the output size and the single input size are passed as 0, and no
   algebraic properties are set.
   """
   opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
          input_types=[in_type], algebraic_properties="",
          const_expr=const_expr)
115
def unop(name, ty, const_expr):
   """Register a one-input opcode whose source and result share the type ty.

   Both the output size and the single input size are passed as 0, and no
   algebraic properties are set.
   """
   opcode(name, output_size=0, output_type=ty, input_sizes=[0],
          input_types=[ty], algebraic_properties="", const_expr=const_expr)
118
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Register a one-input opcode with explicit output and input sizes.

   No algebraic properties are set.
   """
   opcode(name, output_size, output_type, input_sizes=[input_size],
          input_types=[input_type], algebraic_properties="",
          const_expr=const_expr)
123
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Register the 2-, 3- and 4-component variants of a reducing unary opcode.

   The constant expression of each variant is assembled from three format
   strings:

   - prereduce_expr is applied to each source component via {src} and the
     result is wrapped in parentheses
   - reduce_expr combines two partial results via {src0} and {src1}
   - final_expr receives the parenthesised reduction via {src}

   The opcodes are registered as name + "2", name + "3" and name + "4".
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   # One parenthesised pre-reduced term per component of src0.
   x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                 for comp in ("x", "y", "z", "w")]

   xy = combine(x, y)
   reductions = (
      (2, xy),
      (3, combine(xy, z)),
      (4, combine(xy, combine(z, w))),
   )
   for width, reduced in reductions:
      unop_horiz(name + str(width), output_size, output_type, width,
                 input_type, final_expr.format(src="(" + reduced + ")"))
142
143
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Arithmetic negation and bitwise / logical inversion.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 when the source compares equal to zero and 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# Sign functions: 0 for a zero source, 1 for a positive one, otherwise -1.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
# Absolute value.
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
# fsat maps values above 1 to 1 and values at or below 0 to 0.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
# Reciprocal, reciprocal square root and square root.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): unlike their neighbours these two always call the
# single-precision C functions regardless of bit_size -- confirm that 64-bit
# sources are not expected here.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
# Numeric conversions.  The constant expression is just "src0" because the
# conversion follows from the declared source and destination types.
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
# Float-to-boolean and double-to-boolean conversion (true when non-zero)
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint32, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
# Conversions between single and double precision
unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
187
# Unary floating-point rounding operations.  Each one selects the double or
# the float variant of the C function based on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# Fractional part: the source minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Sources with magnitude below 2^-14 become a zero carrying the source's sign;
# everything else goes through _mesa_float_to_half and back (presumably a
# half-precision round trip -- the helpers are defined outside this file).
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.  When constant-folded they are all zero.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
215
216
217# Floating point pack and unpack operations.
218
def pack_2x16(fmt):
   """Register pack_<fmt>_2x16: two float32 components into one uint32.

   Component x lands in the low 16 bits and component y in the high 16 bits.
   Every occurrence of "fmt" in the C template is replaced by the format
   name, so the per-component helper is pack_<fmt>_1x16.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
   opcode_name = "pack_{0}_2x16".format(fmt)
   unop_horiz(opcode_name, 1, tuint32, 2, tfloat32,
              template.replace("fmt", fmt))
224
def pack_4x8(fmt):
   """Register pack_<fmt>_4x8: four float32 components into one uint32.

   Components x, y, z and w occupy successive bytes starting at the low
   byte.  Every occurrence of "fmt" in the C template is replaced by the
   format name, so the per-component helper is pack_<fmt>_1x8.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
   opcode_name = "pack_{0}_4x8".format(fmt)
   unop_horiz(opcode_name, 1, tuint32, 4, tfloat32,
              template.replace("fmt", fmt))
232
def unpack_2x16(fmt):
   """Register unpack_<fmt>_2x16: one uint32 into two float32 components.

   dst.x is decoded from the low 16 bits of the source and dst.y from the
   high 16 bits.  Every occurrence of "fmt" in the C template is replaced by
   the format name, so the per-component helper is unpack_<fmt>_1x16.
   """
   # The high half must be shifted *down* into the low 16 bits before the
   # uint16_t cast; shifting left would leave the low 16 bits all zero and
   # make dst.y independent of the input.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
238
def unpack_4x8(fmt):
   """Register unpack_<fmt>_4x8: one uint32 into four float32 components.

   Components x, y, z and w are decoded from successive bytes of the source
   starting at the low byte.  Every occurrence of "fmt" in the C template is
   replaced by the format name, so the per-component helper is
   unpack_<fmt>_1x8.
   """
   template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
   opcode_name = "unpack_{0}_4x8".format(fmt)
   unop_horiz(opcode_name, 4, tfloat32, 1, tuint32,
              template.replace("fmt", fmt))
246
247
# Instantiate the pack opcodes: the normalized formats get both a 2x16 and a
# 4x8 variant, while "half" only has a 2x16 variant.
for _fmt in ("snorm", "unorm"):
   pack_2x16(_fmt)
   pack_4x8(_fmt)
pack_2x16("half")

# Same set of formats for the unpack direction.
for _fmt in ("snorm", "unorm"):
   unpack_2x16(_fmt)
   unpack_4x8(_fmt)
unpack_2x16("half")
258
# Integer vector packing.  The low 16 bits of x are combined with y shifted
# into the upper half of the word.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# NOTE(review): the components are not masked to 8 bits before shifting, so
# any high bits of x, y or z would spill into the neighbouring fields --
# confirm that callers guarantee each component already fits in a byte.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Join two 32-bit words into one 64-bit value; x supplies the low half and
# y the high half.
unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

# Split a 64-bit value into its low (x) and high (y) 32-bit words.
unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.  Each opcode decodes just one
# half of the packed word: _x the low 16 bits, _y the high 16 bits.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# Low (_x) and high (_y) 32-bit word of a 64-bit source.
unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")
286
# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of the source: bit N of the input becomes bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Count how many of the 32 source bits are set.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")
303
# Index of the most significant set bit of an unsigned value, or -1 when no
# bit is set.  The scan has to run all the way down to bit 0; stopping at
# bit 1 would report -1 for an input of 1 instead of 0.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
313
# Signed variant of find-most-significant-bit: scanning from bit 31 down, it
# reports the first bit that differs from the sign (the first 1 bit for
# non-negative sources, the first 0 bit for negative ones), or -1 if there is
# no such bit.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
337
338
# Register fnoise<i>_<j> for every output size i and input size j from 1
# to 4; all of them constant-fold to 0.0f.  range() is used instead of
# xrange() so that the script runs under both Python 2 and Python 3.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
342
def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Register a two-input opcode whose result type may differ from its sources.

   Both sources share in_type.  The output size and both input sizes are
   passed as 0; alg_props is forwarded as the algebraic properties string.
   """
   opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
          input_types=[in_type, in_type], algebraic_properties=alg_props,
          const_expr=const_expr)
345
def binop(name, ty, alg_props, const_expr):
   """Register a two-input opcode whose sources and result all have type ty."""
   binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
348
def binop_compare(name, ty, alg_props, const_expr):
   """Register a two-input opcode with sources of type ty and a tbool result."""
   binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
351
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Register a two-input opcode with explicit sizes and types per operand.

   No algebraic properties are set.
   """
   source_sizes = [src1_size, src2_size]
   source_types = [src1_type, src2_type]
   opcode(name, out_size, out_type, source_sizes, source_types, "",
          const_expr)
356
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
   """Register the 2-, 3- and 4-component variants of a reducing binary opcode.

   The constant expression of each variant is assembled from three format
   strings:

   - prereduce_expr pairs up matching components of the two sources via
     {src0} and {src1}; the result is wrapped in parentheses
   - reduce_expr combines two partial results via {src0} and {src1}
   - final_expr receives the parenthesised reduction via {src}

   The opcodes are registered as name + "2", name + "3" and name + "4",
   each marked commutative and taking two sources of type src_type.
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   # One parenthesised pre-reduced term per component pair.
   x, y, z, w = ["(" + prereduce_expr.format(src0="src0." + comp,
                                             src1="src1." + comp) + ")"
                 for comp in ("x", "y", "z", "w")]

   xy = combine(x, y)
   reductions = (
      (2, xy),
      (3, combine(xy, z)),
      (4, combine(xy, combine(z, w))),
   )
   for width, reduced in reductions:
      opcode(name + str(width), output_size, output_type,
             [width, width], [src_type, src_type], commutative,
             final_expr.format(src="(" + reduced + ")"))
378
# Addition and subtraction.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# NOTE(review): unlike umod/irem/imod below, the integer divisions do not
# guard against a zero divisor -- confirm this is handled elsewhere.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src0 / src1")
binop("udiv", tuint, "", "src0 / src1")

# yields 1 when the addition of the two unsigned arguments carries out and
# 0 otherwise.  Note that the result is typed uint, not bool.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# yields 1 when the subtraction of the two unsigned arguments needs a borrow
# and 0 otherwise.  Note that the result is typed uint, not bool.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Unsigned remainder; a zero divisor gives 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod adds src1 back onto the C remainder when the remainder is non-zero
# and the operands have different signs.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# Floating-point counterparts: fmod rounds the quotient down (floorf), frem
# rounds it toward zero (truncf).
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
424
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints
#
# "ball" variants are true only when every component pair compares equal;
# "bany" variants are true when at least one component pair differs.

binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
468
469
# Shifts: ishr operates on the signed type, ushr on the unsigned type.
binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
binop("ushr", tuint, "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: multiply matching components and sum the products.  The
# _replicated variant is registered with an output size of 4 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Dot product of a 3-component src0 with the xyz of src1, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum and maximum.
# NOTE(review): the float variants carry no algebraic properties and always
# call the single-precision fminf/fmaxf regardless of bit_size -- confirm
# both are intentional.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
514
# The following opcodes treat each 32-bit source as four 8-bit lanes and
# process the lanes independently, stepping the shift amount by 8.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane is left at 0 unless src0's lane is strictly greater than src1's.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
559
# Power function.  64-bit sources must use the double-precision pow() and
# everything else the single-precision powf(), matching how the other
# bit_size-dependent opcodes in this file pick their C function.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
561
# Pack two floats as half-floats into one word: src0 in the low 16 bits and
# src1 in the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Join two 32-bit words into a 64-bit value: src0 is the low half and src1
# the high half.
binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
#
# For in-range arguments the result is a mask of `bits` one-bits starting at
# bit `offset`; out-of-range arguments constant-fold to 0.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# ldexp: scale src0 by two to the power src1.
# NOTE(review): isnormal() is also false for zero, infinities and NaN, so
# those results are replaced by a signed zero too, and the replacement uses
# the float copysignf() even for 64-bit sources -- confirm this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction: src1 selects which byte of src0 to return.  The _u8
# variant casts through uint8_t and the _i8 variant through int8_t.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: src1 selects which 16-bit word of src0 to return.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
600
601
def triop(name, ty, const_expr):
   """Register a three-input opcode whose sources and result all have type ty.

   The output size and all three input sizes are passed as 0, and no
   algebraic properties are set.
   """
   opcode(name, output_size=0, output_type=ty, input_sizes=[0] * 3,
          input_types=[ty] * 3, algebraic_properties="",
          const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-input opcode with explicit sizes for each operand.

   The result and all three sources are typed tuint, and no algebraic
   properties are set.
   """
   source_sizes = [src1_size, src2_size, src3_size]
   opcode(name, output_size, tuint, source_sizes, [tuint] * 3, "",
          const_expr)
608
# Fused multiply-add: src0 * src1 + src2.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")
623
# SM5 bfi assembly
#
# `insert` is shifted left until it lines up with the lowest set bit of
# `mask`, then merged into `base` under that mask.  A zero mask returns base
# unchanged.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# Extract `bits` bits of `base` starting at bit `offset`.  ubfe works on an
# unsigned base and ibfe on a signed one.  When offset + bits reaches 32 or
# more the result is simply base >> offset.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
#
# Unsigned variant: shift the field down to bit 0 and mask off `bits` bits.
# The mask is built from 1ull so that it stays valid when bits is 32.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): extract `bits` bits of `base` starting at
# bit `offset` and sign-extend the result.
#
# The field is first shifted left so that its top bit sits in bit 31, then
# shifted right arithmetically by (32 - bits) so that its lowest bit lands in
# bit 0.  Shifting right by `offset` instead would leave the field at the
# wrong position whenever offset != 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
694
# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
702
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-input opcode with explicit sizes for each operand.

   The result and all four sources are typed tuint, and no algebraic
   properties are set.
   """
   source_sizes = [src1_size, src2_size, src3_size, src4_size]
   opcode(name, output_size, tuint, source_sizes, [tuint] * 4, "",
          const_expr)
709
# GLSL bitfieldInsert(): replace `bits` bits of `base`, starting at bit
# `offset`, with the low `bits` bits of `insert`.
#
# - Inserting zero bits leaves base untouched, so that case returns base.
# - `insert` has to be shifted up by `offset` (not by `bits`) so that its low
#   bits line up with the mask.
# - Out-of-range offset/bits combinations constant-fold to 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
723
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
730
731
732