1#
2# Copyright (C) 2014 Connor Abbott
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24#    Connor Abbott (cwabbott0@gmail.com)
25
26
27# Class that represents all the information we have about the opcode
28# NOTE: this must be kept in sync with nir_op_info
29
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - output_size is an integer between 0 and 4
      - input_sizes is a list of integers between 0 and 4, one per input;
        when output_size is non-zero every input size must be non-zero too
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types, the same length as input_sizes
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from  NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.

      All arguments are validated with assertions, so a malformed opcode
      description raises AssertionError.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_types, list)
      # Check every element rather than only the first one, so a bad entry
      # later in either list is rejected here instead of slipping through.
      assert all(isinstance(size, int) for size in input_sizes)
      assert all(isinstance(ty, str) for ty in input_types)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 4
      for size in input_sizes:
         assert 0 <= size <= 4
         # A sized output cannot be combined with a zero-sized input.
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
88
89# helper variables for strings
# Unsized base types
tfloat, tint, tbool, tuint = "float", "int", "bool32", "uint"
# Types with an explicit bit size
tfloat32, tint32, tuint32 = "float32", "int32", "uint32"
tint64, tuint64, tfloat64 = "int64", "uint64", "float64"

# Algebraic property tokens; each carries a trailing space so that several
# of them can be joined with "+" into one space-separated string.
commutative, associative = "commutative ", "associative "

# global dictionary of opcodes
opcodes = dict()
106
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
   """Build an Opcode from the arguments and store it in the global opcodes
   dictionary, keyed by name.

   The name must not already be registered; the remaining arguments are
   passed through to the Opcode constructor unchanged.
   """
   assert name not in opcodes
   info = Opcode(name=name,
                 output_size=output_size,
                 output_type=output_type,
                 input_sizes=input_sizes,
                 input_types=input_types,
                 algebraic_properties=algebraic_properties,
                 const_expr=const_expr)
   opcodes[name] = info
112
def unop_convert(name, out_type, in_type, const_expr):
   """Register a one-input opcode whose result type may differ from its
   source type.  Output and input sizes are both passed as 0 and no
   algebraic properties are set.
   """
   opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
          input_types=[in_type], algebraic_properties="",
          const_expr=const_expr)
115
def unop(name, ty, const_expr):
   """Register a one-input opcode whose source and result share the type ty.
   Output and input sizes are both passed as 0 and no algebraic properties
   are set.
   """
   src_types = [ty]
   opcode(name, 0, ty, [0], src_types, "", const_expr)
118
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Register a one-input opcode with explicitly given output and input
   sizes and types.  No algebraic properties are set.
   """
   src_sizes = [input_size]
   src_types = [input_type]
   opcode(name, output_size, output_type, src_sizes, src_types, "",
          const_expr)
123
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Register the name2, name3 and name4 variants of a reduction over the
   components of a single 2-, 3- or 4-component source.

   - prereduce_expr is a format string applied to each component; its
     {src} field receives "src0.x", "src0.y", ... and the result is wrapped
     in parentheses
   - reduce_expr is a format string with {src0} and {src1} fields that
     combines two already-reduced terms
   - final_expr is a format string whose {src} field receives the fully
     reduced expression wrapped in parentheses
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(reduced):
      return final_expr.format(src="(" + reduced + ")")

   # One parenthesized pre-reduced term per source component.
   x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                 for comp in "xyzw"]

   xy = combine(x, y)
   unop_horiz(name + "2", output_size, output_type, 2, input_type,
              finish(xy))
   unop_horiz(name + "3", output_size, output_type, 3, input_type,
              finish(combine(xy, z)))
   # The 4-wide form reduces the two halves first and then combines them.
   unop_horiz(name + "4", output_size, output_type, 4, input_type,
              finish(combine(xy, combine(z, w))))
142
143
# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

# Basic unary arithmetic.  Where a float opcode needs different C literals
# or math functions for doubles, its expression branches on bit_size == 64,
# giving the double form first and the float form second.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# fnot yields 1.0 when src0 is zero and 0.0 otherwise.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
# fsign/isign yield 0 for zero, 1 for positive and -1 for negative sources.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps to [0, 1]: above 1 gives 1, at or below 0 gives 0.
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): exp2f/log2f are used with no bit_size check - confirm that
# single precision is intended for 64-bit sources as well.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
168
169# Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat]:
   # Floats convert to every numeric base type; integers convert to float
   # or to another width of their own signedness.
   if src_t == tfloat:
      dst_types = [tint, tuint, tfloat]
   else:
      dst_types = [tfloat, src_t]

   for dst_t in dst_types:
      # Float destinations have no 8-bit variant.
      bit_sizes = [16, 32, 64] if dst_t == tfloat else [8, 16, 32, 64]
      for bit_size in bit_sizes:
         # e.g. "f2i32": first letter of each type plus the destination width
         conv_name = src_t[0] + "2" + dst_t[0] + str(bit_size)
         sized_dst_t = dst_t + str(bit_size)
         if bit_size == 16 and src_t == tfloat and dst_t == tfloat:
            # float -> float16 gets one opcode per rounding mode suffix
            rnd_modes = ['rtne', 'rtz', 'undef']
            for rnd_mode in rnd_modes:
               unop_convert(conv_name + "_" + rnd_mode, sized_dst_t, src_t,
                            "src0")
         else:
            unop_convert(conv_name, sized_dst_t, src_t, "src0")
191
# We'll hand-code the to/from bool conversion opcodes.  Because bool doesn't
# have multiple bit-sizes, we can always infer the size from the other type.
unop_convert("f2b", tbool, tfloat, "src0 != 0.0")
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2f", tfloat, tbool, "src0 ? 1.0 : 0.0")
unop_convert("b2i", tint, tbool, "src0 ? 1 : 0")


# Unary floating-point rounding operations.  Each one picks the double or
# float flavour of the C function based on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
# ffract is src0 minus its floor.
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Sources whose magnitude is below 2^-14 become a zero carrying the sign of
# src0; everything else is round-tripped through _mesa_float_to_half and
# _mesa_half_to_float.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
216
217
218# Partial derivatives.
219
220
# Every flavour of partial derivative folds to the same constant
# expression: the derivative of a constant is 0.
for deriv_op in ("fddx", "fddy",
                 "fddx_fine", "fddy_fine",
                 "fddx_coarse", "fddy_coarse"):
   unop(deriv_op, tfloat, "0.0")
227
228
229# Floating point pack and unpack operations.
230
def pack_2x16(fmt):
   """Register the pack_<fmt>_2x16 opcode: two float32 components in, one
   uint32 out.

   src0.x is packed into the low 16 bits and src0.y into the high 16 bits
   using pack_<fmt>_1x16; every "fmt" in the C template is replaced by the
   fmt argument.
   """
   const_expr = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt)
   op_name = "pack_" + fmt + "_2x16"
   unop_horiz(op_name, 1, tuint32, 2, tfloat32, const_expr)
236
def pack_4x8(fmt):
   """Register the pack_<fmt>_4x8 opcode: four float32 components in, one
   uint32 out.

   Components x, y, z, w are packed with pack_<fmt>_1x8 into bits 0-7,
   8-15, 16-23 and 24-31 respectively; every "fmt" in the C template is
   replaced by the fmt argument.
   """
   const_expr = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt)
   op_name = "pack_" + fmt + "_4x8"
   unop_horiz(op_name, 1, tuint32, 4, tfloat32, const_expr)
244
def unpack_2x16(fmt):
   """Register the unpack_<fmt>_2x16 opcode: one uint32 in, two float32
   components out.

   dst.x is unpack_<fmt>_1x16 of the low 16 bits of src0.x and dst.y is
   unpack_<fmt>_1x16 of the high 16 bits; every "fmt" in the C template is
   replaced by the fmt argument.
   """
   # The high half has to be shifted *down* before the uint16_t cast: a left
   # shift by 16 always leaves zero in the low 16 bits, so dst.y would be
   # computed from 0 regardless of the source.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
250
def unpack_4x8(fmt):
   """Register the unpack_<fmt>_4x8 opcode: one uint32 in, four float32
   components out.

   Components x, y, z, w are produced by unpack_<fmt>_1x8 from bits 0-7,
   8-15, 16-23 and 24-31 of src0.x respectively; every "fmt" in the C
   template is replaced by the fmt argument.
   """
   const_expr = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt)
   op_name = "unpack_" + fmt + "_4x8"
   unop_horiz(op_name, 4, tfloat32, 1, tuint32, const_expr)
258
259
# Register the pack opcodes first, then the unpack opcodes.  snorm and
# unorm exist in both 2x16 and 4x8 layouts; half only has a 2x16 layout.
for make_2x16, make_4x8 in ((pack_2x16, pack_4x8),
                            (unpack_2x16, unpack_4x8)):
   for norm_fmt in ("snorm", "unorm"):
      make_2x16(norm_fmt)
      make_4x8(norm_fmt)
   make_2x16("half")
270
# Packs two uint32 components into one: the low 16 bits of x, with y shifted
# into the upper half.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Packs four uint32 components into one, each shifted to its own byte
# position.
# NOTE(review): unlike pack_uvec2_to_uint, no component is masked before
# being OR-ed in - confirm the sources are expected to fit in 8 bits.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# x becomes the low 32 bits and y the high 32 bits of the 64-bit result.
unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

# Inverse of the above: x receives the low half, y the high half.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


# These have no "dst" in the expression, so the assignment to dst is added
# automatically (see the Opcode constructor documentation).
unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

# Per-half forms of unpack_64_2x32: the uint32 result is the source value
# (split_x) or the source shifted right by 32 (split_y).
unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
298
299# Bit operations, part of ARB_gpu_shader5.
300
301
# Mirrors the 32 bits of the source: bit N of src0 lands in bit 31 - N.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the 32 bits of the source.
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit, scanning down from bit 31; -1 if no bit is
# set.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant: for negative sources the highest 0 bit is reported, for
# non-negative sources the highest 1 bit; -1 if there is no such bit.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit, scanning up from bit 0; -1 if no bit is set.
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
349
350
# Register fnoise<i>_<j> for every output size i and input size j in 1..4.
# All of them fold to the constant 0.0f.  range() is used instead of
# xrange() so that the script runs under both Python 2 and Python 3; for a
# four-element sequence the difference is immaterial.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
354
def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Register a two-input opcode whose sources both have type in_type and
   whose result has type out_type.  All sizes are passed as 0.
   """
   src_types = [in_type] * 2
   opcode(name, 0, out_type, [0, 0], src_types, alg_props, const_expr)
357
def binop(name, ty, alg_props, const_expr):
   """Register a two-input opcode whose sources and result all share the
   type ty.
   """
   binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
360
def binop_compare(name, ty, alg_props, const_expr):
   """Register a two-input opcode that takes sources of type ty and produces
   a result of type tbool.
   """
   binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
363
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Register a two-input opcode with explicitly given sizes and types for
   the output and for each source.  No algebraic properties are set.
   """
   src_sizes = [src1_size, src2_size]
   src_types = [src1_type, src2_type]
   opcode(name, out_size, out_type, src_sizes, src_types, "", const_expr)
368
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
   """Register the name2, name3 and name4 variants of a component-wise
   combine-then-reduce over two sources of equal width (2, 3 or 4).

   - prereduce_expr is a format string with {src0} and {src1} fields that
     receives matching components ("src0.x"/"src1.x", ...); its result is
     wrapped in parentheses
   - reduce_expr is a format string with {src0} and {src1} fields that
     combines two already-reduced terms
   - final_expr is a format string whose {src} field receives the fully
     reduced expression wrapped in parentheses

   Every variant is registered with the commutative property.
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def register(width, reduced):
      opcode(name + str(width), output_size, output_type,
             [width, width], [src_type, src_type], commutative,
             final_expr.format(src="(" + reduced + ")"))

   # One parenthesized pre-reduced term per component pair.
   x, y, z, w = ["(" + prereduce_expr.format(src0="src0." + comp,
                                             src1="src1." + comp) + ")"
                 for comp in "xyzw"]

   xy = combine(x, y)
   register(2, xy)
   register(3, combine(xy, z))
   # The 4-wide form reduces the two halves first and then combines them.
   register(4, combine(xy, combine(z, w)))
390
# Addition, subtraction and multiplication.  Addition and multiplication
# are flagged commutative and associative; subtraction has no properties.
binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# Integer division folds to 0 when the divisor is 0, so the constant
# expression never divides by zero.  Float division has no such guard.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# Like integer division, the integer modulo opcodes fold to 0 for a zero
# divisor.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod keeps the C remainder when it is zero or when both operands have the
# same sign; otherwise it adds src1 to the remainder.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient with floorf, frem with truncf.
# NOTE(review): both use the float functions with no bit_size check -
# confirm this is intended for 64-bit sources.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
437#
438# Comparisons
439#
440
441
442# these integer-aware comparisons return a boolean (0 or ~0)
443
# Ordered comparisons carry no algebraic properties; equality and
# inequality are flagged commutative.
binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

# Each call registers the 2-, 3- and 4-component variants.  The "all"
# forms AND the per-component results together, the "any" forms OR them.
binop_reduce("ball_fequal",  1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal",  1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

# NOTE(review): sge is declared with the unsized tfloat while slt, seq and
# sne use tfloat32 - confirm whether that difference is intentional.
binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
480
481
# Shifts: the value being shifted uses the opcode's own type while the shift
# count is always a uint32.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component multiply, summed.  Registers fdot2, fdot3 and
# fdot4 with a 1-component result.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same expressions as fdot, registered with an output size of 4.
binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

# Dot product of a 3-component src0 with the first three components of a
# 4-component src1, plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum / maximum.  The integer forms are flagged commutative and
# associative; the float forms carry no algebraic properties.
# NOTE(review): fminf/fmaxf are used with no bit_size check - confirm this
# is intended for 64-bit sources.
binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
526
# The opcodes below treat each 32-bit source as four 8-bit lanes and walk
# the lanes in a loop, i being the bit offset of the current lane.

# Saturated vector add for 4 8bit ints.
# Each lane sum is clamped to 0xff before being shifted back into place.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane is left at 0 unless src0's lane is strictly greater than src1's.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
# NOTE(review): this is flagged associative although each lane uses a
# truncating integer division - confirm the property is intended.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
571
# pow() is the double-precision C function and powf() the single-precision
# one, so the 64-bit case must use pow(), matching how the other
# bit_size-dependent opcodes in this file (fsqrt, fsin, ...) are written.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
573
# Packs two float32 scalars as halves: src0 in the low 16 bits, src1 in the
# high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Combines two uint32 values into a uint64: src0 is the low half, src1 the
# high half.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments is 32.
# The constant expression yields 0 for every out-of-range combination and
# otherwise a mask of "bits" ones starting at bit "offset".
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# Scales src0 by two to the power of the int32 src1.  Any result for which
# isnormal() is false is replaced by a zero carrying the sign of src0.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects which byte of src0; the cast picks zero- or sign-extension.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# src1 selects which 16-bit word of src0.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
612
613
def triop(name, ty, const_expr):
   """Register a three-input opcode whose sources and result all share the
   type ty.  All sizes are passed as 0 and no algebraic properties are set.
   """
   src_sizes = [0] * 3
   src_types = [ty] * 3
   opcode(name, 0, ty, src_sizes, src_types, "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-input opcode with explicitly given sizes.  The output
   and all three sources have type tuint; no algebraic properties are set.
   """
   src_sizes = [src1_size, src2_size, src3_size]
   src_types = [tuint] * 3
   opcode(name, output_size, tuint, src_sizes, src_types, "", const_expr)
620
# Fused multiply-add, written here as a plain multiply followed by an add.
triop("ffma", tfloat, "src0 * src1 + src2")

# Linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
# bcsel cannot use triop because its condition has a different type (tbool)
# from the two values being selected (tuint).
opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")
635
# SM5 bfi assembly
# "insert" is shifted left once for every trailing zero bit of the mask, then
# merged into "base" under the mask.  A zero mask returns base unchanged.
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
# Both extract "bits" bits of base starting at "offset".  When the field
# would run past bit 31 the result is simply base >> offset.  ubfe shifts an
# unsigned base, ibfe a signed one.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
# Unlike ubfe, a field that extends past bit 31 folds to 0.  The mask is
# built with 1ull so that bits == 32 does not overflow the shift.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed bitfield extract: takes "bits" bits of base starting at "offset"
# and sign-extends them.  The field is first shifted left so that its top
# bit lands in bit 31, then shifted right by (32 - bits) so that its lowest
# bit lands in bit 0.  Shifting right by "offset" instead would only be
# correct when offset == 32 - bits.  This matches the shift used by ibfe.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
706
# Combines the first component of each input to make a 3-component vector.
# Output size is 3 and each of the three sources has size 1.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
714
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-input opcode with explicitly given sizes.  The output
   and all four sources have type tuint; no algebraic properties are set.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * 4
   opcode(name, output_size, tuint, src_sizes, src_types, "", const_expr)
721
# Replaces "bits" bits of base, starting at "offset", with the low bits of
# insert.  bits == 0 returns base unchanged; negative or out-of-range
# arguments fold to 0.  The mask is built with 1ull so that bits == 32 does
# not overflow the shift.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
742
743
744