#
# Copyright (C) 2014 Connor Abbott
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)


# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size is an integer in [0, 4]; the per-component helpers in
          this file (unop, binop, ...) pass 0
        - input_sizes is a non-empty list of integers in [0, 4], one per
          input; an input size may only be 0 when output_size is 0
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types, the same length as input_sizes
        - algebraic_properties is a space-separated string, where nir_op_is_
          is prepended before each entry
        - const_expr is an expression or series of statements that computes
          the constant value of the opcode given the constant values of its
          inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        assert isinstance(input_sizes[0], int)
        assert isinstance(input_types, list)
        assert isinstance(input_types[0], str)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr

# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# The trailing space lets the properties be concatenated with "+".
commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Create an Opcode and register it in the global opcodes dictionary.

    The arguments are forwarded unchanged to Opcode().  Registering the same
    name twice trips an assertion.
    """
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, algebraic_properties, const_expr)

def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-input opcode (all sizes 0) whose output type may differ
    from its input type."""
    opcode(name, 0, out_type, [0], [in_type], "", const_expr)

def unop(name, ty, const_expr):
    """Register a one-input opcode (all sizes 0) whose input and output share
    the type ty."""
    opcode(name, 0, ty, [0], [ty], "", const_expr)

def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-input opcode with explicit output and input sizes."""
    opcode(name, output_size, output_type, [input_size], [input_type], "",
           const_expr)

def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register name + "2", name + "3" and name + "4", which reduce the
    components of a 2-, 3- or 4-component input.

    - prereduce_expr is a format string applied to each component; it refers
      to the component as {src}
    - reduce_expr is a format string combining two partial results, referred
      to as {src0} and {src1}
    - final_expr is a format string applied to the fully reduced value,
      referred to as {src}
    """
    def prereduce(src):
        return "(" + prereduce_expr.format(src=src) + ")"
    def final(src):
        return final_expr.format(src="(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    src0 = prereduce("src0.x")
    src1 = prereduce("src0.y")
    src2 = prereduce("src0.z")
    src3 = prereduce("src0.w")
    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               final(reduce_(src0, src1)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               final(reduce_(reduce_(src0, src1), src2)))
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))


# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")

# Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat]:
    if src_t in (tint, tuint):
        dst_types = [tfloat, src_t]
    elif src_t == tfloat:
        dst_types = [tint, tuint, tfloat]

    for dst_t in dst_types:
        if dst_t == tfloat:
            bit_sizes = [16, 32, 64]
        else:
            bit_sizes = [8, 16, 32, 64]
        for bit_size in bit_sizes:
            if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
                # float -> 16-bit float gets one opcode per rounding mode.
                rnd_modes = ['rtne', 'rtz', 'undef']
                for rnd_mode in rnd_modes:
                    unop_convert("{0}2{1}{2}_{3}".format(src_t[0], dst_t[0],
                                                         bit_size, rnd_mode),
                                 dst_t + str(bit_size), src_t, "src0")
            else:
                unop_convert("{0}2{1}{2}".format(src_t[0], dst_t[0], bit_size),
                             dst_t + str(bit_size), src_t, "src0")

# We'll hand-code the to/from bool conversion opcodes.  Because bool doesn't
# have multiple bit-sizes, we can always infer the size from the other type.
unop_convert("f2b", tbool, tfloat, "src0 != 0.0")
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2f", tfloat, tbool, "src0 ? 1.0 : 0.0")
unop_convert("b2i", tint, tbool, "src0 ? 1 : 0")


# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")


# Floating point pack and unpack operations.

def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components packed into the low
    and high 16 bits of one uint32.  Every "fmt" in the C template is
    replaced by the format name."""
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))

def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components packed into the four
    bytes of one uint32, x in the lowest byte."""
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))

def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16, the inverse of pack_<fmt>_2x16: dst.x comes
    from the low 16 bits of the uint32 and dst.y from the high 16 bits."""
    # The high half must be shifted *down* before the uint16_t cast; shifting
    # left by 16 would leave the low 16 bits zero and always unpack 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))

def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8, the inverse of pack_<fmt>_4x8: one float32
    component per byte of the uint32, x from the lowest byte."""
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")


# One fnoise opcode per (output size, input size) pair.  range() is used
# rather than xrange() so the script runs on both Python 2 and Python 3.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")

def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-input opcode (all sizes 0) whose inputs share in_type
    and whose output is out_type."""
    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)

def binop(name, ty, alg_props, const_expr):
    """Register a two-input opcode whose inputs and output all have type ty."""
    binop_convert(name, ty, ty, alg_props, const_expr)

def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-input opcode taking inputs of type ty and producing
    tbool."""
    binop_convert(name, tbool, ty, alg_props, const_expr)

def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-input opcode with explicit sizes and types for the
    output and for each input; no algebraic properties are set."""
    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
           "", const_expr)

def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register name + "2", name + "3" and name + "4", which combine two 2-,
    3- or 4-component inputs component-wise and then reduce the results.
    All three opcodes are registered as commutative.

    - prereduce_expr is a format string applied to each pair of matching
      components, referred to as {src0} and {src1}
    - reduce_expr is a format string combining two partial results, referred
      to as {src0} and {src1}
    - final_expr is a format string applied to the fully reduced value,
      referred to as {src}
    """
    def final(src):
        return final_expr.format(src= "(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    def prereduce(src0, src1):
        return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
    src0 = prereduce("src0.x", "src1.x")
    src1 = prereduce("src0.y", "src1.y")
    src2 = prereduce("src0.z", "src1.z")
    src3 = prereduce("src0.w", "src1.w")
    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], commutative,
           final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], commutative,
           final(reduce_(reduce_(src0, src1), src2)))
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], commutative,
           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))

# Binary ALU opcodes.  Each call registers one opcode together with the C
# expression used to constant-fold it.

binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce("ball_fequal", 1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


opcode("ishl", 0, tint, [0, 0], [tint, tuint32], "", "src0 << src1")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], "", "src0 >> src1")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# 64-bit operands must go through the double-precision pow(); powf() would
# round them to float first.  This matches the "bit_size == 64 ? f : ff"
# pattern used by the other floating-point opcodes in this file.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")


def triop(name, ty, const_expr):
    """Register a three-input opcode (all sizes 0) whose inputs and output
    all have type ty; no algebraic properties are set."""
    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-input tuint opcode with explicit output and input
    sizes; no algebraic properties are set."""
    opcode(name, output_size, tuint,
           [src1_size, src2_size, src3_size],
           [tuint, tuint, tuint], "", const_expr)

triop("ffma", tfloat, "src0 * src1 + src2")

triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# The left shift moves the top bit of the field to bit 31; the right shift
# must then be by (32 - bits), as in ibfe, so that the field lands in the low
# bits with its sign extended.  Shifting right by "offset" instead would leave
# the field at the wrong position for any offset other than 32 - bits.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz(
    "vec3", 3, 1, 1, 1,
    """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""",
)

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-input tuint opcode with explicit output and input
    sizes; no algebraic properties are set."""
    source_sizes = [src1_size, src2_size, src3_size, src4_size]
    source_types = [tuint for _ in source_sizes]
    opcode(name, output_size, tuint, source_sizes, source_types, "",
           const_expr)

# Constant-folding expression for bitfield_insert, kept in a named variable
# so the registration call below stays short.
bitfield_insert_expr = """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
"""
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", bitfield_insert_expr)

# Combines the first component of each input to make a 4-component vector.
quadop_horiz(
    "vec4", 4, 1, 1, 1, 1,
    """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""",
)