1 /* 2 * Copyright © 2013-2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24 #include "isl/isl.h" 25 #include "brw_fs_surface_builder.h" 26 #include "brw_fs.h" 27 28 using namespace brw; 29 30 namespace brw { 31 namespace surface_access { 32 namespace { 33 /** 34 * Generate a logical send opcode for a surface message and return 35 * the result. 36 */ 37 fs_reg emit_send(const fs_builder & bld,enum opcode opcode,const fs_reg & addr,const fs_reg & src,const fs_reg & surface,unsigned dims,unsigned arg,unsigned rsize,brw_predicate pred=BRW_PREDICATE_NONE)38 emit_send(const fs_builder &bld, enum opcode opcode, 39 const fs_reg &addr, const fs_reg &src, const fs_reg &surface, 40 unsigned dims, unsigned arg, unsigned rsize, 41 brw_predicate pred = BRW_PREDICATE_NONE) 42 { 43 /* Reduce the dynamically uniform surface index to a single 44 * scalar. 45 */ 46 const fs_reg usurface = bld.emit_uniformize(surface); 47 const fs_reg srcs[] = { 48 addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg) 49 }; 50 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize); 51 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); 52 53 inst->size_written = rsize * dst.component_size(inst->exec_size); 54 inst->predicate = pred; 55 return dst; 56 } 57 } 58 59 /** 60 * Emit an untyped surface read opcode. \p dims determines the number 61 * of components of the address and \p size the number of components of 62 * the returned value. 63 */ 64 fs_reg emit_untyped_read(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,unsigned dims,unsigned size,brw_predicate pred)65 emit_untyped_read(const fs_builder &bld, 66 const fs_reg &surface, const fs_reg &addr, 67 unsigned dims, unsigned size, 68 brw_predicate pred) 69 { 70 return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 71 addr, fs_reg(), surface, dims, size, size, pred); 72 } 73 74 /** 75 * Emit an untyped surface write opcode. \p dims determines the number 76 * of components of the address and \p size the number of components of 77 * the argument. 78 */ 79 void emit_untyped_write(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src,unsigned dims,unsigned size,brw_predicate pred)80 emit_untyped_write(const fs_builder &bld, const fs_reg &surface, 81 const fs_reg &addr, const fs_reg &src, 82 unsigned dims, unsigned size, 83 brw_predicate pred) 84 { 85 emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 86 addr, src, surface, dims, size, 0, pred); 87 } 88 89 /** 90 * Emit an untyped surface atomic opcode. \p dims determines the number 91 * of components of the address and \p rsize the number of components of 92 * the returned value (either zero or one). 93 */ 94 fs_reg emit_untyped_atomic(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src0,const fs_reg & src1,unsigned dims,unsigned rsize,unsigned op,brw_predicate pred)95 emit_untyped_atomic(const fs_builder &bld, 96 const fs_reg &surface, const fs_reg &addr, 97 const fs_reg &src0, const fs_reg &src1, 98 unsigned dims, unsigned rsize, unsigned op, 99 brw_predicate pred) 100 { 101 /* FINISHME: Factor out this frequently recurring pattern into a 102 * helper function. 103 */ 104 const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); 105 const fs_reg srcs[] = { src0, src1 }; 106 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n); 107 bld.LOAD_PAYLOAD(tmp, srcs, n, 0); 108 109 return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 110 addr, tmp, surface, dims, op, rsize, pred); 111 } 112 113 /** 114 * Emit a typed surface read opcode. \p dims determines the number of 115 * components of the address and \p size the number of components of the 116 * returned value. 117 */ 118 fs_reg emit_typed_read(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,unsigned dims,unsigned size)119 emit_typed_read(const fs_builder &bld, const fs_reg &surface, 120 const fs_reg &addr, unsigned dims, unsigned size) 121 { 122 return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, 123 addr, fs_reg(), surface, dims, size, size); 124 } 125 126 /** 127 * Emit a typed surface write opcode. \p dims determines the number of 128 * components of the address and \p size the number of components of the 129 * argument. 130 */ 131 void emit_typed_write(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src,unsigned dims,unsigned size)132 emit_typed_write(const fs_builder &bld, const fs_reg &surface, 133 const fs_reg &addr, const fs_reg &src, 134 unsigned dims, unsigned size) 135 { 136 emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, 137 addr, src, surface, dims, size, 0); 138 } 139 140 /** 141 * Emit a typed surface atomic opcode. \p dims determines the number of 142 * components of the address and \p rsize the number of components of 143 * the returned value (either zero or one). 144 */ 145 fs_reg emit_typed_atomic(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src0,const fs_reg & src1,unsigned dims,unsigned rsize,unsigned op,brw_predicate pred)146 emit_typed_atomic(const fs_builder &bld, const fs_reg &surface, 147 const fs_reg &addr, 148 const fs_reg &src0, const fs_reg &src1, 149 unsigned dims, unsigned rsize, unsigned op, 150 brw_predicate pred) 151 { 152 /* FINISHME: Factor out this frequently recurring pattern into a 153 * helper function. 154 */ 155 const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); 156 const fs_reg srcs[] = { src0, src1 }; 157 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n); 158 bld.LOAD_PAYLOAD(tmp, srcs, n, 0); 159 160 return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, 161 addr, tmp, surface, dims, op, rsize); 162 } 163 164 fs_reg emit_byte_scattered_read(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,unsigned dims,unsigned size,unsigned bit_size,brw_predicate pred)165 emit_byte_scattered_read(const fs_builder &bld, 166 const fs_reg &surface, const fs_reg &addr, 167 unsigned dims, unsigned size, 168 unsigned bit_size, brw_predicate pred) 169 { 170 return emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 171 addr, fs_reg(), surface, dims, bit_size, size, pred); 172 } 173 174 void emit_byte_scattered_write(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src,unsigned dims,unsigned size,unsigned bit_size,brw_predicate pred)175 emit_byte_scattered_write(const fs_builder &bld, const fs_reg &surface, 176 const fs_reg &addr, const fs_reg &src, 177 unsigned dims, unsigned size, 178 unsigned bit_size, brw_predicate pred) 179 { 180 emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 181 addr, src, surface, dims, bit_size, 0, pred); 182 } 183 } 184 } 185 186 namespace { 187 namespace image_format_info { 188 /* The higher compiler layers use the GL enums for image formats even if 189 * they come in from SPIR-V or Vulkan. We need to turn them into an ISL 190 * enum before we can use them. 191 */ 192 static enum isl_format isl_format_for_gl_format(uint32_t gl_format)193 isl_format_for_gl_format(uint32_t gl_format) 194 { 195 switch (gl_format) { 196 case GL_R8: return ISL_FORMAT_R8_UNORM; 197 case GL_R8_SNORM: return ISL_FORMAT_R8_SNORM; 198 case GL_R8UI: return ISL_FORMAT_R8_UINT; 199 case GL_R8I: return ISL_FORMAT_R8_SINT; 200 case GL_RG8: return ISL_FORMAT_R8G8_UNORM; 201 case GL_RG8_SNORM: return ISL_FORMAT_R8G8_SNORM; 202 case GL_RG8UI: return ISL_FORMAT_R8G8_UINT; 203 case GL_RG8I: return ISL_FORMAT_R8G8_SINT; 204 case GL_RGBA8: return ISL_FORMAT_R8G8B8A8_UNORM; 205 case GL_RGBA8_SNORM: return ISL_FORMAT_R8G8B8A8_SNORM; 206 case GL_RGBA8UI: return ISL_FORMAT_R8G8B8A8_UINT; 207 case GL_RGBA8I: return ISL_FORMAT_R8G8B8A8_SINT; 208 case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT; 209 case GL_RGB10_A2: return ISL_FORMAT_R10G10B10A2_UNORM; 210 case GL_RGB10_A2UI: return ISL_FORMAT_R10G10B10A2_UINT; 211 case GL_R16: return ISL_FORMAT_R16_UNORM; 212 case GL_R16_SNORM: return ISL_FORMAT_R16_SNORM; 213 case GL_R16F: return ISL_FORMAT_R16_FLOAT; 214 case GL_R16UI: return ISL_FORMAT_R16_UINT; 215 case GL_R16I: return ISL_FORMAT_R16_SINT; 216 case GL_RG16: return ISL_FORMAT_R16G16_UNORM; 217 case GL_RG16_SNORM: return ISL_FORMAT_R16G16_SNORM; 218 case GL_RG16F: return ISL_FORMAT_R16G16_FLOAT; 219 case GL_RG16UI: return ISL_FORMAT_R16G16_UINT; 220 case GL_RG16I: return ISL_FORMAT_R16G16_SINT; 221 case GL_RGBA16: return ISL_FORMAT_R16G16B16A16_UNORM; 222 case GL_RGBA16_SNORM: return ISL_FORMAT_R16G16B16A16_SNORM; 223 case GL_RGBA16F: return ISL_FORMAT_R16G16B16A16_FLOAT; 224 case GL_RGBA16UI: return ISL_FORMAT_R16G16B16A16_UINT; 225 case GL_RGBA16I: return ISL_FORMAT_R16G16B16A16_SINT; 226 case GL_R32F: return ISL_FORMAT_R32_FLOAT; 227 case GL_R32UI: return ISL_FORMAT_R32_UINT; 228 case GL_R32I: return ISL_FORMAT_R32_SINT; 229 case GL_RG32F: return ISL_FORMAT_R32G32_FLOAT; 230 case GL_RG32UI: return ISL_FORMAT_R32G32_UINT; 231 case GL_RG32I: return ISL_FORMAT_R32G32_SINT; 232 case GL_RGBA32F: return ISL_FORMAT_R32G32B32A32_FLOAT; 233 case GL_RGBA32UI: return ISL_FORMAT_R32G32B32A32_UINT; 234 case GL_RGBA32I: return ISL_FORMAT_R32G32B32A32_SINT; 235 case GL_NONE: return ISL_FORMAT_UNSUPPORTED; 236 default: 237 assert(!"Invalid image format"); 238 return ISL_FORMAT_UNSUPPORTED; 239 } 240 } 241 242 /** 243 * Simple 4-tuple of scalars used to pass around per-color component 244 * values. 245 */ 246 struct color_u { color_u__anonee7e056b0211::image_format_info::color_u247 color_u(unsigned x = 0) : r(x), g(x), b(x), a(x) 248 { 249 } 250 color_u__anonee7e056b0211::image_format_info::color_u251 color_u(unsigned r, unsigned g, unsigned b, unsigned a) : 252 r(r), g(g), b(b), a(a) 253 { 254 } 255 256 unsigned operator []__anonee7e056b0211::image_format_info::color_u257 operator[](unsigned i) const 258 { 259 const unsigned xs[] = { r, g, b, a }; 260 return xs[i]; 261 } 262 263 unsigned r, g, b, a; 264 }; 265 266 /** 267 * Return the per-channel bitfield widths for a given image format. 268 */ 269 inline color_u get_bit_widths(isl_format format)270 get_bit_widths(isl_format format) 271 { 272 const isl_format_layout *fmtl = isl_format_get_layout(format); 273 274 return color_u(fmtl->channels.r.bits, 275 fmtl->channels.g.bits, 276 fmtl->channels.b.bits, 277 fmtl->channels.a.bits); 278 } 279 280 /** 281 * Return the per-channel bitfield shifts for a given image format. 282 */ 283 inline color_u get_bit_shifts(isl_format format)284 get_bit_shifts(isl_format format) 285 { 286 const color_u widths = get_bit_widths(format); 287 return color_u(0, widths.r, widths.r + widths.g, 288 widths.r + widths.g + widths.b); 289 } 290 291 /** 292 * Return true if all present components have the same bit width. 293 */ 294 inline bool is_homogeneous(isl_format format)295 is_homogeneous(isl_format format) 296 { 297 const color_u widths = get_bit_widths(format); 298 return ((widths.g == 0 || widths.g == widths.r) && 299 (widths.b == 0 || widths.b == widths.r) && 300 (widths.a == 0 || widths.a == widths.r)); 301 } 302 303 /** 304 * Return true if the format conversion boils down to a trivial copy. 305 */ 306 inline bool is_conversion_trivial(const gen_device_info * devinfo,isl_format format)307 is_conversion_trivial(const gen_device_info *devinfo, isl_format format) 308 { 309 return (get_bit_widths(format).r == 32 && is_homogeneous(format)) || 310 format == isl_lower_storage_image_format(devinfo, format); 311 } 312 313 /** 314 * Return true if the hardware natively supports some format with 315 * compatible bitfield layout, but possibly different data types. 316 */ 317 inline bool has_supported_bit_layout(const gen_device_info * devinfo,isl_format format)318 has_supported_bit_layout(const gen_device_info *devinfo, 319 isl_format format) 320 { 321 const color_u widths = get_bit_widths(format); 322 const color_u lower_widths = get_bit_widths( 323 isl_lower_storage_image_format(devinfo, format)); 324 325 return (widths.r == lower_widths.r && 326 widths.g == lower_widths.g && 327 widths.b == lower_widths.b && 328 widths.a == lower_widths.a); 329 } 330 331 /** 332 * Return true if we are required to spread individual components over 333 * several components of the format used by the hardware (RG32 and 334 * friends implemented as RGBA16UI). 335 */ 336 inline bool has_split_bit_layout(const gen_device_info * devinfo,isl_format format)337 has_split_bit_layout(const gen_device_info *devinfo, isl_format format) 338 { 339 const isl_format lower_format = 340 isl_lower_storage_image_format(devinfo, format); 341 342 return (isl_format_get_num_channels(format) < 343 isl_format_get_num_channels(lower_format)); 344 } 345 346 /** 347 * Return true if the hardware returns garbage in the unused high bits 348 * of each component. This may happen on IVB because we rely on the 349 * undocumented behavior that typed reads from surfaces of the 350 * unsupported R8 and R16 formats return useful data in their least 351 * significant bits. 352 */ 353 inline bool has_undefined_high_bits(const gen_device_info * devinfo,isl_format format)354 has_undefined_high_bits(const gen_device_info *devinfo, 355 isl_format format) 356 { 357 const isl_format lower_format = 358 isl_lower_storage_image_format(devinfo, format); 359 360 return (devinfo->gen == 7 && !devinfo->is_haswell && 361 (lower_format == ISL_FORMAT_R16_UINT || 362 lower_format == ISL_FORMAT_R8_UINT)); 363 } 364 365 /** 366 * Return true if the format represents values as signed integers 367 * requiring sign extension when unpacking. 368 */ 369 inline bool needs_sign_extension(isl_format format)370 needs_sign_extension(isl_format format) 371 { 372 return isl_format_has_snorm_channel(format) || 373 isl_format_has_sint_channel(format); 374 } 375 } 376 377 namespace image_validity { 378 /** 379 * Check whether the bound image is suitable for untyped access. 380 */ 381 static brw_predicate emit_untyped_image_check(const fs_builder & bld,const fs_reg & image,brw_predicate pred)382 emit_untyped_image_check(const fs_builder &bld, const fs_reg &image, 383 brw_predicate pred) 384 { 385 const gen_device_info *devinfo = bld.shader->devinfo; 386 const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET); 387 388 if (devinfo->gen == 7 && !devinfo->is_haswell) { 389 /* Check whether the first stride component (i.e. the Bpp value) 390 * is greater than four, what on Gen7 indicates that a surface of 391 * type RAW has been bound for untyped access. Reading or writing 392 * to a surface of type other than RAW using untyped surface 393 * messages causes a hang on IVB and VLV. 394 */ 395 set_predicate(pred, 396 bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4), 397 BRW_CONDITIONAL_G)); 398 399 return BRW_PREDICATE_NORMAL; 400 } else { 401 /* More recent generations handle the format mismatch 402 * gracefully. 403 */ 404 return pred; 405 } 406 } 407 408 /** 409 * Check whether there is an image bound at the given index and write 410 * the comparison result to f0.0. Returns an appropriate predication 411 * mode to use on subsequent image operations. 412 */ 413 static brw_predicate emit_typed_atomic_check(const fs_builder & bld,const fs_reg & image)414 emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image) 415 { 416 const gen_device_info *devinfo = bld.shader->devinfo; 417 const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); 418 419 if (devinfo->gen == 7 && !devinfo->is_haswell) { 420 /* Check the first component of the size field to find out if the 421 * image is bound. Necessary on IVB for typed atomics because 422 * they don't seem to respect null surfaces and will happily 423 * corrupt or read random memory when no image is bound. 424 */ 425 bld.CMP(bld.null_reg_ud(), 426 retype(size, BRW_REGISTER_TYPE_UD), 427 brw_imm_d(0), BRW_CONDITIONAL_NZ); 428 429 return BRW_PREDICATE_NORMAL; 430 } else { 431 /* More recent platforms implement compliant behavior when a null 432 * surface is bound. 433 */ 434 return BRW_PREDICATE_NONE; 435 } 436 } 437 438 /** 439 * Check whether the provided coordinates are within the image bounds 440 * and write the comparison result to f0.0. Returns an appropriate 441 * predication mode to use on subsequent image operations. 442 */ 443 static brw_predicate emit_bounds_check(const fs_builder & bld,const fs_reg & image,const fs_reg & addr,unsigned dims)444 emit_bounds_check(const fs_builder &bld, const fs_reg &image, 445 const fs_reg &addr, unsigned dims) 446 { 447 const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET); 448 449 for (unsigned c = 0; c < dims; ++c) 450 set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL, 451 bld.CMP(bld.null_reg_ud(), 452 offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c), 453 offset(size, bld, c), 454 BRW_CONDITIONAL_L)); 455 456 return BRW_PREDICATE_NORMAL; 457 } 458 } 459 460 namespace image_coordinates { 461 /** 462 * Return the total number of coordinates needed to address a texel of 463 * the surface, which may be more than the sum of \p surf_dims and \p 464 * arr_dims if padding is required. 465 */ 466 static unsigned num_image_coordinates(const fs_builder & bld,unsigned surf_dims,unsigned arr_dims,isl_format format)467 num_image_coordinates(const fs_builder &bld, 468 unsigned surf_dims, unsigned arr_dims, 469 isl_format format) 470 { 471 /* HSW in vec4 mode and our software coordinate handling for untyped 472 * reads want the array index to be at the Z component. 473 */ 474 const bool array_index_at_z = 475 format != ISL_FORMAT_UNSUPPORTED && 476 !isl_has_matching_typed_storage_image_format( 477 bld.shader->devinfo, format); 478 const unsigned zero_dims = 479 ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0); 480 481 return surf_dims + zero_dims + arr_dims; 482 } 483 484 /** 485 * Transform image coordinates into the form expected by the 486 * implementation. 487 */ 488 static fs_reg emit_image_coordinates(const fs_builder & bld,const fs_reg & addr,unsigned surf_dims,unsigned arr_dims,isl_format format)489 emit_image_coordinates(const fs_builder &bld, const fs_reg &addr, 490 unsigned surf_dims, unsigned arr_dims, 491 isl_format format) 492 { 493 const unsigned dims = 494 num_image_coordinates(bld, surf_dims, arr_dims, format); 495 496 if (dims > surf_dims + arr_dims) { 497 assert(surf_dims == 1 && arr_dims == 1 && dims == 3); 498 /* The array index is required to be passed in as the Z component, 499 * insert a zero at the Y component to shift it to the right 500 * position. 501 * 502 * FINISHME: Factor out this frequently recurring pattern into a 503 * helper function. 504 */ 505 const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) }; 506 const fs_reg dst = bld.vgrf(addr.type, dims); 507 bld.LOAD_PAYLOAD(dst, srcs, dims, 0); 508 return dst; 509 } else { 510 return addr; 511 } 512 } 513 514 /** 515 * Calculate the offset in memory of the texel given by \p coord. 516 * 517 * This is meant to be used with untyped surface messages to access a 518 * tiled surface, what involves taking into account the tiling and 519 * swizzling modes of the surface manually so it will hopefully not 520 * happen very often. 521 * 522 * The tiling algorithm implemented here matches either the X or Y 523 * tiling layouts supported by the hardware depending on the tiling 524 * coefficients passed to the program as uniforms. See Volume 1 Part 2 525 * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth 526 * explanation of the hardware tiling format. 527 */ 528 static fs_reg emit_address_calculation(const fs_builder & bld,const fs_reg & image,const fs_reg & coord,unsigned dims)529 emit_address_calculation(const fs_builder &bld, const fs_reg &image, 530 const fs_reg &coord, unsigned dims) 531 { 532 const gen_device_info *devinfo = bld.shader->devinfo; 533 const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET); 534 const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET); 535 const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET); 536 const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET); 537 const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 538 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 539 const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 540 const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 541 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); 542 543 /* Shift the coordinates by the fixed surface offset. It may be 544 * non-zero if the image is a single slice of a higher-dimensional 545 * surface, or if a non-zero mipmap level of the surface is bound to 546 * the pipeline. The offset needs to be applied here rather than at 547 * surface state set-up time because the desired slice-level may 548 * start mid-tile, so simply shifting the surface base address 549 * wouldn't give a well-formed tiled surface in the general case. 550 */ 551 for (unsigned c = 0; c < 2; ++c) 552 bld.ADD(offset(addr, bld, c), offset(off, bld, c), 553 (c < dims ? 554 offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) : 555 fs_reg(brw_imm_d(0)))); 556 557 /* The layout of 3-D textures in memory is sort-of like a tiling 558 * format. At each miplevel, the slices are arranged in rows of 559 * 2^level slices per row. The slice row is stored in tmp.y and 560 * the slice within the row is stored in tmp.x. 561 * 562 * The layout of 2-D array textures and cubemaps is much simpler: 563 * Depending on whether the ARYSPC_LOD0 layout is in use it will be 564 * stored in memory as an array of slices, each one being a 2-D 565 * arrangement of miplevels, or as a 2D arrangement of miplevels, 566 * each one being an array of slices. In either case the separation 567 * between slices of the same LOD is equal to the qpitch value 568 * provided as stride.w. 569 * 570 * This code can be made to handle either 2D arrays and 3D textures 571 * by passing in the miplevel as tile.z for 3-D textures and 0 in 572 * tile.z for 2-D array textures. 573 * 574 * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface 575 * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion 576 * of the hardware 3D texture and 2D array layouts. 577 */ 578 if (dims > 2) { 579 /* Decompose z into a major (tmp.y) and a minor (tmp.x) 580 * index. 581 */ 582 bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0), 583 offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2)); 584 bld.SHR(offset(tmp, bld, 1), 585 offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2), 586 offset(tile, bld, 2)); 587 588 /* Take into account the horizontal (tmp.x) and vertical (tmp.y) 589 * slice offset. 590 */ 591 for (unsigned c = 0; c < 2; ++c) { 592 bld.MUL(offset(tmp, bld, c), 593 offset(stride, bld, 2 + c), offset(tmp, bld, c)); 594 bld.ADD(offset(addr, bld, c), 595 offset(addr, bld, c), offset(tmp, bld, c)); 596 } 597 } 598 599 if (dims > 1) { 600 /* Calculate the major/minor x and y indices. In order to 601 * accommodate both X and Y tiling, the Y-major tiling format is 602 * treated as being a bunch of narrow X-tiles placed next to each 603 * other. This means that the tile width for Y-tiling is actually 604 * the width of one sub-column of the Y-major tile where each 4K 605 * tile has 8 512B sub-columns. 606 * 607 * The major Y value is the row of tiles in which the pixel lives. 608 * The major X value is the tile sub-column in which the pixel 609 * lives; for X tiling, this is the same as the tile column, for Y 610 * tiling, each tile has 8 sub-columns. The minor X and Y indices 611 * are the position within the sub-column. 612 */ 613 for (unsigned c = 0; c < 2; ++c) { 614 /* Calculate the minor x and y indices. */ 615 bld.BFE(offset(minor, bld, c), offset(tile, bld, c), 616 brw_imm_d(0), offset(addr, bld, c)); 617 618 /* Calculate the major x and y indices. */ 619 bld.SHR(offset(major, bld, c), 620 offset(addr, bld, c), offset(tile, bld, c)); 621 } 622 623 /* Calculate the texel index from the start of the tile row and 624 * the vertical coordinate of the row. 625 * Equivalent to: 626 * tmp.x = (major.x << tile.y << tile.x) + 627 * (minor.y << tile.x) + minor.x 628 * tmp.y = major.y << tile.y 629 */ 630 bld.SHL(tmp, major, offset(tile, bld, 1)); 631 bld.ADD(tmp, tmp, offset(minor, bld, 1)); 632 bld.SHL(tmp, tmp, offset(tile, bld, 0)); 633 bld.ADD(tmp, tmp, minor); 634 bld.SHL(offset(tmp, bld, 1), 635 offset(major, bld, 1), offset(tile, bld, 1)); 636 637 /* Add it to the start of the tile row. */ 638 bld.MUL(offset(tmp, bld, 1), 639 offset(tmp, bld, 1), offset(stride, bld, 1)); 640 bld.ADD(tmp, tmp, offset(tmp, bld, 1)); 641 642 /* Multiply by the Bpp value. */ 643 bld.MUL(dst, tmp, stride); 644 645 if (devinfo->gen < 8 && !devinfo->is_baytrail) { 646 /* Take into account the two dynamically specified shifts. 647 * Both need are used to implement swizzling of X-tiled 648 * surfaces. For Y-tiled surfaces only one bit needs to be 649 * XOR-ed with bit 6 of the memory address, so a swz value of 650 * 0xff (actually interpreted as 31 by the hardware) will be 651 * provided to cause the relevant bit of tmp.y to be zero and 652 * turn the first XOR into the identity. For linear surfaces 653 * or platforms lacking address swizzling both shifts will be 654 * 0xff causing the relevant bits of both tmp.x and .y to be 655 * zero, what effectively disables swizzling. 656 */ 657 for (unsigned c = 0; c < 2; ++c) 658 bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c)); 659 660 /* XOR tmp.x and tmp.y with bit 6 of the memory address. */ 661 bld.XOR(tmp, tmp, offset(tmp, bld, 1)); 662 bld.AND(tmp, tmp, brw_imm_d(1 << 6)); 663 bld.XOR(dst, dst, tmp); 664 } 665 666 } else { 667 /* Multiply by the Bpp/stride value. Note that the addr.y may be 668 * non-zero even if the image is one-dimensional because a 669 * vertical offset may have been applied above to select a 670 * non-zero slice or level of a higher-dimensional texture. 671 */ 672 bld.MUL(offset(addr, bld, 1), 673 offset(addr, bld, 1), offset(stride, bld, 1)); 674 bld.ADD(addr, addr, offset(addr, bld, 1)); 675 bld.MUL(dst, addr, stride); 676 } 677 678 return dst; 679 } 680 } 681 682 namespace image_format_conversion { 683 using image_format_info::color_u; 684 685 namespace { 686 /** 687 * Maximum representable value in an unsigned integer with the given 688 * number of bits. 689 */ 690 inline unsigned scale(unsigned n)691 scale(unsigned n) 692 { 693 return (1 << n) - 1; 694 } 695 } 696 697 /** 698 * Pack the vector \p src in a bitfield given the per-component bit 699 * shifts and widths. Note that bitfield components are not allowed to 700 * cross 32-bit boundaries. 701 */ 702 static fs_reg emit_pack(const fs_builder & bld,const fs_reg & src,const color_u & shifts,const color_u & widths)703 emit_pack(const fs_builder &bld, const fs_reg &src, 704 const color_u &shifts, const color_u &widths) 705 { 706 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); 707 bool seen[4] = {}; 708 709 for (unsigned c = 0; c < 4; ++c) { 710 if (widths[c]) { 711 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 712 713 /* Shift each component left to the correct bitfield position. */ 714 bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32)); 715 716 /* Add everything up. */ 717 if (seen[shifts[c] / 32]) { 718 bld.OR(offset(dst, bld, shifts[c] / 32), 719 offset(dst, bld, shifts[c] / 32), tmp); 720 } else { 721 bld.MOV(offset(dst, bld, shifts[c] / 32), tmp); 722 seen[shifts[c] / 32] = true; 723 } 724 } 725 } 726 727 return dst; 728 } 729 730 /** 731 * Unpack a vector from the bitfield \p src given the per-component bit 732 * shifts and widths. Note that bitfield components are not allowed to 733 * cross 32-bit boundaries. 734 */ 735 static fs_reg emit_unpack(const fs_builder & bld,const fs_reg & src,const color_u & shifts,const color_u & widths)736 emit_unpack(const fs_builder &bld, const fs_reg &src, 737 const color_u &shifts, const color_u &widths) 738 { 739 const fs_reg dst = bld.vgrf(src.type, 4); 740 741 for (unsigned c = 0; c < 4; ++c) { 742 if (widths[c]) { 743 /* Shift left to discard the most significant bits. */ 744 bld.SHL(offset(dst, bld, c), 745 offset(src, bld, shifts[c] / 32), 746 brw_imm_ud(32 - shifts[c] % 32 - widths[c])); 747 748 /* Shift back to the least significant bits using an arithmetic 749 * shift to get sign extension on signed types. 750 */ 751 bld.ASR(offset(dst, bld, c), 752 offset(dst, bld, c), brw_imm_ud(32 - widths[c])); 753 } 754 } 755 756 return dst; 757 } 758 759 /** 760 * Convert an integer vector into another integer vector of the 761 * specified bit widths, properly handling overflow. 762 */ 763 static fs_reg emit_convert_to_integer(const fs_builder & bld,const fs_reg & src,const color_u & widths,bool is_signed)764 emit_convert_to_integer(const fs_builder &bld, const fs_reg &src, 765 const color_u &widths, bool is_signed) 766 { 767 const unsigned s = (is_signed ? 1 : 0); 768 const fs_reg dst = bld.vgrf( 769 is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); 770 assert(src.type == dst.type); 771 772 for (unsigned c = 0; c < 4; ++c) { 773 if (widths[c]) { 774 /* Clamp to the maximum value. */ 775 bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c), 776 brw_imm_d((int)scale(widths[c] - s)), 777 BRW_CONDITIONAL_L); 778 779 /* Clamp to the minimum value. */ 780 if (is_signed) 781 bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c), 782 brw_imm_d(-(int)scale(widths[c] - s) - 1), 783 BRW_CONDITIONAL_GE); 784 785 /* Mask off all but the bits we actually want. Otherwise, if 786 * we pass a negative number into the hardware when it's 787 * expecting something like UINT8, it will happily clamp it to 788 * +255 for us. 789 */ 790 if (is_signed && widths[c] < 32) 791 bld.AND(offset(dst, bld, c), offset(dst, bld, c), 792 brw_imm_d(scale(widths[c]))); 793 } 794 } 795 796 return dst; 797 } 798 799 /** 800 * Convert a normalized fixed-point vector of the specified signedness 801 * and bit widths into a floating point vector. 802 */ 803 static fs_reg emit_convert_from_scaled(const fs_builder & bld,const fs_reg & src,const color_u & widths,bool is_signed)804 emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src, 805 const color_u &widths, bool is_signed) 806 { 807 const unsigned s = (is_signed ? 1 : 0); 808 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4); 809 810 for (unsigned c = 0; c < 4; ++c) { 811 if (widths[c]) { 812 /* Convert to float. */ 813 bld.MOV(offset(dst, bld, c), offset(src, bld, c)); 814 815 /* Divide by the normalization constants. */ 816 bld.MUL(offset(dst, bld, c), offset(dst, bld, c), 817 brw_imm_f(1.0f / scale(widths[c] - s))); 818 819 /* Clamp to the minimum value. */ 820 if (is_signed) 821 bld.emit_minmax(offset(dst, bld, c), 822 offset(dst, bld, c), brw_imm_f(-1.0f), 823 BRW_CONDITIONAL_GE); 824 } 825 } 826 return dst; 827 } 828 829 /** 830 * Convert a floating-point vector into a normalized fixed-point vector 831 * of the specified signedness and bit widths. 832 */ 833 static fs_reg emit_convert_to_scaled(const fs_builder & bld,const fs_reg & src,const color_u & widths,bool is_signed)834 emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src, 835 const color_u &widths, bool is_signed) 836 { 837 const unsigned s = (is_signed ? 1 : 0); 838 const fs_reg dst = bld.vgrf( 839 is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4); 840 const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); 841 842 for (unsigned c = 0; c < 4; ++c) { 843 if (widths[c]) { 844 /* Clamp the normalized floating-point argument. */ 845 if (is_signed) { 846 bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c), 847 brw_imm_f(-1.0f), BRW_CONDITIONAL_GE); 848 849 bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), 850 brw_imm_f(1.0f), BRW_CONDITIONAL_L); 851 } else { 852 set_saturate(true, bld.MOV(offset(fdst, bld, c), 853 offset(src, bld, c))); 854 } 855 856 /* Multiply by the normalization constants. */ 857 bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c), 858 brw_imm_f((float)scale(widths[c] - s))); 859 860 /* Convert to integer. */ 861 bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c)); 862 bld.MOV(offset(dst, bld, c), offset(fdst, bld, c)); 863 864 /* Mask off all but the bits we actually want. Otherwise, if 865 * we pass a negative number into the hardware when it's 866 * expecting something like UINT8, it will happily clamp it to 867 * +255 for us. 868 */ 869 if (is_signed && widths[c] < 32) 870 bld.AND(offset(dst, bld, c), offset(dst, bld, c), 871 brw_imm_d(scale(widths[c]))); 872 } 873 } 874 875 return dst; 876 } 877 878 /** 879 * Convert a floating point vector of the specified bit widths into a 880 * 32-bit floating point vector. 881 */ 882 static fs_reg emit_convert_from_float(const fs_builder & bld,const fs_reg & src,const color_u & widths)883 emit_convert_from_float(const fs_builder &bld, const fs_reg &src, 884 const color_u &widths) 885 { 886 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); 887 const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); 888 889 for (unsigned c = 0; c < 4; ++c) { 890 if (widths[c]) { 891 bld.MOV(offset(dst, bld, c), offset(src, bld, c)); 892 893 /* Extend 10-bit and 11-bit floating point numbers to 15 bits. 894 * This works because they have a 5-bit exponent just like the 895 * 16-bit floating point format, and they have no sign bit. 896 */ 897 if (widths[c] < 16) 898 bld.SHL(offset(dst, bld, c), 899 offset(dst, bld, c), brw_imm_ud(15 - widths[c])); 900 901 /* Convert to 32-bit floating point. */ 902 bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c)); 903 } 904 } 905 906 return fdst; 907 } 908 909 /** 910 * Convert a vector into a floating point vector of the specified bit 911 * widths. 912 */ 913 static fs_reg emit_convert_to_float(const fs_builder & bld,const fs_reg & src,const color_u & widths)914 emit_convert_to_float(const fs_builder &bld, const fs_reg &src, 915 const color_u &widths) 916 { 917 const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); 918 const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F); 919 920 for (unsigned c = 0; c < 4; ++c) { 921 if (widths[c]) { 922 bld.MOV(offset(fdst, bld, c), offset(src, bld, c)); 923 924 /* Clamp to the minimum value. */ 925 if (widths[c] < 16) 926 bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c), 927 brw_imm_f(0.0f), BRW_CONDITIONAL_GE); 928 929 /* Convert to 16-bit floating-point. */ 930 bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c)); 931 932 /* Discard the least significant bits to get floating point 933 * numbers of the requested width. This works because the 934 * 10-bit and 11-bit floating point formats have a 5-bit 935 * exponent just like the 16-bit format, and they have no sign 936 * bit. 937 */ 938 if (widths[c] < 16) 939 bld.SHR(offset(dst, bld, c), offset(dst, bld, c), 940 brw_imm_ud(15 - widths[c])); 941 } 942 } 943 944 return dst; 945 } 946 947 /** 948 * Fill missing components of a vector with 0, 0, 0, 1. 949 */ 950 static fs_reg emit_pad(const fs_builder & bld,const fs_reg & src,const color_u & widths)951 emit_pad(const fs_builder &bld, const fs_reg &src, 952 const color_u &widths) 953 { 954 const fs_reg dst = bld.vgrf(src.type, 4); 955 const unsigned pad[] = { 0, 0, 0, 1 }; 956 957 for (unsigned c = 0; c < 4; ++c) 958 bld.MOV(offset(dst, bld, c), 959 widths[c] ? offset(src, bld, c) 960 : fs_reg(brw_imm_ud(pad[c]))); 961 962 return dst; 963 } 964 } 965 } 966 967 namespace brw { 968 namespace image_access { 969 /** 970 * Load a vector from a surface of the given format and dimensionality 971 * at the given coordinates. \p surf_dims and \p arr_dims give the 972 * number of non-array and array coordinates of the image respectively. 973 */ 974 fs_reg emit_image_load(const fs_builder & bld,const fs_reg & image,const fs_reg & addr,unsigned surf_dims,unsigned arr_dims,unsigned gl_format)975 emit_image_load(const fs_builder &bld, 976 const fs_reg &image, const fs_reg &addr, 977 unsigned surf_dims, unsigned arr_dims, 978 unsigned gl_format) 979 { 980 using namespace image_format_info; 981 using namespace image_format_conversion; 982 using namespace image_validity; 983 using namespace image_coordinates; 984 using namespace surface_access; 985 const gen_device_info *devinfo = bld.shader->devinfo; 986 const isl_format format = isl_format_for_gl_format(gl_format); 987 const isl_format lower_format = 988 isl_lower_storage_image_format(devinfo, format); 989 fs_reg tmp; 990 991 /* Transform the image coordinates into actual surface coordinates. */ 992 const fs_reg saddr = 993 emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); 994 const unsigned dims = 995 num_image_coordinates(bld, surf_dims, arr_dims, format); 996 997 if (isl_has_matching_typed_storage_image_format(devinfo, format)) { 998 /* Hopefully we get here most of the time... */ 999 tmp = emit_typed_read(bld, image, saddr, dims, 1000 isl_format_get_num_channels(lower_format)); 1001 } else { 1002 /* Untyped surface reads return 32 bits of the surface per 1003 * component, without any sort of unpacking or type conversion, 1004 */ 1005 const unsigned size = isl_format_get_layout(format)->bpb / 32; 1006 /* they don't properly handle out of bounds access, so we have to 1007 * check manually if the coordinates are valid and predicate the 1008 * surface read on the result, 1009 */ 1010 const brw_predicate pred = 1011 emit_untyped_image_check(bld, image, 1012 emit_bounds_check(bld, image, 1013 saddr, dims)); 1014 1015 /* and they don't know about surface coordinates, we need to 1016 * convert them to a raw memory offset. 1017 */ 1018 const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims); 1019 1020 tmp = emit_untyped_read(bld, image, laddr, 1, size, pred); 1021 1022 /* An out of bounds surface access should give zero as result. */ 1023 for (unsigned c = 0; c < size; ++c) 1024 set_predicate(pred, bld.SEL(offset(tmp, bld, c), 1025 offset(tmp, bld, c), brw_imm_d(0))); 1026 } 1027 1028 /* Set the register type to D instead of UD if the data type is 1029 * represented as a signed integer in memory so that sign extension 1030 * is handled correctly by unpack. 1031 */ 1032 if (needs_sign_extension(format)) 1033 tmp = retype(tmp, BRW_REGISTER_TYPE_D); 1034 1035 if (!has_supported_bit_layout(devinfo, format)) { 1036 /* Unpack individual vector components from the bitfield if the 1037 * hardware is unable to do it for us. 1038 */ 1039 if (has_split_bit_layout(devinfo, format)) 1040 tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format), 1041 get_bit_widths(lower_format)); 1042 else 1043 tmp = emit_unpack(bld, tmp, get_bit_shifts(format), 1044 get_bit_widths(format)); 1045 1046 } else if ((needs_sign_extension(format) && 1047 !is_conversion_trivial(devinfo, format)) || 1048 has_undefined_high_bits(devinfo, format)) { 1049 /* Perform a trivial unpack even though the bit layout matches in 1050 * order to get the most significant bits of each component 1051 * initialized properly. 1052 */ 1053 tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96), 1054 get_bit_widths(format)); 1055 } 1056 1057 if (!isl_format_has_int_channel(format)) { 1058 if (is_conversion_trivial(devinfo, format)) { 1059 /* Just need to cast the vector to the target type. */ 1060 tmp = retype(tmp, BRW_REGISTER_TYPE_F); 1061 } else { 1062 /* Do the right sort of type conversion to float. */ 1063 if (isl_format_has_float_channel(format)) 1064 tmp = emit_convert_from_float( 1065 bld, tmp, get_bit_widths(format)); 1066 else 1067 tmp = emit_convert_from_scaled( 1068 bld, tmp, get_bit_widths(format), 1069 isl_format_has_snorm_channel(format)); 1070 } 1071 } 1072 1073 /* Initialize missing components of the result. */ 1074 return emit_pad(bld, tmp, get_bit_widths(format)); 1075 } 1076 1077 /** 1078 * Store a vector in a surface of the given format and dimensionality at 1079 * the given coordinates. \p surf_dims and \p arr_dims give the number 1080 * of non-array and array coordinates of the image respectively. 1081 */ 1082 void emit_image_store(const fs_builder & bld,const fs_reg & image,const fs_reg & addr,const fs_reg & src,unsigned surf_dims,unsigned arr_dims,unsigned gl_format)1083 emit_image_store(const fs_builder &bld, const fs_reg &image, 1084 const fs_reg &addr, const fs_reg &src, 1085 unsigned surf_dims, unsigned arr_dims, 1086 unsigned gl_format) 1087 { 1088 using namespace image_format_info; 1089 using namespace image_format_conversion; 1090 using namespace image_validity; 1091 using namespace image_coordinates; 1092 using namespace surface_access; 1093 const isl_format format = isl_format_for_gl_format(gl_format); 1094 const gen_device_info *devinfo = bld.shader->devinfo; 1095 1096 /* Transform the image coordinates into actual surface coordinates. */ 1097 const fs_reg saddr = 1098 emit_image_coordinates(bld, addr, surf_dims, arr_dims, format); 1099 const unsigned dims = 1100 num_image_coordinates(bld, surf_dims, arr_dims, format); 1101 1102 if (gl_format == GL_NONE) { 1103 /* We don't know what the format is, but that's fine because it 1104 * implies write-only access, and typed surface writes are always 1105 * able to take care of type conversion and packing for us. 1106 */ 1107 emit_typed_write(bld, image, saddr, src, dims, 4); 1108 1109 } else { 1110 const isl_format lower_format = 1111 isl_lower_storage_image_format(devinfo, format); 1112 fs_reg tmp = src; 1113 1114 if (!is_conversion_trivial(devinfo, format)) { 1115 /* Do the right sort of type conversion. */ 1116 if (isl_format_has_float_channel(format)) 1117 tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format)); 1118 1119 else if (isl_format_has_int_channel(format)) 1120 tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format), 1121 isl_format_has_sint_channel(format)); 1122 1123 else 1124 tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format), 1125 isl_format_has_snorm_channel(format)); 1126 } 1127 1128 /* We're down to bit manipulation at this point. */ 1129 tmp = retype(tmp, BRW_REGISTER_TYPE_UD); 1130 1131 if (!has_supported_bit_layout(devinfo, format)) { 1132 /* Pack the vector components into a bitfield if the hardware 1133 * is unable to do it for us. 1134 */ 1135 if (has_split_bit_layout(devinfo, format)) 1136 tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format), 1137 get_bit_widths(lower_format)); 1138 1139 else 1140 tmp = emit_pack(bld, tmp, get_bit_shifts(format), 1141 get_bit_widths(format)); 1142 } 1143 1144 if (isl_has_matching_typed_storage_image_format(devinfo, format)) { 1145 /* Hopefully we get here most of the time... */ 1146 emit_typed_write(bld, image, saddr, tmp, dims, 1147 isl_format_get_num_channels(lower_format)); 1148 1149 } else { 1150 /* Untyped surface writes store 32 bits of the surface per 1151 * component, without any sort of packing or type conversion, 1152 */ 1153 const unsigned size = isl_format_get_layout(format)->bpb / 32; 1154 1155 /* they don't properly handle out of bounds access, so we have 1156 * to check manually if the coordinates are valid and predicate 1157 * the surface write on the result, 1158 */ 1159 const brw_predicate pred = 1160 emit_untyped_image_check(bld, image, 1161 emit_bounds_check(bld, image, 1162 saddr, dims)); 1163 1164 /* and, phew, they don't know about surface coordinates, we 1165 * need to convert them to a raw memory offset. 1166 */ 1167 const fs_reg laddr = emit_address_calculation( 1168 bld, image, saddr, dims); 1169 1170 emit_untyped_write(bld, image, laddr, tmp, 1, size, pred); 1171 } 1172 } 1173 } 1174 1175 /** 1176 * Perform an atomic read-modify-write operation in a surface of the 1177 * given dimensionality at the given coordinates. \p surf_dims and \p 1178 * arr_dims give the number of non-array and array coordinates of the 1179 * image respectively. Main building block of the imageAtomic GLSL 1180 * built-ins. 1181 */ 1182 fs_reg emit_image_atomic(const fs_builder & bld,const fs_reg & image,const fs_reg & addr,const fs_reg & src0,const fs_reg & src1,unsigned surf_dims,unsigned arr_dims,unsigned rsize,unsigned op)1183 emit_image_atomic(const fs_builder &bld, 1184 const fs_reg &image, const fs_reg &addr, 1185 const fs_reg &src0, const fs_reg &src1, 1186 unsigned surf_dims, unsigned arr_dims, 1187 unsigned rsize, unsigned op) 1188 { 1189 using namespace image_validity; 1190 using namespace image_coordinates; 1191 using namespace surface_access; 1192 /* Avoid performing an atomic operation on an unbound surface. */ 1193 const brw_predicate pred = emit_typed_atomic_check(bld, image); 1194 1195 /* Transform the image coordinates into actual surface coordinates. */ 1196 const fs_reg saddr = 1197 emit_image_coordinates(bld, addr, surf_dims, arr_dims, 1198 ISL_FORMAT_R32_UINT); 1199 const unsigned dims = 1200 num_image_coordinates(bld, surf_dims, arr_dims, 1201 ISL_FORMAT_R32_UINT); 1202 1203 /* Thankfully we can do without untyped atomics here. */ 1204 const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1, 1205 dims, rsize, op, pred); 1206 1207 /* An unbound surface access should give zero as result. */ 1208 if (rsize && pred) 1209 set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0))); 1210 1211 return retype(tmp, src0.type); 1212 } 1213 } 1214 } 1215