1 /*
2  * Copyright © 2013-2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "isl/isl.h"
25 #include "brw_fs_surface_builder.h"
26 #include "brw_fs.h"
27 
28 using namespace brw;
29 
30 namespace brw {
31    namespace surface_access {
32       namespace {
33          /**
34           * Generate a logical send opcode for a surface message and return
35           * the result.
36           */
37          fs_reg
emit_send(const fs_builder & bld,enum opcode opcode,const fs_reg & addr,const fs_reg & src,const fs_reg & surface,unsigned dims,unsigned arg,unsigned rsize,brw_predicate pred=BRW_PREDICATE_NONE)38          emit_send(const fs_builder &bld, enum opcode opcode,
39                    const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
40                    unsigned dims, unsigned arg, unsigned rsize,
41                    brw_predicate pred = BRW_PREDICATE_NONE)
42          {
43             /* Reduce the dynamically uniform surface index to a single
44              * scalar.
45              */
46             const fs_reg usurface = bld.emit_uniformize(surface);
47             const fs_reg srcs[] = {
48                addr, src, usurface, brw_imm_ud(dims), brw_imm_ud(arg)
49             };
50             const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
51             fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
52 
53             inst->size_written = rsize * dst.component_size(inst->exec_size);
54             inst->predicate = pred;
55             return dst;
56          }
57       }
58 
59       /**
60        * Emit an untyped surface read opcode.  \p dims determines the number
61        * of components of the address and \p size the number of components of
62        * the returned value.
63        */
64       fs_reg
emit_untyped_read(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,unsigned dims,unsigned size,brw_predicate pred)65       emit_untyped_read(const fs_builder &bld,
66                         const fs_reg &surface, const fs_reg &addr,
67                         unsigned dims, unsigned size,
68                         brw_predicate pred)
69       {
70          return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
71                           addr, fs_reg(), surface, dims, size, size, pred);
72       }
73 
74       /**
75        * Emit an untyped surface write opcode.  \p dims determines the number
76        * of components of the address and \p size the number of components of
77        * the argument.
78        */
79       void
emit_untyped_write(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src,unsigned dims,unsigned size,brw_predicate pred)80       emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
81                          const fs_reg &addr, const fs_reg &src,
82                          unsigned dims, unsigned size,
83                          brw_predicate pred)
84       {
85          emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
86                    addr, src, surface, dims, size, 0, pred);
87       }
88 
89       /**
90        * Emit an untyped surface atomic opcode.  \p dims determines the number
91        * of components of the address and \p rsize the number of components of
92        * the returned value (either zero or one).
93        */
94       fs_reg
emit_untyped_atomic(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src0,const fs_reg & src1,unsigned dims,unsigned rsize,unsigned op,brw_predicate pred)95       emit_untyped_atomic(const fs_builder &bld,
96                           const fs_reg &surface, const fs_reg &addr,
97                           const fs_reg &src0, const fs_reg &src1,
98                           unsigned dims, unsigned rsize, unsigned op,
99                           brw_predicate pred)
100       {
101          /* FINISHME: Factor out this frequently recurring pattern into a
102           * helper function.
103           */
104          const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
105          const fs_reg srcs[] = { src0, src1 };
106          const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
107          bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
108 
109          return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
110                           addr, tmp, surface, dims, op, rsize, pred);
111       }
112 
113       /**
114        * Emit a typed surface read opcode.  \p dims determines the number of
115        * components of the address and \p size the number of components of the
116        * returned value.
117        */
118       fs_reg
emit_typed_read(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,unsigned dims,unsigned size)119       emit_typed_read(const fs_builder &bld, const fs_reg &surface,
120                       const fs_reg &addr, unsigned dims, unsigned size)
121       {
122          return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
123                           addr, fs_reg(), surface, dims, size, size);
124       }
125 
126       /**
127        * Emit a typed surface write opcode.  \p dims determines the number of
128        * components of the address and \p size the number of components of the
129        * argument.
130        */
131       void
emit_typed_write(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src,unsigned dims,unsigned size)132       emit_typed_write(const fs_builder &bld, const fs_reg &surface,
133                        const fs_reg &addr, const fs_reg &src,
134                        unsigned dims, unsigned size)
135       {
136          emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
137                    addr, src, surface, dims, size, 0);
138       }
139 
140       /**
141        * Emit a typed surface atomic opcode.  \p dims determines the number of
142        * components of the address and \p rsize the number of components of
143        * the returned value (either zero or one).
144        */
145       fs_reg
emit_typed_atomic(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src0,const fs_reg & src1,unsigned dims,unsigned rsize,unsigned op,brw_predicate pred)146       emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
147                         const fs_reg &addr,
148                         const fs_reg &src0, const fs_reg &src1,
149                         unsigned dims, unsigned rsize, unsigned op,
150                         brw_predicate pred)
151       {
152          /* FINISHME: Factor out this frequently recurring pattern into a
153           * helper function.
154           */
155          const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
156          const fs_reg srcs[] = { src0, src1 };
157          const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
158          bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
159 
160          return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
161                           addr, tmp, surface, dims, op, rsize);
162       }
163 
164       fs_reg
emit_byte_scattered_read(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,unsigned dims,unsigned size,unsigned bit_size,brw_predicate pred)165       emit_byte_scattered_read(const fs_builder &bld,
166                                const fs_reg &surface, const fs_reg &addr,
167                                unsigned dims, unsigned size,
168                                unsigned bit_size, brw_predicate pred)
169       {
170          return emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
171                           addr, fs_reg(), surface, dims, bit_size, size, pred);
172       }
173 
174       void
emit_byte_scattered_write(const fs_builder & bld,const fs_reg & surface,const fs_reg & addr,const fs_reg & src,unsigned dims,unsigned size,unsigned bit_size,brw_predicate pred)175       emit_byte_scattered_write(const fs_builder &bld, const fs_reg &surface,
176                                 const fs_reg &addr, const fs_reg &src,
177                                 unsigned dims, unsigned size,
178                                 unsigned bit_size, brw_predicate pred)
179       {
180          emit_send(bld, SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
181                    addr, src, surface, dims, bit_size, 0, pred);
182       }
183    }
184 }
185 
186 namespace {
187    namespace image_format_info {
188       /* The higher compiler layers use the GL enums for image formats even if
189        * they come in from SPIR-V or Vulkan.  We need to turn them into an ISL
190        * enum before we can use them.
191        */
192       static enum isl_format
isl_format_for_gl_format(uint32_t gl_format)193       isl_format_for_gl_format(uint32_t gl_format)
194       {
195          switch (gl_format) {
196          case GL_R8:             return ISL_FORMAT_R8_UNORM;
197          case GL_R8_SNORM:       return ISL_FORMAT_R8_SNORM;
198          case GL_R8UI:           return ISL_FORMAT_R8_UINT;
199          case GL_R8I:            return ISL_FORMAT_R8_SINT;
200          case GL_RG8:            return ISL_FORMAT_R8G8_UNORM;
201          case GL_RG8_SNORM:      return ISL_FORMAT_R8G8_SNORM;
202          case GL_RG8UI:          return ISL_FORMAT_R8G8_UINT;
203          case GL_RG8I:           return ISL_FORMAT_R8G8_SINT;
204          case GL_RGBA8:          return ISL_FORMAT_R8G8B8A8_UNORM;
205          case GL_RGBA8_SNORM:    return ISL_FORMAT_R8G8B8A8_SNORM;
206          case GL_RGBA8UI:        return ISL_FORMAT_R8G8B8A8_UINT;
207          case GL_RGBA8I:         return ISL_FORMAT_R8G8B8A8_SINT;
208          case GL_R11F_G11F_B10F: return ISL_FORMAT_R11G11B10_FLOAT;
209          case GL_RGB10_A2:       return ISL_FORMAT_R10G10B10A2_UNORM;
210          case GL_RGB10_A2UI:     return ISL_FORMAT_R10G10B10A2_UINT;
211          case GL_R16:            return ISL_FORMAT_R16_UNORM;
212          case GL_R16_SNORM:      return ISL_FORMAT_R16_SNORM;
213          case GL_R16F:           return ISL_FORMAT_R16_FLOAT;
214          case GL_R16UI:          return ISL_FORMAT_R16_UINT;
215          case GL_R16I:           return ISL_FORMAT_R16_SINT;
216          case GL_RG16:           return ISL_FORMAT_R16G16_UNORM;
217          case GL_RG16_SNORM:     return ISL_FORMAT_R16G16_SNORM;
218          case GL_RG16F:          return ISL_FORMAT_R16G16_FLOAT;
219          case GL_RG16UI:         return ISL_FORMAT_R16G16_UINT;
220          case GL_RG16I:          return ISL_FORMAT_R16G16_SINT;
221          case GL_RGBA16:         return ISL_FORMAT_R16G16B16A16_UNORM;
222          case GL_RGBA16_SNORM:   return ISL_FORMAT_R16G16B16A16_SNORM;
223          case GL_RGBA16F:        return ISL_FORMAT_R16G16B16A16_FLOAT;
224          case GL_RGBA16UI:       return ISL_FORMAT_R16G16B16A16_UINT;
225          case GL_RGBA16I:        return ISL_FORMAT_R16G16B16A16_SINT;
226          case GL_R32F:           return ISL_FORMAT_R32_FLOAT;
227          case GL_R32UI:          return ISL_FORMAT_R32_UINT;
228          case GL_R32I:           return ISL_FORMAT_R32_SINT;
229          case GL_RG32F:          return ISL_FORMAT_R32G32_FLOAT;
230          case GL_RG32UI:         return ISL_FORMAT_R32G32_UINT;
231          case GL_RG32I:          return ISL_FORMAT_R32G32_SINT;
232          case GL_RGBA32F:        return ISL_FORMAT_R32G32B32A32_FLOAT;
233          case GL_RGBA32UI:       return ISL_FORMAT_R32G32B32A32_UINT;
234          case GL_RGBA32I:        return ISL_FORMAT_R32G32B32A32_SINT;
235          case GL_NONE:           return ISL_FORMAT_UNSUPPORTED;
236          default:
237             assert(!"Invalid image format");
238             return ISL_FORMAT_UNSUPPORTED;
239          }
240       }
241 
/**
 * Plain 4-tuple of unsigned scalars used to pass around one value per
 * color component (r, g, b, a).
 */
struct color_u {
   unsigned r, g, b, a;

   /** Set all four components to \p x (zero when omitted). */
   color_u(unsigned x = 0) :
      r(x), g(x), b(x), a(x)
   {
   }

   /** Set each component individually. */
   color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
      r(r), g(g), b(b), a(a)
   {
   }

   /**
    * Return component \p i, where 0, 1, 2 and 3 select r, g, b and a
    * respectively.  The index is not range-checked.
    */
   unsigned
   operator[](unsigned i) const
   {
      const unsigned components[4] = { r, g, b, a };

      return components[i];
   }
};
265 
266       /**
267        * Return the per-channel bitfield widths for a given image format.
268        */
269       inline color_u
get_bit_widths(isl_format format)270       get_bit_widths(isl_format format)
271       {
272          const isl_format_layout *fmtl = isl_format_get_layout(format);
273 
274          return color_u(fmtl->channels.r.bits,
275                         fmtl->channels.g.bits,
276                         fmtl->channels.b.bits,
277                         fmtl->channels.a.bits);
278       }
279 
280       /**
281        * Return the per-channel bitfield shifts for a given image format.
282        */
283       inline color_u
get_bit_shifts(isl_format format)284       get_bit_shifts(isl_format format)
285       {
286          const color_u widths = get_bit_widths(format);
287          return color_u(0, widths.r, widths.r + widths.g,
288                         widths.r + widths.g + widths.b);
289       }
290 
291       /**
292        * Return true if all present components have the same bit width.
293        */
294       inline bool
is_homogeneous(isl_format format)295       is_homogeneous(isl_format format)
296       {
297          const color_u widths = get_bit_widths(format);
298          return ((widths.g == 0 || widths.g == widths.r) &&
299                  (widths.b == 0 || widths.b == widths.r) &&
300                  (widths.a == 0 || widths.a == widths.r));
301       }
302 
303       /**
304        * Return true if the format conversion boils down to a trivial copy.
305        */
306       inline bool
is_conversion_trivial(const gen_device_info * devinfo,isl_format format)307       is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
308       {
309          return (get_bit_widths(format).r == 32 && is_homogeneous(format)) ||
310                  format == isl_lower_storage_image_format(devinfo, format);
311       }
312 
313       /**
314        * Return true if the hardware natively supports some format with
315        * compatible bitfield layout, but possibly different data types.
316        */
317       inline bool
has_supported_bit_layout(const gen_device_info * devinfo,isl_format format)318       has_supported_bit_layout(const gen_device_info *devinfo,
319                                isl_format format)
320       {
321          const color_u widths = get_bit_widths(format);
322          const color_u lower_widths = get_bit_widths(
323             isl_lower_storage_image_format(devinfo, format));
324 
325          return (widths.r == lower_widths.r &&
326                  widths.g == lower_widths.g &&
327                  widths.b == lower_widths.b &&
328                  widths.a == lower_widths.a);
329       }
330 
331       /**
332        * Return true if we are required to spread individual components over
333        * several components of the format used by the hardware (RG32 and
334        * friends implemented as RGBA16UI).
335        */
336       inline bool
has_split_bit_layout(const gen_device_info * devinfo,isl_format format)337       has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
338       {
339          const isl_format lower_format =
340             isl_lower_storage_image_format(devinfo, format);
341 
342          return (isl_format_get_num_channels(format) <
343                  isl_format_get_num_channels(lower_format));
344       }
345 
346       /**
347        * Return true if the hardware returns garbage in the unused high bits
348        * of each component.  This may happen on IVB because we rely on the
349        * undocumented behavior that typed reads from surfaces of the
350        * unsupported R8 and R16 formats return useful data in their least
351        * significant bits.
352        */
353       inline bool
has_undefined_high_bits(const gen_device_info * devinfo,isl_format format)354       has_undefined_high_bits(const gen_device_info *devinfo,
355                               isl_format format)
356       {
357          const isl_format lower_format =
358             isl_lower_storage_image_format(devinfo, format);
359 
360          return (devinfo->gen == 7 && !devinfo->is_haswell &&
361                  (lower_format == ISL_FORMAT_R16_UINT ||
362                   lower_format == ISL_FORMAT_R8_UINT));
363       }
364 
365       /**
366        * Return true if the format represents values as signed integers
367        * requiring sign extension when unpacking.
368        */
369       inline bool
needs_sign_extension(isl_format format)370       needs_sign_extension(isl_format format)
371       {
372          return isl_format_has_snorm_channel(format) ||
373                 isl_format_has_sint_channel(format);
374       }
375    }
376 
377    namespace image_validity {
378       /**
379        * Check whether the bound image is suitable for untyped access.
380        */
381       static brw_predicate
emit_untyped_image_check(const fs_builder & bld,const fs_reg & image,brw_predicate pred)382       emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
383                                brw_predicate pred)
384       {
385          const gen_device_info *devinfo = bld.shader->devinfo;
386          const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
387 
388          if (devinfo->gen == 7 && !devinfo->is_haswell) {
389             /* Check whether the first stride component (i.e. the Bpp value)
390              * is greater than four, what on Gen7 indicates that a surface of
391              * type RAW has been bound for untyped access.  Reading or writing
392              * to a surface of type other than RAW using untyped surface
393              * messages causes a hang on IVB and VLV.
394              */
395             set_predicate(pred,
396                           bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
397                                   BRW_CONDITIONAL_G));
398 
399             return BRW_PREDICATE_NORMAL;
400          } else {
401             /* More recent generations handle the format mismatch
402              * gracefully.
403              */
404             return pred;
405          }
406       }
407 
408       /**
409        * Check whether there is an image bound at the given index and write
410        * the comparison result to f0.0.  Returns an appropriate predication
411        * mode to use on subsequent image operations.
412        */
413       static brw_predicate
emit_typed_atomic_check(const fs_builder & bld,const fs_reg & image)414       emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
415       {
416          const gen_device_info *devinfo = bld.shader->devinfo;
417          const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
418 
419          if (devinfo->gen == 7 && !devinfo->is_haswell) {
420             /* Check the first component of the size field to find out if the
421              * image is bound.  Necessary on IVB for typed atomics because
422              * they don't seem to respect null surfaces and will happily
423              * corrupt or read random memory when no image is bound.
424              */
425             bld.CMP(bld.null_reg_ud(),
426                     retype(size, BRW_REGISTER_TYPE_UD),
427                     brw_imm_d(0), BRW_CONDITIONAL_NZ);
428 
429             return BRW_PREDICATE_NORMAL;
430          } else {
431             /* More recent platforms implement compliant behavior when a null
432              * surface is bound.
433              */
434             return BRW_PREDICATE_NONE;
435          }
436       }
437 
438       /**
439        * Check whether the provided coordinates are within the image bounds
440        * and write the comparison result to f0.0.  Returns an appropriate
441        * predication mode to use on subsequent image operations.
442        */
443       static brw_predicate
emit_bounds_check(const fs_builder & bld,const fs_reg & image,const fs_reg & addr,unsigned dims)444       emit_bounds_check(const fs_builder &bld, const fs_reg &image,
445                         const fs_reg &addr, unsigned dims)
446       {
447          const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
448 
449          for (unsigned c = 0; c < dims; ++c)
450             set_predicate(c == 0 ? BRW_PREDICATE_NONE : BRW_PREDICATE_NORMAL,
451                           bld.CMP(bld.null_reg_ud(),
452                                   offset(retype(addr, BRW_REGISTER_TYPE_UD), bld, c),
453                                   offset(size, bld, c),
454                                   BRW_CONDITIONAL_L));
455 
456          return BRW_PREDICATE_NORMAL;
457       }
458    }
459 
460    namespace image_coordinates {
461       /**
462        * Return the total number of coordinates needed to address a texel of
463        * the surface, which may be more than the sum of \p surf_dims and \p
464        * arr_dims if padding is required.
465        */
466       static unsigned
num_image_coordinates(const fs_builder & bld,unsigned surf_dims,unsigned arr_dims,isl_format format)467       num_image_coordinates(const fs_builder &bld,
468                             unsigned surf_dims, unsigned arr_dims,
469                             isl_format format)
470       {
471          /* HSW in vec4 mode and our software coordinate handling for untyped
472           * reads want the array index to be at the Z component.
473           */
474          const bool array_index_at_z =
475             format != ISL_FORMAT_UNSUPPORTED &&
476             !isl_has_matching_typed_storage_image_format(
477                bld.shader->devinfo, format);
478          const unsigned zero_dims =
479             ((surf_dims == 1 && arr_dims == 1 && array_index_at_z) ? 1 : 0);
480 
481          return surf_dims + zero_dims + arr_dims;
482       }
483 
484       /**
485        * Transform image coordinates into the form expected by the
486        * implementation.
487        */
488       static fs_reg
emit_image_coordinates(const fs_builder & bld,const fs_reg & addr,unsigned surf_dims,unsigned arr_dims,isl_format format)489       emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
490                              unsigned surf_dims, unsigned arr_dims,
491                              isl_format format)
492       {
493          const unsigned dims =
494             num_image_coordinates(bld, surf_dims, arr_dims, format);
495 
496          if (dims > surf_dims + arr_dims) {
497             assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
498             /* The array index is required to be passed in as the Z component,
499              * insert a zero at the Y component to shift it to the right
500              * position.
501              *
502              * FINISHME: Factor out this frequently recurring pattern into a
503              * helper function.
504              */
505             const fs_reg srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
506             const fs_reg dst = bld.vgrf(addr.type, dims);
507             bld.LOAD_PAYLOAD(dst, srcs, dims, 0);
508             return dst;
509          } else {
510             return addr;
511          }
512       }
513 
514       /**
515        * Calculate the offset in memory of the texel given by \p coord.
516        *
517        * This is meant to be used with untyped surface messages to access a
518        * tiled surface, what involves taking into account the tiling and
519        * swizzling modes of the surface manually so it will hopefully not
520        * happen very often.
521        *
522        * The tiling algorithm implemented here matches either the X or Y
523        * tiling layouts supported by the hardware depending on the tiling
524        * coefficients passed to the program as uniforms.  See Volume 1 Part 2
525        * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
526        * explanation of the hardware tiling format.
527        */
528       static fs_reg
emit_address_calculation(const fs_builder & bld,const fs_reg & image,const fs_reg & coord,unsigned dims)529       emit_address_calculation(const fs_builder &bld, const fs_reg &image,
530                                const fs_reg &coord, unsigned dims)
531       {
532          const gen_device_info *devinfo = bld.shader->devinfo;
533          const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
534          const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
535          const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
536          const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
537          const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
538          const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
539          const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
540          const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
541          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
542 
543          /* Shift the coordinates by the fixed surface offset.  It may be
544           * non-zero if the image is a single slice of a higher-dimensional
545           * surface, or if a non-zero mipmap level of the surface is bound to
546           * the pipeline.  The offset needs to be applied here rather than at
547           * surface state set-up time because the desired slice-level may
548           * start mid-tile, so simply shifting the surface base address
549           * wouldn't give a well-formed tiled surface in the general case.
550           */
551          for (unsigned c = 0; c < 2; ++c)
552             bld.ADD(offset(addr, bld, c), offset(off, bld, c),
553                     (c < dims ?
554                      offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
555                      fs_reg(brw_imm_d(0))));
556 
557          /* The layout of 3-D textures in memory is sort-of like a tiling
558           * format.  At each miplevel, the slices are arranged in rows of
559           * 2^level slices per row.  The slice row is stored in tmp.y and
560           * the slice within the row is stored in tmp.x.
561           *
562           * The layout of 2-D array textures and cubemaps is much simpler:
563           * Depending on whether the ARYSPC_LOD0 layout is in use it will be
564           * stored in memory as an array of slices, each one being a 2-D
565           * arrangement of miplevels, or as a 2D arrangement of miplevels,
566           * each one being an array of slices.  In either case the separation
567           * between slices of the same LOD is equal to the qpitch value
568           * provided as stride.w.
569           *
570           * This code can be made to handle either 2D arrays and 3D textures
571           * by passing in the miplevel as tile.z for 3-D textures and 0 in
572           * tile.z for 2-D array textures.
573           *
574           * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
575           * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
576           * of the hardware 3D texture and 2D array layouts.
577           */
578          if (dims > 2) {
579             /* Decompose z into a major (tmp.y) and a minor (tmp.x)
580              * index.
581              */
582             bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
583                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
584             bld.SHR(offset(tmp, bld, 1),
585                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
586                     offset(tile, bld, 2));
587 
588             /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
589              * slice offset.
590              */
591             for (unsigned c = 0; c < 2; ++c) {
592                bld.MUL(offset(tmp, bld, c),
593                        offset(stride, bld, 2 + c), offset(tmp, bld, c));
594                bld.ADD(offset(addr, bld, c),
595                        offset(addr, bld, c), offset(tmp, bld, c));
596             }
597          }
598 
599          if (dims > 1) {
600             /* Calculate the major/minor x and y indices.  In order to
601              * accommodate both X and Y tiling, the Y-major tiling format is
602              * treated as being a bunch of narrow X-tiles placed next to each
603              * other.  This means that the tile width for Y-tiling is actually
604              * the width of one sub-column of the Y-major tile where each 4K
605              * tile has 8 512B sub-columns.
606              *
607              * The major Y value is the row of tiles in which the pixel lives.
608              * The major X value is the tile sub-column in which the pixel
609              * lives; for X tiling, this is the same as the tile column, for Y
610              * tiling, each tile has 8 sub-columns.  The minor X and Y indices
611              * are the position within the sub-column.
612              */
613             for (unsigned c = 0; c < 2; ++c) {
614                /* Calculate the minor x and y indices. */
615                bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
616                        brw_imm_d(0), offset(addr, bld, c));
617 
618                /* Calculate the major x and y indices. */
619                bld.SHR(offset(major, bld, c),
620                        offset(addr, bld, c), offset(tile, bld, c));
621             }
622 
623             /* Calculate the texel index from the start of the tile row and
624              * the vertical coordinate of the row.
625              * Equivalent to:
626              *   tmp.x = (major.x << tile.y << tile.x) +
627              *           (minor.y << tile.x) + minor.x
628              *   tmp.y = major.y << tile.y
629              */
630             bld.SHL(tmp, major, offset(tile, bld, 1));
631             bld.ADD(tmp, tmp, offset(minor, bld, 1));
632             bld.SHL(tmp, tmp, offset(tile, bld, 0));
633             bld.ADD(tmp, tmp, minor);
634             bld.SHL(offset(tmp, bld, 1),
635                     offset(major, bld, 1), offset(tile, bld, 1));
636 
637             /* Add it to the start of the tile row. */
638             bld.MUL(offset(tmp, bld, 1),
639                     offset(tmp, bld, 1), offset(stride, bld, 1));
640             bld.ADD(tmp, tmp, offset(tmp, bld, 1));
641 
642             /* Multiply by the Bpp value. */
643             bld.MUL(dst, tmp, stride);
644 
645             if (devinfo->gen < 8 && !devinfo->is_baytrail) {
646                /* Take into account the two dynamically specified shifts.
                * Both are used to implement swizzling of X-tiled
648                 * surfaces.  For Y-tiled surfaces only one bit needs to be
649                 * XOR-ed with bit 6 of the memory address, so a swz value of
650                 * 0xff (actually interpreted as 31 by the hardware) will be
651                 * provided to cause the relevant bit of tmp.y to be zero and
652                 * turn the first XOR into the identity.  For linear surfaces
653                 * or platforms lacking address swizzling both shifts will be
654                 * 0xff causing the relevant bits of both tmp.x and .y to be
                * zero, which effectively disables swizzling.
656                 */
657                for (unsigned c = 0; c < 2; ++c)
658                   bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
659 
660                /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
661                bld.XOR(tmp, tmp, offset(tmp, bld, 1));
662                bld.AND(tmp, tmp, brw_imm_d(1 << 6));
663                bld.XOR(dst, dst, tmp);
664             }
665 
666          } else {
667             /* Multiply by the Bpp/stride value.  Note that the addr.y may be
668              * non-zero even if the image is one-dimensional because a
669              * vertical offset may have been applied above to select a
670              * non-zero slice or level of a higher-dimensional texture.
671              */
672             bld.MUL(offset(addr, bld, 1),
673                     offset(addr, bld, 1), offset(stride, bld, 1));
674             bld.ADD(addr, addr, offset(addr, bld, 1));
675             bld.MUL(dst, addr, stride);
676          }
677 
678          return dst;
679       }
680    }
681 
682    namespace image_format_conversion {
683       using image_format_info::color_u;
684 
685       namespace {
/**
 * Maximum representable value in an unsigned integer with the given
 * number of bits, i.e. a mask with the \p n least significant bits set.
 *
 * The shift is done on an unsigned operand so that n == 31 doesn't
 * overflow a signed int, and widths of 32 bits or more saturate to
 * all-ones rather than shifting by the full width of the type.  Both
 * cases would otherwise be undefined behaviour.
 */
inline unsigned
scale(unsigned n)
{
   return n >= 32 ? ~0u : (1u << n) - 1;
}
695       }
696 
697       /**
698        * Pack the vector \p src in a bitfield given the per-component bit
699        * shifts and widths.  Note that bitfield components are not allowed to
700        * cross 32-bit boundaries.
701        */
702       static fs_reg
emit_pack(const fs_builder & bld,const fs_reg & src,const color_u & shifts,const color_u & widths)703       emit_pack(const fs_builder &bld, const fs_reg &src,
704                 const color_u &shifts, const color_u &widths)
705       {
706          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
707          bool seen[4] = {};
708 
709          for (unsigned c = 0; c < 4; ++c) {
710             if (widths[c]) {
711                const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
712 
713                /* Shift each component left to the correct bitfield position. */
714                bld.SHL(tmp, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
715 
716                /* Add everything up. */
717                if (seen[shifts[c] / 32]) {
718                   bld.OR(offset(dst, bld, shifts[c] / 32),
719                          offset(dst, bld, shifts[c] / 32), tmp);
720                } else {
721                   bld.MOV(offset(dst, bld, shifts[c] / 32), tmp);
722                   seen[shifts[c] / 32] = true;
723                }
724             }
725          }
726 
727          return dst;
728       }
729 
730       /**
731        * Unpack a vector from the bitfield \p src given the per-component bit
732        * shifts and widths.  Note that bitfield components are not allowed to
733        * cross 32-bit boundaries.
734        */
735       static fs_reg
emit_unpack(const fs_builder & bld,const fs_reg & src,const color_u & shifts,const color_u & widths)736       emit_unpack(const fs_builder &bld, const fs_reg &src,
737                   const color_u &shifts, const color_u &widths)
738       {
739          const fs_reg dst = bld.vgrf(src.type, 4);
740 
741          for (unsigned c = 0; c < 4; ++c) {
742             if (widths[c]) {
743                /* Shift left to discard the most significant bits. */
744                bld.SHL(offset(dst, bld, c),
745                        offset(src, bld, shifts[c] / 32),
746                        brw_imm_ud(32 - shifts[c] % 32 - widths[c]));
747 
748                /* Shift back to the least significant bits using an arithmetic
749                 * shift to get sign extension on signed types.
750                 */
751                bld.ASR(offset(dst, bld, c),
752                        offset(dst, bld, c), brw_imm_ud(32 - widths[c]));
753             }
754          }
755 
756          return dst;
757       }
758 
759       /**
760        * Convert an integer vector into another integer vector of the
761        * specified bit widths, properly handling overflow.
762        */
763       static fs_reg
emit_convert_to_integer(const fs_builder & bld,const fs_reg & src,const color_u & widths,bool is_signed)764       emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
765                               const color_u &widths, bool is_signed)
766       {
767          const unsigned s = (is_signed ? 1 : 0);
768          const fs_reg dst = bld.vgrf(
769             is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
770          assert(src.type == dst.type);
771 
772          for (unsigned c = 0; c < 4; ++c) {
773             if (widths[c]) {
774                /* Clamp to the maximum value. */
775                bld.emit_minmax(offset(dst, bld, c), offset(src, bld, c),
776                                brw_imm_d((int)scale(widths[c] - s)),
777                                BRW_CONDITIONAL_L);
778 
779                /* Clamp to the minimum value. */
780                if (is_signed)
781                   bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
782                                   brw_imm_d(-(int)scale(widths[c] - s) - 1),
783                                   BRW_CONDITIONAL_GE);
784 
785                /* Mask off all but the bits we actually want.  Otherwise, if
786                 * we pass a negative number into the hardware when it's
787                 * expecting something like UINT8, it will happily clamp it to
788                 * +255 for us.
789                 */
790                if (is_signed && widths[c] < 32)
791                   bld.AND(offset(dst, bld, c), offset(dst, bld, c),
792                           brw_imm_d(scale(widths[c])));
793             }
794          }
795 
796          return dst;
797       }
798 
799       /**
800        * Convert a normalized fixed-point vector of the specified signedness
801        * and bit widths into a floating point vector.
802        */
803       static fs_reg
emit_convert_from_scaled(const fs_builder & bld,const fs_reg & src,const color_u & widths,bool is_signed)804       emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
805                                const color_u &widths, bool is_signed)
806       {
807          const unsigned s = (is_signed ? 1 : 0);
808          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
809 
810          for (unsigned c = 0; c < 4; ++c) {
811             if (widths[c]) {
812                /* Convert to float. */
813                bld.MOV(offset(dst, bld, c), offset(src, bld, c));
814 
815                /* Divide by the normalization constants. */
816                bld.MUL(offset(dst, bld, c), offset(dst, bld, c),
817                        brw_imm_f(1.0f / scale(widths[c] - s)));
818 
819                /* Clamp to the minimum value. */
820                if (is_signed)
821                   bld.emit_minmax(offset(dst, bld, c),
822                                   offset(dst, bld, c), brw_imm_f(-1.0f),
823                                   BRW_CONDITIONAL_GE);
824             }
825          }
826          return dst;
827       }
828 
829       /**
830        * Convert a floating-point vector into a normalized fixed-point vector
831        * of the specified signedness and bit widths.
832        */
833       static fs_reg
emit_convert_to_scaled(const fs_builder & bld,const fs_reg & src,const color_u & widths,bool is_signed)834       emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
835                              const color_u &widths, bool is_signed)
836       {
837          const unsigned s = (is_signed ? 1 : 0);
838          const fs_reg dst = bld.vgrf(
839             is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
840          const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
841 
842          for (unsigned c = 0; c < 4; ++c) {
843             if (widths[c]) {
844                /* Clamp the normalized floating-point argument. */
845                if (is_signed) {
846                   bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
847                                   brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
848 
849                   bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
850                                   brw_imm_f(1.0f), BRW_CONDITIONAL_L);
851                } else {
852                   set_saturate(true, bld.MOV(offset(fdst, bld, c),
853                                              offset(src, bld, c)));
854                }
855 
856                /* Multiply by the normalization constants. */
857                bld.MUL(offset(fdst, bld, c), offset(fdst, bld, c),
858                        brw_imm_f((float)scale(widths[c] - s)));
859 
860                /* Convert to integer. */
861                bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
862                bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
863 
864                /* Mask off all but the bits we actually want.  Otherwise, if
865                 * we pass a negative number into the hardware when it's
866                 * expecting something like UINT8, it will happily clamp it to
867                 * +255 for us.
868                 */
869                if (is_signed && widths[c] < 32)
870                   bld.AND(offset(dst, bld, c), offset(dst, bld, c),
871                           brw_imm_d(scale(widths[c])));
872             }
873          }
874 
875          return dst;
876       }
877 
878       /**
879        * Convert a floating point vector of the specified bit widths into a
880        * 32-bit floating point vector.
881        */
882       static fs_reg
emit_convert_from_float(const fs_builder & bld,const fs_reg & src,const color_u & widths)883       emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
884                               const color_u &widths)
885       {
886          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
887          const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
888 
889          for (unsigned c = 0; c < 4; ++c) {
890             if (widths[c]) {
891                bld.MOV(offset(dst, bld, c), offset(src, bld, c));
892 
893                /* Extend 10-bit and 11-bit floating point numbers to 15 bits.
894                 * This works because they have a 5-bit exponent just like the
895                 * 16-bit floating point format, and they have no sign bit.
896                 */
897                if (widths[c] < 16)
898                   bld.SHL(offset(dst, bld, c),
899                           offset(dst, bld, c), brw_imm_ud(15 - widths[c]));
900 
901                /* Convert to 32-bit floating point. */
902                bld.F16TO32(offset(fdst, bld, c), offset(dst, bld, c));
903             }
904          }
905 
906          return fdst;
907       }
908 
909       /**
910        * Convert a vector into a floating point vector of the specified bit
911        * widths.
912        */
913       static fs_reg
emit_convert_to_float(const fs_builder & bld,const fs_reg & src,const color_u & widths)914       emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
915                             const color_u &widths)
916       {
917          const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
918          const fs_reg fdst = retype(dst, BRW_REGISTER_TYPE_F);
919 
920          for (unsigned c = 0; c < 4; ++c) {
921             if (widths[c]) {
922                bld.MOV(offset(fdst, bld, c), offset(src, bld, c));
923 
924                /* Clamp to the minimum value. */
925                if (widths[c] < 16)
926                   bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
927                                   brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
928 
929                /* Convert to 16-bit floating-point. */
930                bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));
931 
932                /* Discard the least significant bits to get floating point
933                 * numbers of the requested width.  This works because the
934                 * 10-bit and 11-bit floating point formats have a 5-bit
935                 * exponent just like the 16-bit format, and they have no sign
936                 * bit.
937                 */
938                if (widths[c] < 16)
939                   bld.SHR(offset(dst, bld, c), offset(dst, bld, c),
940                           brw_imm_ud(15 - widths[c]));
941             }
942          }
943 
944          return dst;
945       }
946 
947       /**
948        * Fill missing components of a vector with 0, 0, 0, 1.
949        */
950       static fs_reg
emit_pad(const fs_builder & bld,const fs_reg & src,const color_u & widths)951       emit_pad(const fs_builder &bld, const fs_reg &src,
952                const color_u &widths)
953       {
954          const fs_reg dst = bld.vgrf(src.type, 4);
955          const unsigned pad[] = { 0, 0, 0, 1 };
956 
957          for (unsigned c = 0; c < 4; ++c)
958             bld.MOV(offset(dst, bld, c),
959                     widths[c] ? offset(src, bld, c)
960                               : fs_reg(brw_imm_ud(pad[c])));
961 
962          return dst;
963       }
964    }
965 }
966 
967 namespace brw {
968    namespace image_access {
969       /**
970        * Load a vector from a surface of the given format and dimensionality
971        * at the given coordinates.  \p surf_dims and \p arr_dims give the
972        * number of non-array and array coordinates of the image respectively.
973        */
974       fs_reg
emit_image_load(const fs_builder & bld,const fs_reg & image,const fs_reg & addr,unsigned surf_dims,unsigned arr_dims,unsigned gl_format)975       emit_image_load(const fs_builder &bld,
976                       const fs_reg &image, const fs_reg &addr,
977                       unsigned surf_dims, unsigned arr_dims,
978                       unsigned gl_format)
979       {
980          using namespace image_format_info;
981          using namespace image_format_conversion;
982          using namespace image_validity;
983          using namespace image_coordinates;
984          using namespace surface_access;
985          const gen_device_info *devinfo = bld.shader->devinfo;
986          const isl_format format = isl_format_for_gl_format(gl_format);
987          const isl_format lower_format =
988             isl_lower_storage_image_format(devinfo, format);
989          fs_reg tmp;
990 
991          /* Transform the image coordinates into actual surface coordinates. */
992          const fs_reg saddr =
993             emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
994          const unsigned dims =
995             num_image_coordinates(bld, surf_dims, arr_dims, format);
996 
997          if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
998             /* Hopefully we get here most of the time... */
999             tmp = emit_typed_read(bld, image, saddr, dims,
1000                                   isl_format_get_num_channels(lower_format));
1001          } else {
1002             /* Untyped surface reads return 32 bits of the surface per
1003              * component, without any sort of unpacking or type conversion,
1004              */
1005             const unsigned size = isl_format_get_layout(format)->bpb / 32;
1006             /* they don't properly handle out of bounds access, so we have to
1007              * check manually if the coordinates are valid and predicate the
1008              * surface read on the result,
1009              */
1010             const brw_predicate pred =
1011                emit_untyped_image_check(bld, image,
1012                                         emit_bounds_check(bld, image,
1013                                                           saddr, dims));
1014 
1015             /* and they don't know about surface coordinates, we need to
1016              * convert them to a raw memory offset.
1017              */
1018             const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
1019 
1020             tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
1021 
1022             /* An out of bounds surface access should give zero as result. */
1023             for (unsigned c = 0; c < size; ++c)
1024                set_predicate(pred, bld.SEL(offset(tmp, bld, c),
1025                                            offset(tmp, bld, c), brw_imm_d(0)));
1026          }
1027 
1028          /* Set the register type to D instead of UD if the data type is
1029           * represented as a signed integer in memory so that sign extension
1030           * is handled correctly by unpack.
1031           */
1032          if (needs_sign_extension(format))
1033             tmp = retype(tmp, BRW_REGISTER_TYPE_D);
1034 
1035          if (!has_supported_bit_layout(devinfo, format)) {
1036             /* Unpack individual vector components from the bitfield if the
1037              * hardware is unable to do it for us.
1038              */
1039             if (has_split_bit_layout(devinfo, format))
1040                tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
1041                                get_bit_widths(lower_format));
1042             else
1043                tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
1044                                  get_bit_widths(format));
1045 
1046          } else if ((needs_sign_extension(format) &&
1047                      !is_conversion_trivial(devinfo, format)) ||
1048                     has_undefined_high_bits(devinfo, format)) {
1049             /* Perform a trivial unpack even though the bit layout matches in
1050              * order to get the most significant bits of each component
1051              * initialized properly.
1052              */
1053             tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
1054                               get_bit_widths(format));
1055          }
1056 
1057          if (!isl_format_has_int_channel(format)) {
1058             if (is_conversion_trivial(devinfo, format)) {
1059                /* Just need to cast the vector to the target type. */
1060                tmp = retype(tmp, BRW_REGISTER_TYPE_F);
1061             } else {
1062                /* Do the right sort of type conversion to float. */
1063                if (isl_format_has_float_channel(format))
1064                   tmp = emit_convert_from_float(
1065                      bld, tmp, get_bit_widths(format));
1066                else
1067                   tmp = emit_convert_from_scaled(
1068                      bld, tmp, get_bit_widths(format),
1069                      isl_format_has_snorm_channel(format));
1070             }
1071          }
1072 
1073          /* Initialize missing components of the result. */
1074          return emit_pad(bld, tmp, get_bit_widths(format));
1075       }
1076 
1077       /**
1078        * Store a vector in a surface of the given format and dimensionality at
1079        * the given coordinates.  \p surf_dims and \p arr_dims give the number
1080        * of non-array and array coordinates of the image respectively.
1081        */
1082       void
emit_image_store(const fs_builder & bld,const fs_reg & image,const fs_reg & addr,const fs_reg & src,unsigned surf_dims,unsigned arr_dims,unsigned gl_format)1083       emit_image_store(const fs_builder &bld, const fs_reg &image,
1084                        const fs_reg &addr, const fs_reg &src,
1085                        unsigned surf_dims, unsigned arr_dims,
1086                        unsigned gl_format)
1087       {
1088          using namespace image_format_info;
1089          using namespace image_format_conversion;
1090          using namespace image_validity;
1091          using namespace image_coordinates;
1092          using namespace surface_access;
1093          const isl_format format = isl_format_for_gl_format(gl_format);
1094          const gen_device_info *devinfo = bld.shader->devinfo;
1095 
1096          /* Transform the image coordinates into actual surface coordinates. */
1097          const fs_reg saddr =
1098             emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
1099          const unsigned dims =
1100             num_image_coordinates(bld, surf_dims, arr_dims, format);
1101 
1102          if (gl_format == GL_NONE) {
1103             /* We don't know what the format is, but that's fine because it
1104              * implies write-only access, and typed surface writes are always
1105              * able to take care of type conversion and packing for us.
1106              */
1107             emit_typed_write(bld, image, saddr, src, dims, 4);
1108 
1109          } else {
1110             const isl_format lower_format =
1111                isl_lower_storage_image_format(devinfo, format);
1112             fs_reg tmp = src;
1113 
1114             if (!is_conversion_trivial(devinfo, format)) {
1115                /* Do the right sort of type conversion. */
1116                if (isl_format_has_float_channel(format))
1117                   tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
1118 
1119                else if (isl_format_has_int_channel(format))
1120                   tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
1121                                                 isl_format_has_sint_channel(format));
1122 
1123                else
1124                   tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
1125                                                isl_format_has_snorm_channel(format));
1126             }
1127 
1128             /* We're down to bit manipulation at this point. */
1129             tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
1130 
1131             if (!has_supported_bit_layout(devinfo, format)) {
1132                /* Pack the vector components into a bitfield if the hardware
1133                 * is unable to do it for us.
1134                 */
1135                if (has_split_bit_layout(devinfo, format))
1136                   tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
1137                                     get_bit_widths(lower_format));
1138 
1139                else
1140                   tmp = emit_pack(bld, tmp, get_bit_shifts(format),
1141                                   get_bit_widths(format));
1142             }
1143 
1144             if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
1145                /* Hopefully we get here most of the time... */
1146                emit_typed_write(bld, image, saddr, tmp, dims,
1147                                 isl_format_get_num_channels(lower_format));
1148 
1149             } else {
1150                /* Untyped surface writes store 32 bits of the surface per
1151                 * component, without any sort of packing or type conversion,
1152                 */
1153                const unsigned size = isl_format_get_layout(format)->bpb / 32;
1154 
1155                /* they don't properly handle out of bounds access, so we have
1156                 * to check manually if the coordinates are valid and predicate
1157                 * the surface write on the result,
1158                 */
1159                const brw_predicate pred =
1160                   emit_untyped_image_check(bld, image,
1161                                            emit_bounds_check(bld, image,
1162                                                              saddr, dims));
1163 
1164                /* and, phew, they don't know about surface coordinates, we
1165                 * need to convert them to a raw memory offset.
1166                 */
1167                const fs_reg laddr = emit_address_calculation(
1168                   bld, image, saddr, dims);
1169 
1170                emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
1171             }
1172          }
1173       }
1174 
1175       /**
1176        * Perform an atomic read-modify-write operation in a surface of the
1177        * given dimensionality at the given coordinates.  \p surf_dims and \p
1178        * arr_dims give the number of non-array and array coordinates of the
1179        * image respectively.  Main building block of the imageAtomic GLSL
1180        * built-ins.
1181        */
1182       fs_reg
emit_image_atomic(const fs_builder & bld,const fs_reg & image,const fs_reg & addr,const fs_reg & src0,const fs_reg & src1,unsigned surf_dims,unsigned arr_dims,unsigned rsize,unsigned op)1183       emit_image_atomic(const fs_builder &bld,
1184                         const fs_reg &image, const fs_reg &addr,
1185                         const fs_reg &src0, const fs_reg &src1,
1186                         unsigned surf_dims, unsigned arr_dims,
1187                         unsigned rsize, unsigned op)
1188       {
1189          using namespace image_validity;
1190          using namespace image_coordinates;
1191          using namespace surface_access;
1192          /* Avoid performing an atomic operation on an unbound surface. */
1193          const brw_predicate pred = emit_typed_atomic_check(bld, image);
1194 
1195          /* Transform the image coordinates into actual surface coordinates. */
1196          const fs_reg saddr =
1197             emit_image_coordinates(bld, addr, surf_dims, arr_dims,
1198                                    ISL_FORMAT_R32_UINT);
1199          const unsigned dims =
1200             num_image_coordinates(bld, surf_dims, arr_dims,
1201                                   ISL_FORMAT_R32_UINT);
1202 
1203          /* Thankfully we can do without untyped atomics here. */
1204          const fs_reg tmp = emit_typed_atomic(bld, image, saddr, src0, src1,
1205                                               dims, rsize, op, pred);
1206 
1207          /* An unbound surface access should give zero as result. */
1208          if (rsize && pred)
1209             set_predicate(pred, bld.SEL(tmp, tmp, brw_imm_d(0)));
1210 
1211          return retype(tmp, src0.type);
1212       }
1213    }
1214 }
1215