/*
 * Copyright © 2020 Google LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* Lowers nir_intrinsic_load_ubo() to nir_intrinsic_load_ubo_vec4() taking an
 * offset in vec4 units.  This is a fairly common mode of UBO addressing for
 * hardware to have, and it gives NIR a chance to optimize the addressing math
 * and CSE the loads.
 *
 * This pass handles lowering for loads that straddle a vec4 alignment
 * boundary.  We try to minimize the extra loads we generate for that case,
 * and we are guaranteed non-straddling loads with:
 *
 * - std140 (GLSL 1.40, GLSL ES)
 * - Vulkan "Extended Layout" (the baseline for UBOs)
 *
 * but not with:
 *
 * - GLSL 4.30's new packed mode (enabled by PIPE_CAP_LOAD_CONSTBUF), where
 *   vec3 arrays are packed tightly.
 *
 * - PackedDriverUniformStorage in GL (enabled by PIPE_CAP_PACKED_UNIFORMS)
 *   combined with nir_lower_uniforms_to_ubo, where values in the default
 *   uniform block are packed tightly.
 *
 * - Vulkan's scalarBlockLayout optional feature:
 *
 *   "A member is defined to improperly straddle if either of the following are
 *    true:
 *
 *    • It is a vector with total size less than or equal to 16 bytes, and has
 *      Offset decorations placing its first byte at F and its last byte at L
 *      where floor(F / 16) != floor(L / 16).
 *    • It is a vector with total size greater than 16 bytes and has its Offset
 *      decorations placing its first byte at a non-integer multiple of 16.
 *
 *    [...]
 *
 *    Unless the scalarBlockLayout feature is enabled on the device:
 *
 *    • Vectors must not improperly straddle, as defined above."
 */
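
/* Rough worked example of the addressing math (illustrative numbers, 32-bit
 * channels assumed): a load at byte offset 36 maps to vec4 slot 36 >> 4 == 2
 * and starts at channel (36 / 4) & 3 == 1 of that slot; whether it also needs
 * data from slot 3 depends on how many components are loaded.
 */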

#include "nir.h"
#include "nir_builder.h"

static bool
nir_lower_ubo_vec4_filter(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo;
}

static nir_intrinsic_instr *
nir_load_ubo_vec4(nir_builder *b, nir_ssa_def *block, nir_ssa_def *offset,
                  unsigned bit_size, unsigned num_components)
{
   nir_intrinsic_instr *load =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_vec4);
   load->src[0] = nir_src_for_ssa(block);
   load->src[1] = nir_src_for_ssa(offset);

   nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size, NULL);
   load->num_components = num_components;
   nir_builder_instr_insert(b, &load->instr);

   return load;
}

static nir_ssa_def *
nir_lower_ubo_vec4_lower(nir_builder *b, nir_instr *instr, void *data)
{
   b->cursor = nir_before_instr(instr);

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

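   /* Get the byte offset as an SSA value and derive the vec4-granular offset
    * from it: each vec4 slot covers 16 bytes.
    */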
   nir_ssa_def *byte_offset = nir_ssa_for_src(b, intr->src[1], 1);
   nir_ssa_def *vec4_offset = nir_ushr_imm(b, byte_offset, 4);

   unsigned align_mul = nir_intrinsic_align_mul(intr);
   unsigned align_offset = nir_intrinsic_align_offset(intr);

   int chan_size_bytes = intr->dest.ssa.bit_size / 8;
   int chans_per_vec4 = 16 / chan_size_bytes;

   /* We don't care if someone figured out that things are aligned beyond
    * vec4.
    */
   align_mul = MIN2(align_mul, 16);
   align_offset &= 15;
   assert(align_offset % chan_size_bytes == 0);

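   /* A single aligned vec4 load suffices only when the whole result is known
    * to land within one 16-byte slot.  Otherwise, load full vec4s and pick
    * the channels out below.
    */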
   unsigned num_components = intr->num_components;
   bool aligned_mul = (align_mul == 16 &&
                       align_offset + chan_size_bytes * num_components <= 16);
   if (!aligned_mul)
      num_components = chans_per_vec4;

   nir_intrinsic_instr *load = nir_load_ubo_vec4(b, intr->src[0].ssa,
                                                 vec4_offset,
                                                 intr->dest.ssa.bit_size,
                                                 num_components);

   nir_ssa_def *result = &load->dest.ssa;

   int align_chan_offset = align_offset / chan_size_bytes;
   if (aligned_mul) {
      /* For an aligned load, just ask the backend to load from the known
       * offset's component.
       */
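      /* For instance (illustrative numbers): a 32-bit vec2 at align_offset 8
       * occupies components 2..3 of the vec4, so component is set to 2 here.
       */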
      nir_intrinsic_set_component(load, align_chan_offset);
   } else if (intr->num_components == 1) {
      /* If we're loading a single component, that component alone won't
       * straddle a vec4 boundary so we can do this with a single UBO load.
       */
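      /* E.g. (illustrative numbers): a 32-bit scalar at byte offset 20 loads
       * vec4 slot 1 and extracts component (20 / 4) & 3 == 1 of it.
       */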
      nir_ssa_def *component =
         nir_iand_imm(b,
                      nir_udiv_imm(b, byte_offset, chan_size_bytes),
                      chans_per_vec4 - 1);

      result = nir_vector_extract(b, result, component);
   } else if (align_mul == 8 &&
              align_offset + chan_size_bytes * intr->num_components <= 16) {
      /* Special case: Loading small vectors from offset % 8 == 0 can be done
       * with just one load and one bcsel.
       */
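      /* E.g. (illustrative numbers): a 32-bit vec2 with align_offset 0 gets
       * low_channels == 0x3 (xy) and high_channels == 0xc (zw); bit 3 of the
       * byte offset then selects which half of the vec4 holds the data.
       */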
      nir_component_mask_t low_channels =
         BITSET_MASK(intr->num_components) << (align_chan_offset);
      nir_component_mask_t high_channels =
         low_channels << (8 / chan_size_bytes);
      result = nir_bcsel(b,
                         nir_i2b(b, nir_iand_imm(b, byte_offset, 8)),
                         nir_channels(b, result, high_channels),
                         nir_channels(b, result, low_channels));
   } else {
      /* General fallback case: Per-result-channel bcsel-based extraction
       * from two separate vec4 loads.
       */
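      /* E.g. (illustrative numbers): a 32-bit vec4 at byte offset 8 takes its
       * first two channels from components 2..3 of the first vec4 load and
       * its last two from components 0..1 of the second.
       */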
      assert(num_components == 4);
      nir_ssa_def *next_vec4_offset = nir_iadd_imm(b, vec4_offset, 1);
      nir_intrinsic_instr *next_load = nir_load_ubo_vec4(b, intr->src[0].ssa,
                                                         next_vec4_offset,
                                                         intr->dest.ssa.bit_size,
                                                         num_components);

      nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
      for (unsigned i = 0; i < intr->num_components; i++) {
         nir_ssa_def *chan_byte_offset = nir_iadd_imm(b, byte_offset, i * chan_size_bytes);

         nir_ssa_def *chan_vec4_offset = nir_ushr_imm(b, chan_byte_offset, 4);

         nir_ssa_def *component =
            nir_iand_imm(b,
                         nir_udiv_imm(b, chan_byte_offset, chan_size_bytes),
                         chans_per_vec4 - 1);

         channels[i] = nir_vector_extract(b,
                                          nir_bcsel(b,
                                                    nir_ieq(b,
                                                            chan_vec4_offset,
                                                            vec4_offset),
                                                    &load->dest.ssa,
                                                    &next_load->dest.ssa),
                                          component);
      }

      result = nir_vec(b, channels, intr->num_components);
   }

   return result;
}

bool
nir_lower_ubo_vec4(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        nir_lower_ubo_vec4_filter,
                                        nir_lower_ubo_vec4_lower,
                                        NULL);
}