/*
 * Copyright © 2020 Google LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/* Lowers nir_intrinsic_load_ubo() to nir_intrinsic_load_ubo_vec4() taking an
 * offset in vec4 units.  This is a fairly common mode of UBO addressing for
 * hardware to have, and it gives NIR a chance to optimize the addressing math
 * and CSE the loads.
 *
 * This pass handles lowering for loads that straddle a vec4 alignment
 * boundary.  We try to minimize the extra loads we generate for that case,
 * and are ensured non-straddling loads with:
 *
 * - std140 (GLSL 1.40, GLSL ES)
 * - Vulkan "Extended Layout" (the baseline for UBOs)
 *
 * but not:
 *
 * - GLSL 4.30's new packed mode (enabled by PIPE_CAP_LOAD_CONSTBUF) where
 *   vec3 arrays are packed tightly.
 *
 * - PackedDriverUniformStorage in GL (enabled by PIPE_CAP_PACKED_UNIFORMS)
 *   combined with nir_lower_uniforms_to_ubo, where values in the default
 *   uniform block are packed tightly.
 *
 * - Vulkan's scalarBlockLayout optional feature:
 *
 *   "A member is defined to improperly straddle if either of the following
 *    are true:
 *
 *    • It is a vector with total size less than or equal to 16 bytes, and has
 *      Offset decorations placing its first byte at F and its last byte at L
 *      where floor(F / 16) != floor(L / 16).
 *    • It is a vector with total size greater than 16 bytes and has its Offset
 *      decorations placing its first byte at a non-integer multiple of 16.
 *
 *    [...]
 *
 *    Unless the scalarBlockLayout feature is enabled on the device:
 *
 *    • Vectors must not improperly straddle, as defined above."
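 *
 * An illustrative example of a straddling load: under one of the packed
 * layouts above, a vec4 of 32-bit floats at byte offset 8 covers bytes 8..23,
 * so it crosses from vec4 slot 0 into slot 1.  The general fallback path
 * below handles such a load with two load_ubo_vec4s and a per-channel select
 * between them.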
 */

#include "nir.h"
#include "nir_builder.h"

static bool
nir_lower_ubo_vec4_filter(const nir_instr *instr, const void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   return nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo;
}

static nir_intrinsic_instr *
nir_load_ubo_vec4(nir_builder *b, nir_ssa_def *block, nir_ssa_def *offset,
                  unsigned bit_size, unsigned num_components)
{
   nir_intrinsic_instr *load =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_vec4);
   load->src[0] = nir_src_for_ssa(block);
   load->src[1] = nir_src_for_ssa(offset);
   nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size, NULL);
   load->num_components = num_components;
   nir_builder_instr_insert(b, &load->instr);

   return load;
}

static nir_ssa_def *
nir_lower_ubo_vec4_lower(nir_builder *b, nir_instr *instr, void *data)
{
   b->cursor = nir_before_instr(instr);

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

   nir_ssa_def *byte_offset = nir_ssa_for_src(b, intr->src[1], 1);
   nir_ssa_def *vec4_offset = nir_ushr_imm(b, byte_offset, 4);

   unsigned align_mul = nir_intrinsic_align_mul(intr);
   unsigned align_offset = nir_intrinsic_align_offset(intr);

   int chan_size_bytes = intr->dest.ssa.bit_size / 8;
   int chans_per_vec4 = 16 / chan_size_bytes;

   /* We don't care if someone figured out that things are aligned beyond
    * vec4.
    */
   align_mul = MIN2(align_mul, 16);
   align_offset &= 15;
   assert(align_offset % chan_size_bytes == 0);

   unsigned num_components = intr->num_components;
   bool aligned_mul = (align_mul == 16 &&
                       align_offset + chan_size_bytes * num_components <= 16);
   if (!aligned_mul)
      num_components = chans_per_vec4;

   nir_intrinsic_instr *load = nir_load_ubo_vec4(b, intr->src[0].ssa,
                                                 vec4_offset,
                                                 intr->dest.ssa.bit_size,
                                                 num_components);
   nir_ssa_def *result = &load->dest.ssa;

   int align_chan_offset = align_offset / chan_size_bytes;
   if (aligned_mul) {
      /* For an aligned load, just ask the backend to load from the known
       * offset's component.
       */
      nir_intrinsic_set_component(load, align_chan_offset);
   } else if (intr->num_components == 1) {
      /* If we're loading a single component, that component alone won't
       * straddle a vec4 boundary so we can do this with a single UBO load.
       */
      nir_ssa_def *component =
         nir_iand_imm(b,
                      nir_udiv_imm(b, byte_offset, chan_size_bytes),
                      chans_per_vec4 - 1);

      result = nir_vector_extract(b, result, component);
   } else if (align_mul == 8 &&
              align_offset + chan_size_bytes * intr->num_components <= 16) {
      /* Special case: Loading small vectors from offset % 8 == 0 can be done
       * with just one load and one bcsel.
       */
      nir_component_mask_t low_channels =
         BITSET_MASK(intr->num_components) << (align_chan_offset);
      nir_component_mask_t high_channels =
         low_channels << (8 / chan_size_bytes);
      result = nir_bcsel(b, nir_i2b(b, nir_iand_imm(b, byte_offset, 8)),
                         nir_channels(b, result, high_channels),
                         nir_channels(b, result, low_channels));
   } else {
      /* General fallback case: Per-result-channel bcsel-based extraction
       * from two separate vec4 loads.
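       *
       * We fetch the vec4 at vec4_offset and the one after it, then for each
       * result channel recompute that channel's byte offset and bcsel between
       * the two loads based on which vec4 slot the channel falls in before
       * extracting it.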
       */
      assert(num_components == 4);

      nir_ssa_def *next_vec4_offset = nir_iadd_imm(b, vec4_offset, 1);
      nir_intrinsic_instr *next_load = nir_load_ubo_vec4(b, intr->src[0].ssa,
                                                         next_vec4_offset,
                                                         intr->dest.ssa.bit_size,
                                                         num_components);

      nir_ssa_def *channels[NIR_MAX_VEC_COMPONENTS];
      for (unsigned i = 0; i < intr->num_components; i++) {
         nir_ssa_def *chan_byte_offset =
            nir_iadd_imm(b, byte_offset, i * chan_size_bytes);

         nir_ssa_def *chan_vec4_offset = nir_ushr_imm(b, chan_byte_offset, 4);

         nir_ssa_def *component =
            nir_iand_imm(b,
                         nir_udiv_imm(b, chan_byte_offset, chan_size_bytes),
                         chans_per_vec4 - 1);

         channels[i] = nir_vector_extract(b,
                                          nir_bcsel(b,
                                                    nir_ieq(b,
                                                            chan_vec4_offset,
                                                            vec4_offset),
                                                    &load->dest.ssa,
                                                    &next_load->dest.ssa),
                                          component);
      }

      result = nir_vec(b, channels, intr->num_components);
   }

   return result;
}

bool
nir_lower_ubo_vec4(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        nir_lower_ubo_vec4_filter,
                                        nir_lower_ubo_vec4_lower,
                                        NULL);
}
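
/* Illustrative usage sketch (exact pass ordering varies by driver): run this
 * pass and then let NIR clean up and CSE the generated addressing math, e.g.
 *
 *    NIR_PASS_V(nir, nir_lower_ubo_vec4);
 *    NIR_PASS_V(nir, nir_opt_constant_folding);
 *    NIR_PASS_V(nir, nir_opt_cse);
 */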