/*
 * Copyright (C) 2019 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

// A Neon vectorized implementation of the GNU symbol hash function.

// This function generally accesses beyond the bounds of the name string. Specifically, it reads
// each aligned 8-byte chunk containing a byte of the string, including the final NUL byte. This
// should be acceptable for use with MTE, which uses 16-byte granules. Typically, the function is
// used to hash strings in an ELF file's string table, where MTE is presumably unaware of the
// bounds of each symbol, but the linker also hashes the symbol name passed to dlsym.

#include "linker_gnu_hash_neon.h"

#include <arm_neon.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct __attribute__((aligned(8))) GnuHashInitEntry {
  uint64_t ignore_mask;
  uint32_t accum;
};

constexpr uint32_t kStep0 = 1;
constexpr uint32_t kStep1 = kStep0 * 33;
constexpr uint32_t kStep2 = kStep1 * 33;
constexpr uint32_t kStep3 = kStep2 * 33;
constexpr uint32_t kStep4 = kStep3 * 33;
constexpr uint32_t kStep5 = kStep4 * 33;
constexpr uint32_t kStep6 = kStep5 * 33;
constexpr uint32_t kStep7 = kStep6 * 33;
constexpr uint32_t kStep8 = kStep7 * 33;
constexpr uint32_t kStep9 = kStep8 * 33;
constexpr uint32_t kStep10 = kStep9 * 33;
constexpr uint32_t kStep11 = kStep10 * 33;

// Step by -1 through -7: 33 * 0x3e0f83e1 == 1 (mod 2**32)
constexpr uint32_t kStepN1 = kStep0 * 0x3e0f83e1;
constexpr uint32_t kStepN2 = kStepN1 * 0x3e0f83e1;
constexpr uint32_t kStepN3 = kStepN2 * 0x3e0f83e1;
constexpr uint32_t kStepN4 = kStepN3 * 0x3e0f83e1;
constexpr uint32_t kStepN5 = kStepN4 * 0x3e0f83e1;
constexpr uint32_t kStepN6 = kStepN5 * 0x3e0f83e1;
constexpr uint32_t kStepN7 = kStepN6 * 0x3e0f83e1;
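
// Illustrative sanity checks (a sketch, not required by the hash routine): 33 * 0x3e0f83e1 wraps
// to 1 mod 2**32, so each kStepN constant is the modular inverse of the matching kStep constant.
static_assert(kStep1 * kStepN1 == 1u, "0x3e0f83e1 must be the inverse of 33 mod 2**32");
static_assert(kStep7 * kStepN7 == 1u, "each negative step must undo the matching positive step");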

// Calculate the GNU hash and string length of the symbol name.
//
// The hash calculation is an optimized version of this function:
//
//  uint32_t calculate_gnu_hash(const uint8_t* name) {
//    uint32_t h = 5381;
//    for (; *name != '\0'; ++name) {
//      h *= 33;
//      h += *name;
//    }
//    return h;
//  }
//
std::pair<uint32_t, uint32_t> calculate_gnu_hash_neon(const char* name) {

  // The input string may be misaligned by 0-7 bytes (K). This function loads the first aligned
  // 8-byte chunk, then counteracts the misalignment:
  //  - The initial K bytes are set to 0xff in the working chunk vector.
  //  - The accumulator is initialized to 5381 * modinv(33)**K.
  //  - The accumulator also cancels out each initial 0xff byte.
  // If we could set bytes to NUL instead, then the accumulator wouldn't need to cancel out the
  // 0xff values, but this would break the NUL check.

  static const struct GnuHashInitEntry kInitTable[] = {
    { // (addr&7) == 0
      0ull,
      5381u*kStep0,
    }, { // (addr&7) == 1
      0xffull,
      5381u*kStepN1 - 0xffu*kStepN1,
    }, { // (addr&7) == 2
      0xffffull,
      5381u*kStepN2 - 0xffu*kStepN1 - 0xffu*kStepN2,
    }, { // (addr&7) == 3
      0xffffffull,
      5381u*kStepN3 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3,
    }, { // (addr&7) == 4
      0xffffffffull,
      5381u*kStepN4 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4,
    }, { // (addr&7) == 5
      0xffffffffffull,
      5381u*kStepN5 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5,
    }, { // (addr&7) == 6
      0xffffffffffffull,
      5381u*kStepN6 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5 - 0xffu*kStepN6,
    }, { // (addr&7) == 7
      0xffffffffffffffull,
      5381u*kStepN7 - 0xffu*kStepN1 - 0xffu*kStepN2 - 0xffu*kStepN3 - 0xffu*kStepN4 - 0xffu*kStepN5 - 0xffu*kStepN6 - 0xffu*kStepN7,
    },
  };

  uint8_t offset = reinterpret_cast<uintptr_t>(name) & 7;
  const uint64_t* chunk_ptr = reinterpret_cast<const uint64_t*>(reinterpret_cast<uintptr_t>(name) & ~7);
  const struct GnuHashInitEntry* entry = &kInitTable[offset];

  uint8x8_t chunk = vld1_u8(reinterpret_cast<const uint8_t*>(chunk_ptr));
  chunk |= vld1_u8(reinterpret_cast<const uint8_t*>(&entry->ignore_mask));

  uint32x4_t accum_lo = { 0 };
  uint32x4_t accum_hi = { entry->accum, 0, 0, 0 };
  const uint16x4_t kInclineVec = { kStep3, kStep2, kStep1, kStep0 };
  const uint32x4_t kStep8Vec = vdupq_n_u32(kStep8);
  uint8x8_t is_nul;
  uint16x8_t expand;

  while (1) {
    // Exit the loop if any of the 8 bytes is NUL.
    is_nul = vceq_u8(chunk, (uint8x8_t){ 0 });
    expand = vmovl_u8(chunk);
    uint64x1_t is_nul_64 = vreinterpret_u64_u8(is_nul);
    if (vget_lane_u64(is_nul_64, 0)) break;

    // Multiply both accumulators by 33**8.
    accum_lo = vmulq_u32(accum_lo, kStep8Vec);
    accum_hi = vmulq_u32(accum_hi, kStep8Vec);

    // Multiply each 4-piece subchunk by (33**3, 33**2, 33**1, 1), then accumulate the result. The
    // lo accumulator will be behind by 33**4 until the very end of the computation.
    accum_lo = vmlal_u16(accum_lo, vget_low_u16(expand), kInclineVec);
    accum_hi = vmlal_u16(accum_hi, vget_high_u16(expand), kInclineVec);

    // Load the next chunk.
    chunk = vld1_u8(reinterpret_cast<const uint8_t*>(++chunk_ptr));
  }
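
  // At this point chunk holds the final aligned 8-byte group, expand is its zero-extension to
  // 16-bit lanes, and is_nul has 0xff in each lane whose byte was NUL. As an illustrative case,
  // if the final group starts with 'a', 'b', '\0', then is_nul begins 00 00 ff, the byte reversal
  // below moves that prefix to the top of the 64-bit value, and clz reports 16 bits, i.e. 2
  // remaining bytes.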

  // Reverse the is-NUL vector so we can use clz to count the number of remaining bytes.
  is_nul = vrev64_u8(is_nul);
  const uint64_t is_nul_u64 = vget_lane_u64(vreinterpret_u64_u8(is_nul), 0);
  const uint32_t num_valid_bits = __builtin_clzll(is_nul_u64);

  const uint32_t name_len = reinterpret_cast<const char*>(chunk_ptr) - name + (num_valid_bits >> 3);

  static const uint32_t kFinalStepTable[] = {
    kStep4, kStep0,   // 0 remaining bytes
    kStep5, kStep1,   // 1 remaining byte
    kStep6, kStep2,   // 2 remaining bytes
    kStep7, kStep3,   // 3 remaining bytes
    kStep8, kStep4,   // 4 remaining bytes
    kStep9, kStep5,   // 5 remaining bytes
    kStep10, kStep6,  // 6 remaining bytes
    kStep11, kStep7,  // 7 remaining bytes
  };

  // Advance the lo/hi accumulators appropriately for the number of remaining bytes. Multiply
  // 33**4 into the lo accumulator to catch it up with the hi accumulator.
  const uint32_t* final_step = &kFinalStepTable[num_valid_bits >> 2];
  accum_lo = vmulq_u32(accum_lo, vdupq_n_u32(final_step[0]));
  accum_lo = vmlaq_u32(accum_lo, accum_hi, vdupq_n_u32(final_step[1]));

  static const uint32_t kFinalInclineTable[] = {
    0, kStep6, kStep5, kStep4, kStep3, kStep2, kStep1, kStep0,
    0, 0,      0,      0,      0,      0,      0,      0,
  };

  // Prepare a vector to multiply powers of 33 into each of the remaining bytes.
  const uint32_t* const incline = &kFinalInclineTable[8 - (num_valid_bits >> 3)];
  const uint32x4_t incline_lo = vld1q_u32(incline);
  const uint32x4_t incline_hi = vld1q_u32(incline + 4);

  // Multiply powers of 33 into each of the remaining 4-piece vectors, then accumulate everything
  // into accum_lo. Combine everything into a single 32-bit result.
  accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_low_u16(expand)), incline_lo);
  accum_lo = vmlaq_u32(accum_lo, vmovl_u16(vget_high_u16(expand)), incline_hi);

  uint32x2_t sum = vadd_u32(vget_low_u32(accum_lo), vget_high_u32(accum_lo));
  const uint32_t hash = sum[0] + sum[1];

  return { hash, name_len };
}
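
// Illustrative cross-check (a sketch, not part of the linker's interface): the helper below is a
// hypothetical scalar re-implementation of the reference algorithm shown above
// calculate_gnu_hash_neon, returning the same (hash, length) pair so the two can be compared in a
// test, e.g. calculate_gnu_hash_neon("printf") == calculate_gnu_hash_scalar_sketch("printf").
static inline std::pair<uint32_t, uint32_t> calculate_gnu_hash_scalar_sketch(const char* name) {
  uint32_t h = 5381;
  const uint8_t* p = reinterpret_cast<const uint8_t*>(name);
  for (; *p != '\0'; ++p) {
    // h = h * 33 + *p, exactly as in the scalar reference above.
    h *= 33;
    h += *p;
  }
  return { h, static_cast<uint32_t>(reinterpret_cast<const char*>(p) - name) };
}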