//===-- Memset utils --------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H #include "src/string/memory_utils/utils.h" #include // size_t namespace __llvm_libc { // Sets `kBlockSize` bytes starting from `src` to `value`. template static void SetBlock(char *dst, unsigned value) { // Theoretically the compiler is allowed to call memset here and end up with a // recursive call, practically it doesn't happen, however this should be // replaced with a __builtin_memset_inline once it's available in clang. __builtin_memset(dst, value, kBlockSize); } // Sets `kBlockSize` bytes from `src + count - kBlockSize` to `value`. // Precondition: `count >= kBlockSize`. template static void SetLastBlock(char *dst, unsigned value, size_t count) { SetBlock(dst + count - kBlockSize, value); } // Sets `kBlockSize` bytes twice with an overlap between the two. // // [1234567812345678123] // [__XXXXXXXXXXXXXX___] // [__XXXXXXXX_________] // [________XXXXXXXX___] // // Precondition: `count >= kBlockSize && count <= kBlockSize`. template static void SetBlockOverlap(char *dst, unsigned value, size_t count) { SetBlock(dst, value); SetLastBlock(dst, value, count); } // Sets `count` bytes by blocks of `kBlockSize` bytes. // Sets at the start and end of the buffer are unaligned. // Sets in the middle of the buffer are aligned to `kBlockSize`. // // e.g. with // [12345678123456781234567812345678] // [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___] // [__XXXXXXXX______________________] // [________XXXXXXXX________________] // [________________XXXXXXXX________] // [_____________________XXXXXXXX___] // // Precondition: `count > 2 * kBlockSize` for efficiency. // `count >= kBlockSize` for correctness. template static void SetAlignedBlocks(char *dst, unsigned value, size_t count) { SetBlock(dst, value); // Set first block // Set aligned blocks size_t offset = kBlockSize - offset_from_last_aligned(dst); for (; offset + kBlockSize < count; offset += kBlockSize) SetBlock(dst + offset, value); SetLastBlock(dst, value, count); // Set last block } // A general purpose implementation assuming cheap unaligned writes for sizes: // 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32 // or 64 Bytes at a time, the compiler will expand them as needed. // // This implementation is subject to change as we benchmark more processors. We // may also want to customize it for processors with specialized instructions // that performs better (e.g. `rep stosb`). // // A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes. // We want to balance two things here: // - The number of redundant writes (when using `SetBlockOverlap`), // - The number of conditionals for sizes <=128 (~90% of memset calls are for // such sizes). // // For the range 64-128: // - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this // is wasteful near 65 but efficient toward 128. // - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write // 96 or 128 Bytes. // - Another approach could be to use an hybrid approach Copy<64>+Overlap<32> // for 65-96 and Copy<96>+Overlap<32> for 97-128 // // Benchmarks showed that redundant writes were cheap (for Intel X86) but // conditional were expensive, even on processor that do not support writing 64B // at a time (pre-AVX512F). We also want to favor short functions that allow // more hot code to fit in the iL1 cache. // // Above 128 we have to use conditionals since we don't know the upper bound in // advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32> // may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not // superior for sizes that mattered. inline static void GeneralPurposeMemset(char *dst, unsigned char value, size_t count) { if (count == 0) return; if (count == 1) return SetBlock<1>(dst, value); if (count == 2) return SetBlock<2>(dst, value); if (count == 3) return SetBlock<3>(dst, value); if (count == 4) return SetBlock<4>(dst, value); if (count <= 8) return SetBlockOverlap<4>(dst, value, count); if (count <= 16) return SetBlockOverlap<8>(dst, value, count); if (count <= 32) return SetBlockOverlap<16>(dst, value, count); if (count <= 64) return SetBlockOverlap<32>(dst, value, count); if (count <= 128) return SetBlockOverlap<64>(dst, value, count); return SetAlignedBlocks<32>(dst, value, count); } } // namespace __llvm_libc #endif // LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H