//===-- Memset utils --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H

#include "src/string/memory_utils/utils.h"

#include <stddef.h> // size_t

namespace __llvm_libc {

// Sets `kBlockSize` bytes starting from `dst` to `value`.
template <size_t kBlockSize> static void SetBlock(char *dst, unsigned value) {
  // Theoretically the compiler is allowed to call memset here and end up
  // with a recursive call; in practice it doesn't happen. Nevertheless, this
  // should be replaced with `__builtin_memset_inline` once it's available in
  // clang.
  __builtin_memset(dst, value, kBlockSize);
}
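// A sketch of the `__builtin_memset_inline` replacement mentioned above
// (assumption: the builtin has landed in clang; its size argument must be a
// compile-time constant, which `kBlockSize` is):
//
//   __builtin_memset_inline(dst, value, kBlockSize);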
25 
// Sets `kBlockSize` bytes from `dst + count - kBlockSize` to `value`.
// Precondition: `count >= kBlockSize`.
template <size_t kBlockSize>
static void SetLastBlock(char *dst, unsigned value, size_t count) {
  SetBlock<kBlockSize>(dst + count - kBlockSize, value);
}
32 
// Sets `kBlockSize` bytes twice with an overlap between the two.
//
// [1234567812345678123]
// [__XXXXXXXXXXXXXX___]
// [__XXXXXXXX_________]
// [________XXXXXXXX___]
//
// Precondition: `count >= kBlockSize && count <= 2 * kBlockSize`.
template <size_t kBlockSize>
static void SetBlockOverlap(char *dst, unsigned value, size_t count) {
  SetBlock<kBlockSize>(dst, value);
  SetLastBlock<kBlockSize>(dst, value, count);
}
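// Illustrative usage (hypothetical helper, not part of this header's API):
// with `kBlockSize` = 8 and `count` = 12, the two stores cover [0, 8) and
// [4, 12); bytes [4, 8) are written twice with the same value, which is
// harmless and avoids a branch on `count`. `dst` must point to at least 12
// writable bytes.
inline static void ExampleSetBlockOverlap(char *dst) {
  SetBlockOverlap<8>(dst, 0xAB, 12); // Satisfies 8 <= 12 <= 2 * 8.
}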
46 
// Sets `count` bytes by blocks of `kBlockSize` bytes.
// The writes at the start and end of the buffer may be unaligned.
// The writes in the middle of the buffer are aligned to `kBlockSize`.
//
// e.g. with
// [12345678123456781234567812345678]
// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
// [__XXXXXXXX______________________]
// [________XXXXXXXX________________]
// [________________XXXXXXXX________]
// [_____________________XXXXXXXX___]
//
// Precondition: `count > 2 * kBlockSize` for efficiency.
//               `count >= kBlockSize` for correctness.
template <size_t kBlockSize>
static void SetAlignedBlocks(char *dst, unsigned value, size_t count) {
  SetBlock<kBlockSize>(dst, value); // Set first block

  // Set aligned blocks
  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
  for (; offset + kBlockSize < count; offset += kBlockSize)
    SetBlock<kBlockSize>(dst + offset, value);

  SetLastBlock<kBlockSize>(dst, value, count); // Set last block
}
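// Worked trace (illustrative, assuming `offset_from_last_aligned<kBlockSize>`
// returns `dst % kBlockSize`): with `kBlockSize` = 32 and `dst % 32 == 5`,
// the loop starts at offset 27, the first 32-byte aligned address past `dst`.
// The aligned stores at offsets 27, 59, 91, ... begin inside the region
// already covered by the unaligned first block, and SetLastBlock covers
// whatever unaligned tail the loop leaves behind.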
72 
// A general purpose implementation assuming cheap unaligned writes for sizes:
// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store
// 32 or 64 Bytes at a time; the compiler will expand such stores as needed.
//
// This implementation is subject to change as we benchmark more processors. We
// may also want to customize it for processors with specialized instructions
// that perform better (e.g. `rep stosb`).
//
// A note on the apparent discrepancy in the use of 32 vs 64 Byte writes.
// We want to balance two things here:
//  - The number of redundant writes (when using `SetBlockOverlap`),
//  - The number of conditionals for sizes <= 128 (~90% of memset calls are
//    for such sizes).
//
// For the range 64-128:
//  - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes;
//    this is wasteful near 65 but efficient toward 128.
//  - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and
//    write 96 or 128 Bytes.
//  - Another option would be a hybrid approach: Copy<64>+Overlap<32> for
//    65-96 and Copy<96>+Overlap<32> for 97-128.
//
// Benchmarks showed that redundant writes were cheap (for Intel x86) but
// conditionals were expensive, even on processors that do not support writing
// 64B at a time (pre-AVX512F). We also want to favor short functions that
// allow more hot code to fit in the L1 instruction cache.
//
// Above 128 we have to use conditionals since we don't know the upper bound in
// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was
// not superior for the sizes that mattered.
inline static void GeneralPurposeMemset(char *dst, unsigned char value,
                                        size_t count) {
  if (count == 0)
    return;
  if (count == 1)
    return SetBlock<1>(dst, value);
  if (count == 2)
    return SetBlock<2>(dst, value);
  if (count == 3)
    return SetBlock<3>(dst, value);
  if (count == 4)
    return SetBlock<4>(dst, value);
  if (count <= 8)
    return SetBlockOverlap<4>(dst, value, count);
  if (count <= 16)
    return SetBlockOverlap<8>(dst, value, count);
  if (count <= 32)
    return SetBlockOverlap<16>(dst, value, count);
  if (count <= 64)
    return SetBlockOverlap<32>(dst, value, count);
  if (count <= 128)
    return SetBlockOverlap<64>(dst, value, count);
  return SetAlignedBlocks<32>(dst, value, count);
}
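// Illustrative sketch (the name `example_memset` is hypothetical): a public
// `memset`-shaped entry point would funnel into GeneralPurposeMemset along
// these lines, returning `dst` as the C standard requires.
inline static void *example_memset(void *dst, int value, size_t count) {
  GeneralPurposeMemset(reinterpret_cast<char *>(dst),
                       static_cast<unsigned char>(value), count);
  return dst;
}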

} // namespace __llvm_libc

#endif // LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
132