//===-- Implementation of memcpy ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/string/memcpy.h"
#include "src/__support/common.h"
#include "src/string/memory_utils/memcpy_utils.h"

namespace __llvm_libc {

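// Copies `count` bytes forward using the x86 `rep movsb` string instruction.
// On processors implementing Enhanced REP MOVSB (ERMSB), this can rival
// vectorized copy loops for large sizes while keeping code size minimal.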
static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
                         size_t count) {
  // FIXME: Add MSVC support with
  // #include <intrin.h>
  // __movsb(reinterpret_cast<unsigned char *>(dst),
  //         reinterpret_cast<const unsigned char *>(src), count);
  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
}

#if defined(__AVX__)
#define BEST_SIZE 64
#else
#define BEST_SIZE 32
#endif
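// BEST_SIZE is the block width handed to CopyAlignedBlocks below. The values
// here are tuning parameters rather than hard requirements: 64 bytes when AVX
// is available (two 32-byte ymm moves per block, matching a typical x86 cache
// line), 32 bytes otherwise.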

// Design rationale
// ================
//
// Using a profiler to observe size distributions for calls into libc
// functions, it was found that most operations act on a small number of
// bytes. This makes it important to favor small sizes.
//
// The tests for `count` are in ascending order so the cost of branching is
// proportional to the cost of copying.
//
// The function is written in C++ for several reasons:
// - The compiler can __see__ the code, which is useful when performing
//   Profile Guided Optimization, as the optimized code can take advantage of
//   branching probabilities.
// - It also allows for easier customization and makes it easier to test
//   multiple implementation parameters.
// - As compilers and processors get better, the generated code improves
//   with little change on the source side.
static void memcpy_x86(char *__restrict dst, const char *__restrict src,
                       size_t count) {
  if (count == 0)
    return;
  if (count == 1)
    return CopyBlock<1>(dst, src);
  if (count == 2)
    return CopyBlock<2>(dst, src);
  if (count == 3)
    return CopyBlock<3>(dst, src);
  if (count == 4)
    return CopyBlock<4>(dst, src);
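  // CopyBlockOverlap<N> handles N <= count < 2N by copying the first N bytes
  // and the last N bytes of the range; the two blocks may overlap in the
  // middle, which is harmless for memcpy's non-overlapping semantics and
  // avoids a scalar tail loop.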
  if (count < 8)
    return CopyBlockOverlap<4>(dst, src, count);
  if (count < 16)
    return CopyBlockOverlap<8>(dst, src, count);
  if (count < 32)
    return CopyBlockOverlap<16>(dst, src, count);
  if (count < 64)
    return CopyBlockOverlap<32>(dst, src, count);
  if (count < 128)
    return CopyBlockOverlap<64>(dst, src, count);
#if defined(__AVX__)
  if (count < 256)
    return CopyBlockOverlap<128>(dst, src, count);
#endif
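  // From here on the copy is large. CopyAlignedBlocks<N> roughly proceeds by
  // copying a first possibly-unaligned block, then looping over N-byte blocks
  // aligned on the destination, and finishing with a last overlapping block.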
  // kRepMovsBSize == -1 : Only CopyAligned is used.
  // kRepMovsBSize ==  0 : Only RepMovsb is used.
  // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
  // Note that -1 converted to size_t is SIZE_MAX, so with the current value
  // the test below is always true and RepMovsb is never reached.
  constexpr size_t kRepMovsBSize = -1;
  if (count <= kRepMovsBSize)
    return CopyAlignedBlocks<BEST_SIZE>(dst, src, count);
  return CopyRepMovsb(dst, src, count);
}

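// LLVM_LIBC_ENTRYPOINT marks the function as a public libc symbol (the macro
// is supplied via src/__support/common.h, included above), exporting `memcpy`
// with the standard C signature while the dispatch logic stays internal.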
void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
                                   const void *__restrict src, size_t size) {
  memcpy_x86(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src),
             size);
  return dst;
}

} // namespace __llvm_libc