#include <stdlib.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

using namespace std;

// Benchmark sweep parameters. All sizes are in bytes.

// Smallest buffer size measured by the sweep.
const size_t size_start = 64;
// Largest buffer size measured by the sweep (16 MiB); main() also allocates
// its source and destination buffers with this many bytes.
const size_t size_end = 16 * (1ull << 20);
// Number of logarithmically spaced steps between size_start and size_end.
const size_t samples = 2048;
// Amount of data processed per measured size (64 MiB): main() runs
// size_per_test / current_size iterations for each size.
size_t size_per_test = 64 * (1ull << 20);
// Running total of the values returned by sum() in the --sum benchmark.
// NOTE(review): presumably kept as a global sink so the summation cannot be
// optimised away -- it is never printed in the code visible here.
size_t tot_sum = 0;

// Benchmark kernels. Only the declarations are visible here; the definitions
// live elsewhere. All three are marked noinline so that each call in main()'s
// timing loops remains a real function call.

// Copies `size` bytes from `src` to `dst` (argument order: destination first).
void __attribute__((noinline)) memcpy_noinline(void *dst, void *src, size_t size);
// Fills `size` bytes at `dst` using `value`.
// NOTE(review): presumably forwards to memset, in which case only the low
// byte of `value` is stored -- confirm against the definition.
void __attribute__((noinline)) memset_noinline(void *dst, int value, size_t size);
// Reads `size` bytes starting at `src` and returns a 64-bit result that main()
// accumulates. The pointer is volatile-qualified.
uint64_t __attribute__((noinline)) sum(volatile void *src, size_t size);

// Selects which kernel main() measures; chosen from the first command-line
// argument (--memcpy, --memset or --sum).
enum BenchType {
    MemcpyBench,  // time memcpy_noinline(dst, src, size)
    MemsetBench,  // time memset_noinline(dst, value, size)
    SumBench,     // time sum(src, size)
};

main(int argc,char * argv[])29 int main(int argc, char *argv[])
30 {
31     BenchType type;
32     if (argc <= 1) {
33         cerr << "memcpy_perf [--memcpy|--memset|--sum]" << endl;
34         return 0;
35     }
36     if (string(argv[1]) == string("--memcpy")) {
37         type = MemcpyBench;
38     } else if (string(argv[1]) == string("--memset")) {
39         type = MemsetBench;
40     } else if (string(argv[1]) == string("--sum")) {
41         type = SumBench;
42     } else {
43         type = MemcpyBench;
44     }
45 
46     unique_ptr<uint8_t[]> src(new uint8_t[size_end]);
47     unique_ptr<uint8_t[]> dst(new uint8_t[size_end]);
48     memset(src.get(), 1, size_end);
49 
50     double start_pow = log10(size_start);
51     double end_pow = log10(size_end);
52     double pow_inc = (end_pow - start_pow) / samples;
53 
54     //cout << "src: " << (uintptr_t)src.get() << endl;
55     //cout << "dst: " <<  (uintptr_t)dst.get() << endl;
56 
57     for (double cur_pow = start_pow; cur_pow <= end_pow; cur_pow += pow_inc) {
58         chrono::time_point<chrono::high_resolution_clock> copy_start, copy_end;
59 
60         size_t cur_size = (size_t)pow(10.0, cur_pow);
61         size_t iter_per_size = size_per_test / cur_size;
62 
63         // run benchmark
64         switch (type) {
65             case MemsetBench: {
66                 memcpy_noinline(src.get(), dst.get(), cur_size);
67                 memset_noinline(dst.get(), 0xdeadbeef, cur_size);
68                 copy_start = chrono::high_resolution_clock::now();
69                 for (int i = 0; i < iter_per_size; i++) {
70                     memset_noinline(dst.get(), 0xdeadbeef, cur_size);
71                 }
72                 copy_end = chrono::high_resolution_clock::now();
73                 break;
74             }
75             case MemcpyBench: {
76                 memcpy_noinline(dst.get(), src.get(), cur_size);
77                 memcpy_noinline(src.get(), dst.get(), cur_size);
78                 copy_start = chrono::high_resolution_clock::now();
79                 for (int i = 0; i < iter_per_size; i++) {
80                     memcpy_noinline(dst.get(), src.get(), cur_size);
81                 }
82                 copy_end = chrono::high_resolution_clock::now();
83                 break;
84             }
85             case SumBench: {
86                 uint64_t s = 0;
87                 s += sum(src.get(), cur_size);
88                 copy_start = chrono::high_resolution_clock::now();
89                 for (int i = 0; i < iter_per_size; i++) {
90                     s += sum(src.get(), cur_size);
91                 }
92                 copy_end = chrono::high_resolution_clock::now();
93                 tot_sum += s;
94                 break;
95             }
96         }
97 
98         double ns_per_copy = chrono::duration_cast<chrono::nanoseconds>(copy_end - copy_start).count() / double(iter_per_size);
99         double gb_per_sec = ((double)cur_size / (1ull<<30)) / (ns_per_copy / 1.0E9);
100         if (type == MemcpyBench)
101             gb_per_sec *= 2.0;
102         cout << "size: " << cur_size << ", perf: " << gb_per_sec << "GB/s, iter: " << iter_per_size << endl;
103     }
104     return 0;
105 }
106