1 #include <stdbool.h>
2 #include <stddef.h>
3 #include <stdint.h>
4 
5 #include "blake3_impl.h"
6 
7 #if defined(IS_X86)
8 #if defined(_MSC_VER)
9 #include <intrin.h>
10 #elif defined(__GNUC__)
11 #include <immintrin.h>
12 #else
13 #undef IS_X86 /* Unimplemented! */
14 #endif
15 #endif
16 
17 #define MAYBE_UNUSED(x) (void)((x))
18 
19 #if defined(IS_X86)
/*
 * Read extended control register 0 (XCR0) with the XGETBV instruction.
 *
 * Returns the 64-bit register value (EDX:EAX). The caller must already have
 * confirmed that the OS enabled XSAVE (CPUID.1:ECX.OSXSAVE); this function
 * performs no such check itself.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  /* ECX = 0 selects XCR0; the result comes back split across EDX:EAX. */
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t high_half = (uint64_t)hi << 32;
  return high_half | lo;
#endif
}
29 
/*
 * Execute CPUID for leaf `id`.
 *
 * On return out[0..3] hold EAX, EBX, ECX and EDX respectively. The sub-leaf
 * register (ECX) is not set by this helper; use cpuidex() for leaves that
 * need one.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t a = 0, b = 0, c = 0, d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* The 32-bit variant does not name EBX as an output: it is saved into a
     scratch register and swapped back after CPUID, leaving the EBX result
     in that scratch register. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
45 
/*
 * Execute CPUID for leaf `id`, sub-leaf `sid`.
 *
 * On return out[0..3] hold EAX, EBX, ECX and EDX respectively.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t a = 0, b = 0, c = 0, d = 0;
#if defined(__i386__) || defined(_M_IX86)
  /* The 32-bit variant does not name EBX as an output: it is saved into a
     scratch register and swapped back after CPUID, leaving the EBX result
     in that scratch register. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
61 
62 #endif
63 
/*
 * Bit flags describing the SIMD instruction sets detected at runtime.
 * Values are OR-ed together into a single feature mask.
 */
enum cpu_feature {
  SSE2 = 1 << 0,     /* bit 0 */
  SSSE3 = 1 << 1,    /* bit 1 */
  SSE41 = 1 << 2,    /* bit 2 */
  AVX = 1 << 3,      /* bit 3 */
  AVX2 = 1 << 4,     /* bit 4 */
  AVX512F = 1 << 5,  /* bit 5 */
  AVX512VL = 1 << 6, /* bit 6 */
  /* ... */
  UNDEFINED = 1 << 30 /* sentinel: detection has not run yet */
};
75 
76 #if !defined(BLAKE3_TESTING)
77 static /* Allow the variable to be controlled manually for testing */
78 #endif
79     enum cpu_feature g_cpu_features = UNDEFINED;
80 
81 #if !defined(BLAKE3_TESTING)
82 static
83 #endif
84     enum cpu_feature
get_cpu_features(void)85     get_cpu_features(void) {
86 
87   if (g_cpu_features != UNDEFINED) {
88     return g_cpu_features;
89   } else {
90 #if defined(IS_X86)
91     uint32_t regs[4] = {0};
92     uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
93     (void)edx;
94     enum cpu_feature features = 0;
95     cpuid(regs, 0);
96     const int max_id = *eax;
97     cpuid(regs, 1);
98 #if defined(__amd64__) || defined(_M_X64)
99     features |= SSE2;
100 #else
101     if (*edx & (1UL << 26))
102       features |= SSE2;
103 #endif
104     if (*ecx & (1UL << 0))
105       features |= SSSE3;
106     if (*ecx & (1UL << 19))
107       features |= SSE41;
108 
109     if (*ecx & (1UL << 27)) { // OSXSAVE
110       const uint64_t mask = xgetbv();
111       if ((mask & 6) == 6) { // SSE and AVX states
112         if (*ecx & (1UL << 28))
113           features |= AVX;
114         if (max_id >= 7) {
115           cpuidex(regs, 7, 0);
116           if (*ebx & (1UL << 5))
117             features |= AVX2;
118           if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
119             if (*ebx & (1UL << 31))
120               features |= AVX512VL;
121             if (*ebx & (1UL << 16))
122               features |= AVX512F;
123           }
124         }
125       }
126     }
127     g_cpu_features = features;
128     return features;
129 #else
130     /* How to detect NEON? */
131     return 0;
132 #endif
133   }
134 }
135 
/*
 * Run the in-place compression function using the first implementation
 * that is both compiled in and reported by get_cpu_features().
 *
 * On x86 the candidates are tried in the order AVX-512 (keyed on the
 * AVX512VL flag), SSE4.1, SSE2; each can be compiled out with the matching
 * BLAKE3_NO_* macro. If none applies, the portable implementation is used.
 * All arguments are forwarded unchanged to the selected implementation.
 */
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
#endif /* IS_X86 */

  /* No SIMD implementation selected. */
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
164 
/*
 * Run the extended-output compression function, writing to `out`, using the
 * first implementation that is both compiled in and reported by
 * get_cpu_features().
 *
 * On x86 the candidates are tried in the order AVX-512 (keyed on the
 * AVX512VL flag), SSE4.1, SSE2; each can be compiled out with the matching
 * BLAKE3_NO_* macro. If none applies, the portable implementation is used.
 * All arguments are forwarded unchanged to the selected implementation.
 */
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & AVX512VL) != 0) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#endif /* IS_X86 */

  /* No SIMD implementation selected. */
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
193 
/*
 * Hash several inputs using the first implementation that is both compiled
 * in and reported by get_cpu_features().
 *
 * On x86 the candidates are tried in the order AVX-512 (requires both the
 * AVX512F and AVX512VL flags), AVX2, SSE4.1, SSE2; each can be compiled out
 * with the matching BLAKE3_NO_* macro. When no x86 path is taken, the NEON
 * implementation is used if BLAKE3_USE_NEON == 1, otherwise the portable
 * one. All arguments are forwarded unchanged to the selected implementation.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);

#if !defined(BLAKE3_NO_AVX512)
  /* Both AVX-512 flags must be present for this path. */
  if ((cpu & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
245 
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 when both AVX512F and AVX512VL are detected, 8 for AVX2, and 4
// for SSE4.1 or SSE2 (each x86 tier can be compiled out with its BLAKE3_NO_*
// macro). When no x86 tier applies, returns 4 if BLAKE3_USE_NEON == 1 and 1
// otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);

#if !defined(BLAKE3_NO_AVX512)
  if ((cpu & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    return 16;
  }
#endif

#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif

#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif

#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif
#endif /* IS_X86 */

#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
277