#pragma once
#ifndef FXDIV_H
#define FXDIV_H

#if defined(__cplusplus) && (__cplusplus >= 201103L)
	#include <cstddef>
	#include <cstdint>
	#include <climits>
#elif !defined(__OPENCL_VERSION__)
	#include <stddef.h>
	#include <stdint.h>
	#include <limits.h>
#endif

#if defined(_MSC_VER)
	#include <intrin.h>
	#if defined(_M_IX86) || defined(_M_X64)
		#include <immintrin.h>
	#endif
#endif

#ifndef FXDIV_USE_INLINE_ASSEMBLY
	#define FXDIV_USE_INLINE_ASSEMBLY 0
#endif

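/*
 * FXdiv: division by a runtime-constant unsigned divisor via a precomputed
 * "magic" multiplier. fxdiv_init_* analyzes the divisor once; the quotient,
 * remainder, divide, and round-down helpers below then replace each division
 * with a high multiply followed by two shifts.
 *
 * A minimal usage sketch (assuming a nonzero divisor):
 *
 *   const struct fxdiv_divisor_uint32_t seven = fxdiv_init_uint32_t(7);
 *   const uint32_t q = fxdiv_quotient_uint32_t(100, seven);   // 14
 *   const uint32_t r = fxdiv_remainder_uint32_t(100, seven);  //  2
 */

/* Full 64-bit product of two 32-bit unsigned integers. */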
static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
#if defined(_MSC_VER) && defined(_M_IX86)
	return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
#else
	return (uint64_t) a * (uint64_t) b;
#endif
}

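/* High 32 bits of the full 64-bit product of two 32-bit unsigned integers. */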
static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
#if defined(__OPENCL_VERSION__)
	return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
	return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
#elif defined(_MSC_VER) && defined(_M_IX86)
	return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
#elif defined(_MSC_VER) && defined(_M_ARM)
	return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
#else
	return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
#endif
}

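/*
 * High 64 bits of the full 128-bit product of two 64-bit unsigned integers.
 * When no native 64x64->128 multiply is available, the result is assembled
 * from 32-bit partial products via fxdiv_mulext_uint32_t and fxdiv_mulhi_uint32_t.
 */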
static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
#if defined(__OPENCL_VERSION__)
	return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
	return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
#elif defined(_MSC_VER) && defined(_M_X64)
	return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
#elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
	return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
#else
	const uint32_t a_lo = (uint32_t) a;
	const uint32_t a_hi = (uint32_t) (a >> 32);
	const uint32_t b_lo = (uint32_t) b;
	const uint32_t b_hi = (uint32_t) (b >> 32);

	const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
		(uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
	return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
		((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
#endif
}

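/* High half of the full product of two size_t values (32- or 64-bit, per SIZE_MAX). */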
static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
#if SIZE_MAX == UINT32_MAX
	return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
#elif SIZE_MAX == UINT64_MAX
	return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
#else
	#error Unsupported platform
#endif
}

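/*
 * Precomputed parameters for division by a fixed 32-bit divisor:
 *   value - the original divisor d
 *   m     - magic multiplier
 *   s1    - first shift amount (0 when d == 1, 1 otherwise)
 *   s2    - second shift amount (floor(log2(d - 1)) when d > 1)
 * The quotient of n / d is then (t + ((n - t) >> s1)) >> s2 with
 * t = mulhi(n, m); the uint64_t and size_t descriptors below mirror this layout.
 */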
struct fxdiv_divisor_uint32_t {
	uint32_t value;
	uint32_t m;
	uint8_t s1;
	uint8_t s2;
};

struct fxdiv_result_uint32_t {
	uint32_t quotient;
	uint32_t remainder;
};

struct fxdiv_divisor_uint64_t {
	uint64_t value;
	uint64_t m;
	uint8_t s1;
	uint8_t s2;
};

struct fxdiv_result_uint64_t {
	uint64_t quotient;
	uint64_t remainder;
};

struct fxdiv_divisor_size_t {
	size_t value;
	size_t m;
	uint8_t s1;
	uint8_t s2;
};

struct fxdiv_result_size_t {
	size_t quotient;
	size_t remainder;
};

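/*
 * Precomputes the magic multiplier and shifts for division by d.
 * For d > 1, l_minus_1 = floor(log2(d - 1)) is found with the best available
 * bit-scan primitive, and m is one plus the 32-bit quotient of
 * ((2^(l_minus_1 + 1) - d) * 2^32) / d. The divisor is assumed to be nonzero:
 * d == 0 would reach the 64-by-32-bit division below with a zero divisor.
 */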
static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
	struct fxdiv_divisor_uint32_t result = { d };
	if (d == 1) {
		result.m = UINT32_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		#if defined(__OPENCL_VERSION__)
			const uint32_t l_minus_1 = 31 - clz(d - 1);
		#elif defined(__CUDA_ARCH__)
			const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
		#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
			unsigned long l_minus_1;
			_BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
		#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
			uint32_t l_minus_1;
			__asm__("BSRL %[d_minus_1], %[l_minus_1]"
				: [l_minus_1] "=r" (l_minus_1)
				: [d_minus_1] "r" (d - 1)
				: "cc");
		#elif defined(__GNUC__)
			const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
		#else
			/* Based on Algorithm 2 from Hacker's Delight */

			uint32_t l_minus_1 = 0;
			uint32_t x = d - 1;
			uint32_t y = x >> 16;
			if (y != 0) {
				l_minus_1 += 16;
				x = y;
			}
			y = x >> 8;
			if (y != 0) {
				l_minus_1 += 8;
				x = y;
			}
			y = x >> 4;
			if (y != 0) {
				l_minus_1 += 4;
				x = y;
			}
			y = x >> 2;
			if (y != 0) {
				l_minus_1 += 2;
				x = y;
			}
			if ((x & 2) != 0) {
				l_minus_1 += 1;
			}
		#endif
		uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;

		/* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
		#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
			uint32_t q;
			__asm__("DIVL %[d]"
				: "=a" (q), "+d" (u_hi)
				: [d] "r" (d), "a" (0)
				: "cc");
		#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64))
			unsigned int remainder;
			const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, &remainder);
		#else
			const uint32_t q = ((uint64_t) u_hi << 32) / d;
		#endif

		result.m = q + UINT32_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}

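/*
 * 64-bit analogue of fxdiv_init_uint32_t: precomputes the magic multiplier
 * and shifts for division by d. The portable fallback normalizes d by its
 * leading-zero count (nlz_d) and performs a schoolbook 128-by-64-bit
 * division, producing two 32-bit quotient digits (Hacker's Delight style).
 */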
static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
	struct fxdiv_divisor_uint64_t result = { d };
	if (d == 1) {
		result.m = UINT64_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		#if defined(__OPENCL_VERSION__)
			const uint32_t nlz_d = clz(d);
			const uint32_t l_minus_1 = 63 - clz(d - 1);
		#elif defined(__CUDA_ARCH__)
			const uint32_t nlz_d = __clzll((long long) d);
			const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
		#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
			unsigned long l_minus_1;
			_BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
			unsigned long bsr_d;
			_BitScanReverse64(&bsr_d, (unsigned __int64) d);
			const uint32_t nlz_d = bsr_d ^ 0x3F;
		#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM))
			const uint64_t d_minus_1 = d - 1;
			const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
			unsigned long l_minus_1;
			if ((uint32_t) (d_minus_1 >> 32) == 0) {
				_BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
			} else {
				_BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
				l_minus_1 += 32;
			}
			const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
		#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
			uint64_t l_minus_1;
			__asm__("BSRQ %[d_minus_1], %[l_minus_1]"
				: [l_minus_1] "=r" (l_minus_1)
				: [d_minus_1] "r" (d - 1)
				: "cc");
		#elif defined(__GNUC__)
			const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
			const uint32_t nlz_d = __builtin_clzll(d);
		#else
			/* Based on Algorithm 2 from Hacker's Delight */
			const uint64_t d_minus_1 = d - 1;
			const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
			uint32_t l_minus_1 = 0;
			uint32_t x = (uint32_t) d_minus_1;
			uint32_t y = d_minus_1 >> 32;
			if (y != 0) {
				l_minus_1 += 32;
				x = y;
			}
			y = x >> 16;
			if (y != 0) {
				l_minus_1 += 16;
				x = y;
			}
			y = x >> 8;
			if (y != 0) {
				l_minus_1 += 8;
				x = y;
			}
			y = x >> 4;
			if (y != 0) {
				l_minus_1 += 4;
				x = y;
			}
			y = x >> 2;
			if (y != 0) {
				l_minus_1 += 2;
				x = y;
			}
			if ((x & 2) != 0) {
				l_minus_1 += 1;
			}
			const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
		#endif
		uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;

		/* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
		#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
			uint64_t q;
			__asm__("DIVQ %[d]"
				: "=a" (q), "+d" (u_hi)
				: [d] "r" (d), "a" (UINT64_C(0))
				: "cc");
		#elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__)
			/* GCC, Clang, and Intel Compiler fail to inline optimized implementation and call into support library for 128-bit division */
			const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d));
		#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64)
			unsigned __int64 remainder;
			const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder);
		#else
			/* Implementation based on code from Hacker's Delight */

			/* Normalize divisor and shift dividend left */
			d <<= nlz_d;
			u_hi <<= nlz_d;
			/* Break divisor up into two 32-bit digits */
			const uint64_t d_hi = (uint32_t) (d >> 32);
			const uint32_t d_lo = (uint32_t) d;

			/* Compute the first quotient digit, q1 */
			uint64_t q1 = u_hi / d_hi;
			uint64_t r1 = u_hi - q1 * d_hi;

			while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
				q1 -= 1;
				r1 += d_hi;
				if ((r1 >> 32) != 0) {
					break;
				}
			}

			/* Multiply and subtract. */
			u_hi = (u_hi << 32) - q1 * d;

			/* Compute the second quotient digit, q0 */
			uint64_t q0 = u_hi / d_hi;
			uint64_t r0 = u_hi - q0 * d_hi;

			while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
				q0 -= 1;
				r0 += d_hi;
				if ((r0 >> 32) != 0) {
					break;
				}
			}
			const uint64_t q = (q1 << 32) | (uint32_t) q0;
		#endif
		result.m = q + UINT64_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}

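/* Builds a size_t divisor descriptor via the 32-bit or 64-bit initializer, per SIZE_MAX. */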
static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
#if SIZE_MAX == UINT32_MAX
	const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
#elif SIZE_MAX == UINT64_MAX
	const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
#else
	#error Unsupported platform
#endif
	struct fxdiv_divisor_size_t size_result = {
		(size_t) uint_result.value,
		(size_t) uint_result.m,
		uint_result.s1,
		uint_result.s2
	};
	return size_result;
}

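/*
 * Quotient n / divisor.value, with divisor as returned by fxdiv_init_*:
 * a high multiply by m followed by the two shifts s1 and s2.
 * The uint64_t and size_t variants below use the same formula.
 */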
static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
	const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m);
	return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
}

static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
	const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m);
	return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
}

static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
#if SIZE_MAX == UINT32_MAX
	const struct fxdiv_divisor_uint32_t uint32_divisor = {
		(uint32_t) divisor.value,
		(uint32_t) divisor.m,
		divisor.s1,
		divisor.s2
	};
	return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
#elif SIZE_MAX == UINT64_MAX
	const struct fxdiv_divisor_uint64_t uint64_divisor = {
		(uint64_t) divisor.value,
		(uint64_t) divisor.m,
		divisor.s1,
		divisor.s2
	};
	return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
#else
	#error Unsupported platform
#endif
}

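/* Remainder n % divisor.value, computed as n - quotient * divisor.value. */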
static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
	const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
	return n - quotient * divisor.value;
}

static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
	const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
	return n - quotient * divisor.value;
}

static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
	const size_t quotient = fxdiv_quotient_size_t(n, divisor);
	return n - quotient * divisor.value;
}

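/* Rounds n down to the nearest multiple of granularity.value. */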
static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
	const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity);
	return quotient * granularity.value;
}

static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
	const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity);
	return quotient * granularity.value;
}

static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
	const size_t quotient = fxdiv_quotient_size_t(n, granularity);
	return quotient * granularity.value;
}

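/* Quotient and remainder of n / divisor.value in a single call. */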
static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
	const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
	const uint32_t remainder = n - quotient * divisor.value;
	struct fxdiv_result_uint32_t result = { quotient, remainder };
	return result;
}

static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
	const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
	const uint64_t remainder = n - quotient * divisor.value;
	struct fxdiv_result_uint64_t result = { quotient, remainder };
	return result;
}

static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
	const size_t quotient = fxdiv_quotient_size_t(n, divisor);
	const size_t remainder = n - quotient * divisor.value;
	struct fxdiv_result_size_t result = { quotient, remainder };
	return result;
}

#endif /* FXDIV_H */