1 #pragma once
2 
3 #include <stdint.h>
4 #include <stddef.h>
5 
6 /* SSE-specific headers */
7 #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
8 	#include <xmmintrin.h>
9 #endif
10 
11 /* MSVC-specific headers */
12 #if defined(_MSC_VER)
13 	#include <intrin.h>
14 #endif
15 
16 
/*
 * Saved copy of the floating-point control register for the target platform.
 * Exactly one member exists in any given build; which one is decided at
 * compile time by the same conditions the accessor functions use.
 */
struct fpu_state {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
	/* x86 with SSE: contents of the MXCSR register */
	uint32_t mxcsr;
#elif (defined(_MSC_VER) && defined(_M_ARM)) || (defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0))
	/* 32-bit ARM with hardware floating point: contents of the FPSCR register */
	uint32_t fpscr;
#elif (defined(_MSC_VER) && defined(_M_ARM64)) || (defined(__GNUC__) && defined(__aarch64__))
	/* 64-bit ARM: contents of the FPCR register */
	uint64_t fpcr;
#else
	/* No supported control register: placeholder member so the struct is not empty */
	char unused;
#endif
};
28 
get_fpu_state()29 static inline struct fpu_state get_fpu_state() {
30 	struct fpu_state state = { 0 };
31 #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
32 	state.mxcsr = (uint32_t) _mm_getcsr();
33 #elif defined(_MSC_VER) && defined(_M_ARM)
34 	state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0);
35 #elif defined(_MSC_VER) && defined(_M_ARM64)
36 	state.fpcr = (uint64_t) _ReadStatusReg(0x5A20);
37 #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
38 	__asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr));
39 #elif defined(__GNUC__) && defined(__aarch64__)
40 	__asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr));
41 #endif
42 	return state;
43 }
44 
/*
 * Write a previously captured value back into the platform's floating-point
 * control register (MXCSR on x86 with SSE, FPSCR on 32-bit ARM, FPCR on
 * 64-bit ARM).
 *
 * state: register value to install, e.g. one obtained from get_fpu_state().
 *
 * Does nothing on platforms matching none of the branches below.
 */
static inline void set_fpu_state(const struct fpu_state state) {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
	const unsigned int csr = (unsigned int) state.mxcsr;
	_mm_setcsr(csr);
#elif defined(_MSC_VER) && defined(_M_ARM)
	const int fpscr = (int) state.fpscr;
	_MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0);
#elif defined(_MSC_VER) && defined(_M_ARM64)
	const __int64 fpcr = (__int64) state.fpcr;
	_WriteStatusReg(0x5A20, fpcr);
#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
	const uint32_t fpscr = state.fpscr;
	__asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (fpscr));
#elif defined(__GNUC__) && defined(__aarch64__)
	const uint64_t fpcr = state.fpcr;
	__asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (fpcr));
#endif
}
58 
/*
 * Turn off denormal (subnormal) number handling by OR-ing the relevant
 * control bits into the platform's floating-point control register.
 *
 * The previous register value is not returned; callers that need to restore
 * it should capture it with get_fpu_state() first and reinstate it with
 * set_fpu_state(). Does nothing on platforms matching none of the branches.
 */
static inline void disable_fpu_denormals(void) {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
	/* 0x8040 = MXCSR bits 15 and 6 (flush-to-zero and denormals-are-zero) */
	_mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(_MSC_VER) && defined(_M_ARM)
	/* 0x1000000 = FPSCR bit 24 (flush-to-zero) */
	int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0);
	fpscr |= 0x1000000;
	_MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0);
#elif defined(_MSC_VER) && defined(_M_ARM64)
	/* 0x1080000 = FPCR bits 24 and 19 (flush-to-zero, incl. half precision) */
	__int64 fpcr = _ReadStatusReg(0x5A20);
	fpcr |= 0x1080000;
	_WriteStatusReg(0x5A20, fpcr);
#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
	uint32_t fpscr;
	#if defined(__thumb__) && !defined(__thumb2__)
		/*
		 * Thumb-1 variant: the mask is passed in a register and both operands
		 * use the low-register ("l") constraint. ORRS updates the condition
		 * flags, hence the "cc" clobber.
		 *
		 * The output is early-clobber ("=&l"): VMRS writes it before ORRS
		 * reads the bitmask input, so the two must never share a register.
		 */
		__asm__ __volatile__(
				"VMRS %[fpscr], fpscr\n"
				"ORRS %[fpscr], %[bitmask]\n"
				"VMSR fpscr, %[fpscr]\n"
			: [fpscr] "=&l" (fpscr)
			: [bitmask] "l" (0x1000000)
			: "cc");
	#else
		/* Read-modify-write of FPSCR; the mask fits in an immediate here */
		__asm__ __volatile__(
				"VMRS %[fpscr], fpscr\n"
				"ORR %[fpscr], #0x1000000\n"
				"VMSR fpscr, %[fpscr]\n"
			: [fpscr] "=r" (fpscr));
	#endif
#elif defined(__GNUC__) && defined(__aarch64__)
	uint64_t fpcr;
	/* Read-modify-write of FPCR, setting bit 24 and bit 19 in two steps */
	__asm__ __volatile__(
			"MRS %[fpcr], fpcr\n"
			"ORR %w[fpcr], %w[fpcr], 0x1000000\n"
			"ORR %w[fpcr], %w[fpcr], 0x80000\n"
			"MSR fpcr, %[fpcr]\n"
		: [fpcr] "=r" (fpcr));
#endif
}
97 
/*
 * Step an index one position backwards, wrapping from 0 around to n - 1.
 *
 * i: current index.
 * n: wrap-around bound; only consulted when i is 0.
 *
 * Returns n - 1 when i is 0, otherwise i - 1.
 */
static inline size_t modulo_decrement(size_t i, size_t n) {
	/* An index of 0 is treated as n, so the subtraction below lands on n - 1 */
	const size_t start = (i == 0) ? n : i;
	return start - 1;
}
106 
/*
 * Integer division rounded towards positive infinity.
 *
 * dividend: value to divide.
 * divisor:  value to divide by; must be non-zero (division by zero is
 *           undefined behavior in C).
 *
 * Returns the quotient, plus one if the division leaves a remainder. The
 * result is built from the quotient and remainder, so no intermediate sum
 * of dividend and divisor is formed.
 */
static inline size_t divide_round_up(size_t dividend, size_t divisor) {
	const size_t quotient = dividend / divisor;
	const size_t remainder = dividend % divisor;
	return quotient + (size_t) (remainder != 0);
}
114 
/* Windows headers define min and max as macros; undefine min so the function below can be declared */
116 #ifdef min
117 	#undef min
118 #endif
119 
/*
 * Return the smaller of two size_t values (either one when they are equal).
 */
static inline size_t min(size_t a, size_t b) {
	if (b < a) {
		return b;
	}
	return a;
}
123