// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <ios>
#include <vector>

#include <gtest/gtest.h>

#include <fp16.h>

#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
#include <xnnpack/math-stubs.h>


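// Number of float elements processed per call to a math stub under test. The
// stubs take the buffer size in bytes, hence the kBlockSize * sizeof(float)
// argument at every call site below.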
constexpr int kBlockSize = 1024;


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
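    // exp(±0) == 1 exactly, so the output is checked for exact equality.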
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_saturation) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
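    // 0xC2CFF1B5 is the bit pattern of approximately -103.97f (~ln(2^-150)),
    // below which exp(x) underflows to +0.0f; sweep from there to -infinity
    // (0xFF800000), clamping n + i at -infinity.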
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_overflow) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
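    // 0x42B17218 is the bit pattern of approximately 88.72f (~ln(FLT_MAX)),
    // above which exp(x) overflows; sweep from there to +infinity
    // (0x7F800000), clamping n + i at +infinity.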
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
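    // Sweep all positive NaN bit patterns (0x7F800001 through 0x7FFFFFFF);
    // exp(NaN) must produce NaN.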
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
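    // Same sweep with the sign bit set, covering all negative NaN bit
    // patterns (0xFF800001 through 0xFFFFFFFF).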
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(n + i, UINT32_C(0x7FFFFFFF)));
      }
      xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  TEST(EXP__NEONFMA_RR2_P5, negative_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_zero) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__NEONFMA_RR2_P5, negative_saturation) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_overflow) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, positive_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__NEONFMA_RR2_P5, negative_nan) {
    TEST_REQUIRES_ARM_NEON_FMA;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(n + i, UINT32_C(0x7FFFFFFF)));
      }
      xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(n + i, UINT32_C(0x7FFFFFFF)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(n + i, UINT32_C(0x7FFFFFFF)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(n + i, UINT32_C(0x7FFFFFFF)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(n + i, UINT32_C(0x7FFFFFFF)));
      }
      xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(n + i, UINT32_C(0x7FFFFFFF)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_zero) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_saturation) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_overflow) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_nan) {
    TEST_REQUIRES_X86_AVX512F;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::min(n + i, UINT32_C(0x7FFFFFFF)));
      }
      xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_saturation) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_overflow) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
978         inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
979       }
980       xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
981       for (uint32_t i = 0; i < kBlockSize; i++) {
982         ASSERT_TRUE(std::isnan(outputs[i]))
983           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
984           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
985       }
986     }
987   }
988 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
989 
990 
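// The same edge-case suite for the AVX2 LUT8/P4 variant: an 8-entry table
// accessed via a vector permute (per the "perm" suffix) combined with,
// presumably, a degree-4 polynomial ("p4").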
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
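    // exp(-0.0f) must be exactly 1.0f; every lane receives the same input, so
    // checking lane 0 is sufficient.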
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_saturation) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_overflow) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


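// AVX2 variant without a lookup table; the "p5" suffix presumably denotes a
// degree-5 polynomial after two-constant ("rr2") range reduction.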
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX2_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX2_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX2_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX2;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


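// The same suite for the plain AVX kernel, guarded by TEST_REQUIRES_X86_AVX
// rather than the AVX2 check.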
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__AVX_RR2_P5, negative_zero) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX_RR2_P5, positive_zero) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__AVX_RR2_P5, negative_saturation) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, positive_overflow) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, positive_nan) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__AVX_RR2_P5, negative_nan) {
    TEST_REQUIRES_X86_AVX;

    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


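// The SSE2 tests carry no TEST_REQUIRES guard, presumably because SSE2 is
// baseline on the supported x86 targets. This variant pairs a 64-entry table
// ("lut64") with a degree-2 polynomial ("p2").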
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__SSE2_RR2_LUT64_P2, negative_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, negative_saturation) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_overflow) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, positive_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_LUT64_P2, negative_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


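// SSE2 variant that replaces the table with a degree-5 polynomial.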
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(EXP__SSE2_RR2_P5, negative_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -0.0f);
    xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_P5, positive_zero) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +0.0f);
    xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const float reference_output = 1.0f;
    ASSERT_EQ(reference_output, outputs[0])
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output)
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(EXP__SSE2_RR2_P5, negative_saturation) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000)));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x00000000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, positive_overflow) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000)));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = UINT32_C(0x7F800000);
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, positive_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(EXP__SSE2_RR2_P5, negative_nan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i)));
      }
      xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        ASSERT_TRUE(std::isnan(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64