1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <ios>
#include <limits>
#include <vector>

#include <gtest/gtest.h>

#include <fp16.h>

#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/math-stubs.h>
22 
23 
24 constexpr int kBlockSize = 1024;
25 
26 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(ROUNDD__SSE_ADDSUB,positive_zero)27   TEST(ROUNDD__SSE_ADDSUB, positive_zero) {
28     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
29     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
30     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x00000000));
31     xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
32     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
33     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
34       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
35       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
36       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
37   }
38 
TEST(ROUNDD__SSE_ADDSUB,negative_zero)39   TEST(ROUNDD__SSE_ADDSUB, negative_zero) {
40     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
41     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
42     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x80000000));
43     xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
44     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
45     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
46       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
47       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
48       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
49   }
50 
TEST(ROUNDD__SSE_ADDSUB,positive_subnormal)51   TEST(ROUNDD__SSE_ADDSUB, positive_subnormal) {
52     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
53     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
54     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x00800000); n += kBlockSize) {
55       for (uint32_t i = 0; i < kBlockSize; i++) {
56         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x00000001)));
57       }
58       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
59       for (uint32_t i = 0; i < kBlockSize; i++) {
60         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
61         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
62           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
63           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
64           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
65       }
66     }
67   }
68 
TEST(ROUNDD__SSE_ADDSUB,negative_subnormal)69   TEST(ROUNDD__SSE_ADDSUB, negative_subnormal) {
70     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
71     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
72     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0x80800000); n += kBlockSize) {
73       for (uint32_t i = 0; i < kBlockSize; i++) {
74         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x80000001)));
75       }
76       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
77       for (uint32_t i = 0; i < kBlockSize; i++) {
78         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
79         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
80           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
81           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
82           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
83       }
84     }
85   }
86 
TEST(ROUNDD__SSE_ADDSUB,positive_normal)87   TEST(ROUNDD__SSE_ADDSUB, positive_normal) {
88     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
89     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
90     for (uint32_t n = UINT32_C(0x00800000); n < UINT32_C(0x4B800000); n += kBlockSize) {
91       for (uint32_t i = 0; i < kBlockSize; i++) {
92         inputs[i] = fp32_from_bits(n + i);
93       }
94       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
95       for (uint32_t i = 0; i < kBlockSize; i++) {
96         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
97         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
98           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
99           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
100           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
101       }
102     }
103   }
104 
TEST(ROUNDD__SSE_ADDSUB,negative_normal)105   TEST(ROUNDD__SSE_ADDSUB, negative_normal) {
106     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
107     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
108     for (uint32_t n = UINT32_C(0x80800000); n < UINT32_C(0xCB800000); n += kBlockSize) {
109       for (uint32_t i = 0; i < kBlockSize; i++) {
110         inputs[i] = fp32_from_bits(n + i);
111       }
112       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
113       for (uint32_t i = 0; i < kBlockSize; i++) {
114         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
115         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
116           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
117           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
118           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
119       }
120     }
121   }
122 
TEST(ROUNDD__SSE_ADDSUB,positive_integral)123   TEST(ROUNDD__SSE_ADDSUB, positive_integral) {
124     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
125     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
126     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
127       for (uint32_t i = 0; i < kBlockSize; i++) {
128         inputs[i] = fp32_from_bits(n + i);
129       }
130       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
131       for (uint32_t i = 0; i < kBlockSize; i++) {
132         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
133         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
134           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
135           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
136           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
137       }
138     }
139   }
140 
TEST(ROUNDD__SSE_ADDSUB,negative_integral)141   TEST(ROUNDD__SSE_ADDSUB, negative_integral) {
142     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
143     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
144     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
145       for (uint32_t i = 0; i < kBlockSize; i++) {
146         inputs[i] = fp32_from_bits(n + i);
147       }
148       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
149       for (uint32_t i = 0; i < kBlockSize; i++) {
150         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
151         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
152           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
153           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
154           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
155       }
156     }
157   }
158 
TEST(ROUNDD__SSE_ADDSUB,positive_infinity)159   TEST(ROUNDD__SSE_ADDSUB, positive_infinity) {
160     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
161     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
162     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
163     xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
164     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
165     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
166       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
167       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
168       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
169   }
170 
TEST(ROUNDD__SSE_ADDSUB,negative_infinity)171   TEST(ROUNDD__SSE_ADDSUB, negative_infinity) {
172     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
173     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
174     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
175     xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
176     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
177     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
178       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
179       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
180       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
181   }
182 
TEST(ROUNDD__SSE_ADDSUB,positive_qnan)183   TEST(ROUNDD__SSE_ADDSUB, positive_qnan) {
184     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
185     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
186     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
187       for (uint32_t i = 0; i < kBlockSize; i++) {
188         inputs[i] = fp32_from_bits(n + i);
189       }
190       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
191       for (uint32_t i = 0; i < kBlockSize; i++) {
192         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
193         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
194           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
195           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
196           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
197       }
198     }
199   }
200 
TEST(ROUNDD__SSE_ADDSUB,negative_qnan)201   TEST(ROUNDD__SSE_ADDSUB, negative_qnan) {
202     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
203     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
204     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
205       for (uint32_t i = 0; i < kBlockSize; i++) {
206         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
207       }
208       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
209       for (uint32_t i = 0; i < kBlockSize; i++) {
210         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
211         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
212           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
213           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
214           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
215       }
216     }
217   }
218 
TEST(ROUNDD__SSE_ADDSUB,positive_snan)219   TEST(ROUNDD__SSE_ADDSUB, positive_snan) {
220     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
221     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
222     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
223       for (uint32_t i = 0; i < kBlockSize; i++) {
224         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
225       }
226       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
227       for (uint32_t i = 0; i < kBlockSize; i++) {
228         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
229         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
230           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
231           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
232           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
233       }
234     }
235   }
236 
TEST(ROUNDD__SSE_ADDSUB,negative_snan)237   TEST(ROUNDD__SSE_ADDSUB, negative_snan) {
238     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
239     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
240     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
241       for (uint32_t i = 0; i < kBlockSize; i++) {
242         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
243       }
244       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
245       for (uint32_t i = 0; i < kBlockSize; i++) {
246         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
247         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
248           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
249           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
250           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
251       }
252     }
253   }
254 
TEST(ROUNDD__SSE_ADDSUB,positive_snan_to_qnan)255   TEST(ROUNDD__SSE_ADDSUB, positive_snan_to_qnan) {
256     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
257     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
258     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
259       for (uint32_t i = 0; i < kBlockSize; i++) {
260         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
261       }
262       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
263       for (uint32_t i = 0; i < kBlockSize; i++) {
264         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
265         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
266           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
267           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
268           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
269       }
270     }
271   }
272 
TEST(ROUNDD__SSE_ADDSUB,negative_snan_to_qnan)273   TEST(ROUNDD__SSE_ADDSUB, negative_snan_to_qnan) {
274     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
275     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
276     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
277       for (uint32_t i = 0; i < kBlockSize; i++) {
278         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
279       }
280       xnn_math_f32_roundd__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
281       for (uint32_t i = 0; i < kBlockSize; i++) {
282         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
283         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
284           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
285           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
286           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
287       }
288     }
289   }
290 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
291 
292 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(ROUNDD__SSE2_CVT,positive_zero)293   TEST(ROUNDD__SSE2_CVT, positive_zero) {
294     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
295     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
296     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x00000000));
297     xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
298     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
299     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
300       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
301       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
302       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
303   }
304 
TEST(ROUNDD__SSE2_CVT,negative_zero)305   TEST(ROUNDD__SSE2_CVT, negative_zero) {
306     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
307     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
308     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x80000000));
309     xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
310     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
311     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
312       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
313       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
314       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
315   }
316 
TEST(ROUNDD__SSE2_CVT,positive_subnormal)317   TEST(ROUNDD__SSE2_CVT, positive_subnormal) {
318     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
319     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
320     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x00800000); n += kBlockSize) {
321       for (uint32_t i = 0; i < kBlockSize; i++) {
322         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x00000001)));
323       }
324       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
325       for (uint32_t i = 0; i < kBlockSize; i++) {
326         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
327         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
328           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
329           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
330           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
331       }
332     }
333   }
334 
TEST(ROUNDD__SSE2_CVT,negative_subnormal)335   TEST(ROUNDD__SSE2_CVT, negative_subnormal) {
336     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
337     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
338     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0x80800000); n += kBlockSize) {
339       for (uint32_t i = 0; i < kBlockSize; i++) {
340         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x80000001)));
341       }
342       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
343       for (uint32_t i = 0; i < kBlockSize; i++) {
344         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
345         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
346           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
347           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
348           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
349       }
350     }
351   }
352 
TEST(ROUNDD__SSE2_CVT,positive_normal)353   TEST(ROUNDD__SSE2_CVT, positive_normal) {
354     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
355     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
356     for (uint32_t n = UINT32_C(0x00800000); n < UINT32_C(0x4B800000); n += kBlockSize) {
357       for (uint32_t i = 0; i < kBlockSize; i++) {
358         inputs[i] = fp32_from_bits(n + i);
359       }
360       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
361       for (uint32_t i = 0; i < kBlockSize; i++) {
362         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
363         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
364           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
365           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
366           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
367       }
368     }
369   }
370 
TEST(ROUNDD__SSE2_CVT,negative_normal)371   TEST(ROUNDD__SSE2_CVT, negative_normal) {
372     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
373     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
374     for (uint32_t n = UINT32_C(0x80800000); n < UINT32_C(0xCB800000); n += kBlockSize) {
375       for (uint32_t i = 0; i < kBlockSize; i++) {
376         inputs[i] = fp32_from_bits(n + i);
377       }
378       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
379       for (uint32_t i = 0; i < kBlockSize; i++) {
380         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
381         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
382           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
383           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
384           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
385       }
386     }
387   }
388 
TEST(ROUNDD__SSE2_CVT,positive_integral)389   TEST(ROUNDD__SSE2_CVT, positive_integral) {
390     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
391     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
392     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
393       for (uint32_t i = 0; i < kBlockSize; i++) {
394         inputs[i] = fp32_from_bits(n + i);
395       }
396       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
397       for (uint32_t i = 0; i < kBlockSize; i++) {
398         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
399         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
400           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
401           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
402           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
403       }
404     }
405   }
406 
TEST(ROUNDD__SSE2_CVT,negative_integral)407   TEST(ROUNDD__SSE2_CVT, negative_integral) {
408     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
409     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
410     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
411       for (uint32_t i = 0; i < kBlockSize; i++) {
412         inputs[i] = fp32_from_bits(n + i);
413       }
414       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
415       for (uint32_t i = 0; i < kBlockSize; i++) {
416         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
417         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
418           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
419           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
420           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
421       }
422     }
423   }
424 
TEST(ROUNDD__SSE2_CVT,positive_infinity)425   TEST(ROUNDD__SSE2_CVT, positive_infinity) {
426     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
427     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
428     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
429     xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
430     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
431     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
432       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
433       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
434       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
435   }
436 
TEST(ROUNDD__SSE2_CVT,negative_infinity)437   TEST(ROUNDD__SSE2_CVT, negative_infinity) {
438     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
439     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
440     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
441     xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
442     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
443     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
444       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
445       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
446       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
447   }
448 
TEST(ROUNDD__SSE2_CVT,positive_qnan)449   TEST(ROUNDD__SSE2_CVT, positive_qnan) {
450     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
451     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
452     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
453       for (uint32_t i = 0; i < kBlockSize; i++) {
454         inputs[i] = fp32_from_bits(n + i);
455       }
456       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
457       for (uint32_t i = 0; i < kBlockSize; i++) {
458         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
459         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
460           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
461           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
462           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
463       }
464     }
465   }
466 
TEST(ROUNDD__SSE2_CVT,negative_qnan)467   TEST(ROUNDD__SSE2_CVT, negative_qnan) {
468     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
469     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
470     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
471       for (uint32_t i = 0; i < kBlockSize; i++) {
472         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
473       }
474       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
475       for (uint32_t i = 0; i < kBlockSize; i++) {
476         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
477         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
478           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
479           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
480           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
481       }
482     }
483   }
484 
TEST(ROUNDD__SSE2_CVT,positive_snan)485   TEST(ROUNDD__SSE2_CVT, positive_snan) {
486     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
487     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
488     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
489       for (uint32_t i = 0; i < kBlockSize; i++) {
490         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
491       }
492       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
493       for (uint32_t i = 0; i < kBlockSize; i++) {
494         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
495         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
496           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
497           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
498           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
499       }
500     }
501   }
502 
TEST(ROUNDD__SSE2_CVT,negative_snan)503   TEST(ROUNDD__SSE2_CVT, negative_snan) {
504     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
505     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
506     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
507       for (uint32_t i = 0; i < kBlockSize; i++) {
508         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
509       }
510       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
511       for (uint32_t i = 0; i < kBlockSize; i++) {
512         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
513         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
514           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
515           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
516           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
517       }
518     }
519   }
520 
TEST(ROUNDD__SSE2_CVT,positive_snan_to_qnan)521   TEST(ROUNDD__SSE2_CVT, positive_snan_to_qnan) {
522     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
523     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
524     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
525       for (uint32_t i = 0; i < kBlockSize; i++) {
526         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
527       }
528       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
529       for (uint32_t i = 0; i < kBlockSize; i++) {
530         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
531         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
532           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
533           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
534           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
535       }
536     }
537   }
538 
TEST(ROUNDD__SSE2_CVT,negative_snan_to_qnan)539   TEST(ROUNDD__SSE2_CVT, negative_snan_to_qnan) {
540     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
541     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
542     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
543       for (uint32_t i = 0; i < kBlockSize; i++) {
544         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
545       }
546       xnn_math_f32_roundd__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
547       for (uint32_t i = 0; i < kBlockSize; i++) {
548         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
549         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
550           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
551           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
552           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
553       }
554     }
555   }
556 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
557 
558 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(ROUNDD__SSE41,positive_zero)559   TEST(ROUNDD__SSE41, positive_zero) {
560     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
561     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
562     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x00000000));
563     xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
564     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
565     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
566       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
567       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
568       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
569   }
570 
TEST(ROUNDD__SSE41,negative_zero)571   TEST(ROUNDD__SSE41, negative_zero) {
572     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
573     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
574     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x80000000));
575     xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
576     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
577     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
578       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
579       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
580       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
581   }
582 
TEST(ROUNDD__SSE41,positive_subnormal)583   TEST(ROUNDD__SSE41, positive_subnormal) {
584     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
585     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
586     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x00800000); n += kBlockSize) {
587       for (uint32_t i = 0; i < kBlockSize; i++) {
588         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x00000001)));
589       }
590       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
591       for (uint32_t i = 0; i < kBlockSize; i++) {
592         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
593         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
594           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
595           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
596           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
597       }
598     }
599   }
600 
TEST(ROUNDD__SSE41,negative_subnormal)601   TEST(ROUNDD__SSE41, negative_subnormal) {
602     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
603     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
604     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0x80800000); n += kBlockSize) {
605       for (uint32_t i = 0; i < kBlockSize; i++) {
606         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x80000001)));
607       }
608       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
609       for (uint32_t i = 0; i < kBlockSize; i++) {
610         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
611         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
612           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
613           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
614           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
615       }
616     }
617   }
618 
TEST(ROUNDD__SSE41,positive_normal)619   TEST(ROUNDD__SSE41, positive_normal) {
620     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
621     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
622     for (uint32_t n = UINT32_C(0x00800000); n < UINT32_C(0x4B800000); n += kBlockSize) {
623       for (uint32_t i = 0; i < kBlockSize; i++) {
624         inputs[i] = fp32_from_bits(n + i);
625       }
626       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
627       for (uint32_t i = 0; i < kBlockSize; i++) {
628         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
629         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
630           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
631           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
632           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
633       }
634     }
635   }
636 
TEST(ROUNDD__SSE41,negative_normal)637   TEST(ROUNDD__SSE41, negative_normal) {
638     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
639     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
640     for (uint32_t n = UINT32_C(0x80800000); n < UINT32_C(0xCB800000); n += kBlockSize) {
641       for (uint32_t i = 0; i < kBlockSize; i++) {
642         inputs[i] = fp32_from_bits(n + i);
643       }
644       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
645       for (uint32_t i = 0; i < kBlockSize; i++) {
646         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
647         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
648           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
649           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
650           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
651       }
652     }
653   }
654 
TEST(ROUNDD__SSE41,positive_integral)655   TEST(ROUNDD__SSE41, positive_integral) {
656     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
657     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
658     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
659       for (uint32_t i = 0; i < kBlockSize; i++) {
660         inputs[i] = fp32_from_bits(n + i);
661       }
662       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
663       for (uint32_t i = 0; i < kBlockSize; i++) {
664         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
665         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
666           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
667           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
668           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
669       }
670     }
671   }
672 
TEST(ROUNDD__SSE41,negative_integral)673   TEST(ROUNDD__SSE41, negative_integral) {
674     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
675     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
676     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
677       for (uint32_t i = 0; i < kBlockSize; i++) {
678         inputs[i] = fp32_from_bits(n + i);
679       }
680       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
681       for (uint32_t i = 0; i < kBlockSize; i++) {
682         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
683         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
684           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
685           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
686           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
687       }
688     }
689   }
690 
TEST(ROUNDD__SSE41,positive_infinity)691   TEST(ROUNDD__SSE41, positive_infinity) {
692     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
693     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
694     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
695     xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
696     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
697     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
698       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
699       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
700       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
701   }
702 
TEST(ROUNDD__SSE41,negative_infinity)703   TEST(ROUNDD__SSE41, negative_infinity) {
704     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
705     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
706     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
707     xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
708     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
709     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
710       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
711       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
712       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
713   }
714 
TEST(ROUNDD__SSE41,positive_qnan)715   TEST(ROUNDD__SSE41, positive_qnan) {
716     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
717     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
718     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
719       for (uint32_t i = 0; i < kBlockSize; i++) {
720         inputs[i] = fp32_from_bits(n + i);
721       }
722       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
723       for (uint32_t i = 0; i < kBlockSize; i++) {
724         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
725         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
726           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
727           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
728           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
729       }
730     }
731   }
732 
TEST(ROUNDD__SSE41,negative_qnan)733   TEST(ROUNDD__SSE41, negative_qnan) {
734     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
735     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
736     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
737       for (uint32_t i = 0; i < kBlockSize; i++) {
738         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
739       }
740       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
741       for (uint32_t i = 0; i < kBlockSize; i++) {
742         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
743         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
744           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
745           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
746           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
747       }
748     }
749   }
750 
TEST(ROUNDD__SSE41,positive_snan)751   TEST(ROUNDD__SSE41, positive_snan) {
752     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
753     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
754     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
755       for (uint32_t i = 0; i < kBlockSize; i++) {
756         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
757       }
758       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
759       for (uint32_t i = 0; i < kBlockSize; i++) {
760         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
761         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
762           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
763           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
764           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
765       }
766     }
767   }
768 
TEST(ROUNDD__SSE41,negative_snan)769   TEST(ROUNDD__SSE41, negative_snan) {
770     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
771     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
772     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
773       for (uint32_t i = 0; i < kBlockSize; i++) {
774         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
775       }
776       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
777       for (uint32_t i = 0; i < kBlockSize; i++) {
778         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
779         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
780           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
781           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
782           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
783       }
784     }
785   }
786 
TEST(ROUNDD__SSE41,positive_snan_to_qnan)787   TEST(ROUNDD__SSE41, positive_snan_to_qnan) {
788     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
789     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
790     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
791       for (uint32_t i = 0; i < kBlockSize; i++) {
792         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
793       }
794       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
795       for (uint32_t i = 0; i < kBlockSize; i++) {
796         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
797         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
798           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
799           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
800           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
801       }
802     }
803   }
804 
TEST(ROUNDD__SSE41,negative_snan_to_qnan)805   TEST(ROUNDD__SSE41, negative_snan_to_qnan) {
806     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
807     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
808     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
809       for (uint32_t i = 0; i < kBlockSize; i++) {
810         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
811       }
812       xnn_math_f32_roundd__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
813       for (uint32_t i = 0; i < kBlockSize; i++) {
814         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
815         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
816           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
817           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
818           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
819       }
820     }
821   }
822 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
823 
824 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(ROUNDD__NEON_ADDSUB,positive_zero)825   TEST(ROUNDD__NEON_ADDSUB, positive_zero) {
826     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
827     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
828     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x00000000));
829     xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
830     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
831     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
832       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
833       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
834       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
835   }
836 
TEST(ROUNDD__NEON_ADDSUB,negative_zero)837   TEST(ROUNDD__NEON_ADDSUB, negative_zero) {
838     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
839     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
840     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x80000000));
841     xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
842     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
843     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
844       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
845       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
846       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
847   }
848 
TEST(ROUNDD__NEON_ADDSUB,positive_subnormal)849   TEST(ROUNDD__NEON_ADDSUB, positive_subnormal) {
850     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
851     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
852     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x00800000); n += kBlockSize) {
853       for (uint32_t i = 0; i < kBlockSize; i++) {
854         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x00000001)));
855       }
856       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
857       for (uint32_t i = 0; i < kBlockSize; i++) {
858         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
859         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
860           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
861           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
862           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
863       }
864     }
865   }
866 
TEST(ROUNDD__NEON_ADDSUB,negative_subnormal)867   TEST(ROUNDD__NEON_ADDSUB, negative_subnormal) {
868     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
869     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
870     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0x80800000); n += kBlockSize) {
871       for (uint32_t i = 0; i < kBlockSize; i++) {
872         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x80000001)));
873       }
874       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
875       for (uint32_t i = 0; i < kBlockSize; i++) {
876         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
877         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
878           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
879           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
880           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
881       }
882     }
883   }
884 
TEST(ROUNDD__NEON_ADDSUB,positive_normal)885   TEST(ROUNDD__NEON_ADDSUB, positive_normal) {
886     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
887     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
888     for (uint32_t n = UINT32_C(0x00800000); n < UINT32_C(0x4B800000); n += kBlockSize) {
889       for (uint32_t i = 0; i < kBlockSize; i++) {
890         inputs[i] = fp32_from_bits(n + i);
891       }
892       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
893       for (uint32_t i = 0; i < kBlockSize; i++) {
894         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
895         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
896           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
897           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
898           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
899       }
900     }
901   }
902 
TEST(ROUNDD__NEON_ADDSUB,negative_normal)903   TEST(ROUNDD__NEON_ADDSUB, negative_normal) {
904     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
905     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
906     for (uint32_t n = UINT32_C(0x80800000); n < UINT32_C(0xCB800000); n += kBlockSize) {
907       for (uint32_t i = 0; i < kBlockSize; i++) {
908         inputs[i] = fp32_from_bits(n + i);
909       }
910       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
911       for (uint32_t i = 0; i < kBlockSize; i++) {
912         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
913         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
914           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
915           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
916           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
917       }
918     }
919   }
920 
TEST(ROUNDD__NEON_ADDSUB,positive_integral)921   TEST(ROUNDD__NEON_ADDSUB, positive_integral) {
922     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
923     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
924     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
925       for (uint32_t i = 0; i < kBlockSize; i++) {
926         inputs[i] = fp32_from_bits(n + i);
927       }
928       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
929       for (uint32_t i = 0; i < kBlockSize; i++) {
930         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
931         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
932           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
933           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
934           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
935       }
936     }
937   }
938 
TEST(ROUNDD__NEON_ADDSUB,negative_integral)939   TEST(ROUNDD__NEON_ADDSUB, negative_integral) {
940     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
941     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
942     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
943       for (uint32_t i = 0; i < kBlockSize; i++) {
944         inputs[i] = fp32_from_bits(n + i);
945       }
946       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
947       for (uint32_t i = 0; i < kBlockSize; i++) {
948         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
949         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
950           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
951           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
952           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
953       }
954     }
955   }
956 
TEST(ROUNDD__NEON_ADDSUB,positive_infinity)957   TEST(ROUNDD__NEON_ADDSUB, positive_infinity) {
958     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
959     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
960     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
961     xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
962     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
963     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
964       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
965       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
966       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
967   }
968 
TEST(ROUNDD__NEON_ADDSUB,negative_infinity)969   TEST(ROUNDD__NEON_ADDSUB, negative_infinity) {
970     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
971     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
972     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
973     xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
974     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
975     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
976       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
977       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
978       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
979   }
980 
TEST(ROUNDD__NEON_ADDSUB,positive_qnan)981   TEST(ROUNDD__NEON_ADDSUB, positive_qnan) {
982     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
983     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
984     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
985       for (uint32_t i = 0; i < kBlockSize; i++) {
986         inputs[i] = fp32_from_bits(n + i);
987       }
988       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
989       for (uint32_t i = 0; i < kBlockSize; i++) {
990         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
991         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
992           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
993           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
994           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
995       }
996     }
997   }
998 
TEST(ROUNDD__NEON_ADDSUB,negative_qnan)999   TEST(ROUNDD__NEON_ADDSUB, negative_qnan) {
1000     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1001     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1002     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1003       for (uint32_t i = 0; i < kBlockSize; i++) {
1004         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1005       }
1006       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1007       for (uint32_t i = 0; i < kBlockSize; i++) {
1008         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1009         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1010           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1011           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1012           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1013       }
1014     }
1015   }
1016 
TEST(ROUNDD__NEON_ADDSUB,positive_snan)1017   TEST(ROUNDD__NEON_ADDSUB, positive_snan) {
1018     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1019     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1020     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1021       for (uint32_t i = 0; i < kBlockSize; i++) {
1022         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1023       }
1024       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1025       for (uint32_t i = 0; i < kBlockSize; i++) {
1026         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1027         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1028           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1029           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1030           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1031       }
1032     }
1033   }
1034 
TEST(ROUNDD__NEON_ADDSUB,negative_snan)1035   TEST(ROUNDD__NEON_ADDSUB, negative_snan) {
1036     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1037     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1038     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1039       for (uint32_t i = 0; i < kBlockSize; i++) {
1040         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1041       }
1042       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1043       for (uint32_t i = 0; i < kBlockSize; i++) {
1044         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1045         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1046           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1047           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1048           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1049       }
1050     }
1051   }
1052 
TEST(ROUNDD__NEON_ADDSUB,positive_snan_to_qnan)1053   TEST(ROUNDD__NEON_ADDSUB, positive_snan_to_qnan) {
1054     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1055     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1056     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1057       for (uint32_t i = 0; i < kBlockSize; i++) {
1058         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1059       }
1060       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1061       for (uint32_t i = 0; i < kBlockSize; i++) {
1062         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1063         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1064           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1065           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1066           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1067       }
1068     }
1069   }
1070 
TEST(ROUNDD__NEON_ADDSUB,negative_snan_to_qnan)1071   TEST(ROUNDD__NEON_ADDSUB, negative_snan_to_qnan) {
1072     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1073     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1074     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1075       for (uint32_t i = 0; i < kBlockSize; i++) {
1076         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1077       }
1078       xnn_math_f32_roundd__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1079       for (uint32_t i = 0; i < kBlockSize; i++) {
1080         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1081         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1082           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1083           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1084           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1085       }
1086     }
1087   }
1088 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1089 
1090 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(ROUNDD__NEON_CVT,positive_zero)1091   TEST(ROUNDD__NEON_CVT, positive_zero) {
1092     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1093     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1094     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x00000000));
1095     xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1096     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1097     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1098       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1099       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1100       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1101   }
1102 
TEST(ROUNDD__NEON_CVT,negative_zero)1103   TEST(ROUNDD__NEON_CVT, negative_zero) {
1104     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1105     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1106     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x80000000));
1107     xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1108     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1109     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1110       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1111       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1112       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1113   }
1114 
TEST(ROUNDD__NEON_CVT,positive_subnormal)1115   TEST(ROUNDD__NEON_CVT, positive_subnormal) {
1116     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1117     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1118     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x00800000); n += kBlockSize) {
1119       for (uint32_t i = 0; i < kBlockSize; i++) {
1120         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x00000001)));
1121       }
1122       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1123       for (uint32_t i = 0; i < kBlockSize; i++) {
1124         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1125         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1126           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1127           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1128           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1129       }
1130     }
1131   }
1132 
TEST(ROUNDD__NEON_CVT,negative_subnormal)1133   TEST(ROUNDD__NEON_CVT, negative_subnormal) {
1134     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1135     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1136     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0x80800000); n += kBlockSize) {
1137       for (uint32_t i = 0; i < kBlockSize; i++) {
1138         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x80000001)));
1139       }
1140       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1141       for (uint32_t i = 0; i < kBlockSize; i++) {
1142         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1143         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1144           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1145           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1146           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1147       }
1148     }
1149   }
1150 
TEST(ROUNDD__NEON_CVT,positive_normal)1151   TEST(ROUNDD__NEON_CVT, positive_normal) {
1152     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1153     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1154     for (uint32_t n = UINT32_C(0x00800000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1155       for (uint32_t i = 0; i < kBlockSize; i++) {
1156         inputs[i] = fp32_from_bits(n + i);
1157       }
1158       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1159       for (uint32_t i = 0; i < kBlockSize; i++) {
1160         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1161         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1162           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1163           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1164           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1165       }
1166     }
1167   }
1168 
TEST(ROUNDD__NEON_CVT,negative_normal)1169   TEST(ROUNDD__NEON_CVT, negative_normal) {
1170     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1171     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1172     for (uint32_t n = UINT32_C(0x80800000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1173       for (uint32_t i = 0; i < kBlockSize; i++) {
1174         inputs[i] = fp32_from_bits(n + i);
1175       }
1176       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1177       for (uint32_t i = 0; i < kBlockSize; i++) {
1178         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1179         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1180           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1181           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1182           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1183       }
1184     }
1185   }
1186 
TEST(ROUNDD__NEON_CVT,positive_integral)1187   TEST(ROUNDD__NEON_CVT, positive_integral) {
1188     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1189     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1190     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1191       for (uint32_t i = 0; i < kBlockSize; i++) {
1192         inputs[i] = fp32_from_bits(n + i);
1193       }
1194       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1195       for (uint32_t i = 0; i < kBlockSize; i++) {
1196         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1197         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1198           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1199           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1200           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1201       }
1202     }
1203   }
1204 
TEST(ROUNDD__NEON_CVT,negative_integral)1205   TEST(ROUNDD__NEON_CVT, negative_integral) {
1206     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1207     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1208     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1209       for (uint32_t i = 0; i < kBlockSize; i++) {
1210         inputs[i] = fp32_from_bits(n + i);
1211       }
1212       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1213       for (uint32_t i = 0; i < kBlockSize; i++) {
1214         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1215         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1216           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1217           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1218           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1219       }
1220     }
1221   }
1222 
TEST(ROUNDD__NEON_CVT,positive_infinity)1223   TEST(ROUNDD__NEON_CVT, positive_infinity) {
1224     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1225     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1226     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1227     xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1228     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1229     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1230       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1231       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1232       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1233   }
1234 
TEST(ROUNDD__NEON_CVT,negative_infinity)1235   TEST(ROUNDD__NEON_CVT, negative_infinity) {
1236     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1237     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1238     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1239     xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1240     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1241     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1242       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1243       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1244       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1245   }
1246 
TEST(ROUNDD__NEON_CVT,positive_qnan)1247   TEST(ROUNDD__NEON_CVT, positive_qnan) {
1248     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1249     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1250     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1251       for (uint32_t i = 0; i < kBlockSize; i++) {
1252         inputs[i] = fp32_from_bits(n + i);
1253       }
1254       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1255       for (uint32_t i = 0; i < kBlockSize; i++) {
1256         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1257         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1258           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1259           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1260           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1261       }
1262     }
1263   }
1264 
TEST(ROUNDD__NEON_CVT,negative_qnan)1265   TEST(ROUNDD__NEON_CVT, negative_qnan) {
1266     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1267     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1268     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1269       for (uint32_t i = 0; i < kBlockSize; i++) {
1270         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1271       }
1272       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1273       for (uint32_t i = 0; i < kBlockSize; i++) {
1274         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1275         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1276           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1277           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1278           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1279       }
1280     }
1281   }
1282 
TEST(ROUNDD__NEON_CVT,positive_snan)1283   TEST(ROUNDD__NEON_CVT, positive_snan) {
1284     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1285     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1286     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1287       for (uint32_t i = 0; i < kBlockSize; i++) {
1288         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1289       }
1290       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1291       for (uint32_t i = 0; i < kBlockSize; i++) {
1292         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1293         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1294           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1295           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1296           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1297       }
1298     }
1299   }
1300 
TEST(ROUNDD__NEON_CVT,negative_snan)1301   TEST(ROUNDD__NEON_CVT, negative_snan) {
1302     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1303     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1304     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1305       for (uint32_t i = 0; i < kBlockSize; i++) {
1306         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1307       }
1308       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1309       for (uint32_t i = 0; i < kBlockSize; i++) {
1310         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1311         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1312           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1313           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1314           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1315       }
1316     }
1317   }
1318 
TEST(ROUNDD__NEON_CVT,positive_snan_to_qnan)1319   TEST(ROUNDD__NEON_CVT, positive_snan_to_qnan) {
1320     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1321     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1322     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1323       for (uint32_t i = 0; i < kBlockSize; i++) {
1324         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1325       }
1326       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1327       for (uint32_t i = 0; i < kBlockSize; i++) {
1328         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1329         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1330           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1331           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1332           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1333       }
1334     }
1335   }
1336 
TEST(ROUNDD__NEON_CVT,negative_snan_to_qnan)1337   TEST(ROUNDD__NEON_CVT, negative_snan_to_qnan) {
1338     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1339     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1340     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1341       for (uint32_t i = 0; i < kBlockSize; i++) {
1342         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1343       }
1344       xnn_math_f32_roundd__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1345       for (uint32_t i = 0; i < kBlockSize; i++) {
1346         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1347         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1348           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1349           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1350           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1351       }
1352     }
1353   }
1354 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1355 
1356 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(ROUNDD__NEONV8,positive_zero)1357   TEST(ROUNDD__NEONV8, positive_zero) {
1358     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1359     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1360     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x00000000));
1361     xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1362     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1363     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1364       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1365       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1366       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1367   }
1368 
TEST(ROUNDD__NEONV8,negative_zero)1369   TEST(ROUNDD__NEONV8, negative_zero) {
1370     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1371     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1372     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x80000000));
1373     xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1374     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1375     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1376       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1377       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1378       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1379   }
1380 
TEST(ROUNDD__NEONV8,positive_subnormal)1381   TEST(ROUNDD__NEONV8, positive_subnormal) {
1382     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1383     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1384     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x00800000); n += kBlockSize) {
1385       for (uint32_t i = 0; i < kBlockSize; i++) {
1386         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x00000001)));
1387       }
1388       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1389       for (uint32_t i = 0; i < kBlockSize; i++) {
1390         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1391         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1392           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1393           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1394           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1395       }
1396     }
1397   }
1398 
TEST(ROUNDD__NEONV8,negative_subnormal)1399   TEST(ROUNDD__NEONV8, negative_subnormal) {
1400     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1401     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1402     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0x80800000); n += kBlockSize) {
1403       for (uint32_t i = 0; i < kBlockSize; i++) {
1404         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x80000001)));
1405       }
1406       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1407       for (uint32_t i = 0; i < kBlockSize; i++) {
1408         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1409         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1410           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1411           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1412           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1413       }
1414     }
1415   }
1416 
TEST(ROUNDD__NEONV8,positive_normal)1417   TEST(ROUNDD__NEONV8, positive_normal) {
1418     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1419     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1420     for (uint32_t n = UINT32_C(0x00800000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1421       for (uint32_t i = 0; i < kBlockSize; i++) {
1422         inputs[i] = fp32_from_bits(n + i);
1423       }
1424       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1425       for (uint32_t i = 0; i < kBlockSize; i++) {
1426         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1427         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1428           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1429           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1430           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1431       }
1432     }
1433   }
1434 
TEST(ROUNDD__NEONV8,negative_normal)1435   TEST(ROUNDD__NEONV8, negative_normal) {
1436     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1437     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1438     for (uint32_t n = UINT32_C(0x80800000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1439       for (uint32_t i = 0; i < kBlockSize; i++) {
1440         inputs[i] = fp32_from_bits(n + i);
1441       }
1442       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1443       for (uint32_t i = 0; i < kBlockSize; i++) {
1444         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1445         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1446           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1447           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1448           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1449       }
1450     }
1451   }
1452 
TEST(ROUNDD__NEONV8,positive_integral)1453   TEST(ROUNDD__NEONV8, positive_integral) {
1454     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1455     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1456     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1457       for (uint32_t i = 0; i < kBlockSize; i++) {
1458         inputs[i] = fp32_from_bits(n + i);
1459       }
1460       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1461       for (uint32_t i = 0; i < kBlockSize; i++) {
1462         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1463         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1464           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1465           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1466           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1467       }
1468     }
1469   }
1470 
TEST(ROUNDD__NEONV8,negative_integral)1471   TEST(ROUNDD__NEONV8, negative_integral) {
1472     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1473     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1474     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1475       for (uint32_t i = 0; i < kBlockSize; i++) {
1476         inputs[i] = fp32_from_bits(n + i);
1477       }
1478       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1479       for (uint32_t i = 0; i < kBlockSize; i++) {
1480         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1481         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1482           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1483           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1484           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1485       }
1486     }
1487   }
1488 
TEST(ROUNDD__NEONV8,positive_infinity)1489   TEST(ROUNDD__NEONV8, positive_infinity) {
1490     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1491     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1492     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1493     xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1494     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1495     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1496       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1497       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1498       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1499   }
1500 
TEST(ROUNDD__NEONV8,negative_infinity)1501   TEST(ROUNDD__NEONV8, negative_infinity) {
1502     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1503     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1504     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1505     xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1506     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1507     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1508       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1509       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1510       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1511   }
1512 
TEST(ROUNDD__NEONV8,positive_qnan)1513   TEST(ROUNDD__NEONV8, positive_qnan) {
1514     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1515     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1516     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1517       for (uint32_t i = 0; i < kBlockSize; i++) {
1518         inputs[i] = fp32_from_bits(n + i);
1519       }
1520       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1521       for (uint32_t i = 0; i < kBlockSize; i++) {
1522         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1523         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1524           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1525           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1526           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1527       }
1528     }
1529   }
1530 
TEST(ROUNDD__NEONV8,negative_qnan)1531   TEST(ROUNDD__NEONV8, negative_qnan) {
1532     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1533     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1534     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1535       for (uint32_t i = 0; i < kBlockSize; i++) {
1536         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1537       }
1538       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1539       for (uint32_t i = 0; i < kBlockSize; i++) {
1540         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1541         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1542           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1543           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1544           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1545       }
1546     }
1547   }
1548 
TEST(ROUNDD__NEONV8,positive_snan)1549   TEST(ROUNDD__NEONV8, positive_snan) {
1550     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1551     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1552     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1553       for (uint32_t i = 0; i < kBlockSize; i++) {
1554         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1555       }
1556       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1557       for (uint32_t i = 0; i < kBlockSize; i++) {
1558         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1559         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1560           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1561           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1562           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1563       }
1564     }
1565   }
1566 
TEST(ROUNDD__NEONV8,negative_snan)1567   TEST(ROUNDD__NEONV8, negative_snan) {
1568     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1569     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1570     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1571       for (uint32_t i = 0; i < kBlockSize; i++) {
1572         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1573       }
1574       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1575       for (uint32_t i = 0; i < kBlockSize; i++) {
1576         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1577         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1578           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1579           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1580           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1581       }
1582     }
1583   }
1584 
TEST(ROUNDD__NEONV8,positive_snan_to_qnan)1585   TEST(ROUNDD__NEONV8, positive_snan_to_qnan) {
1586     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1587     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1588     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1589       for (uint32_t i = 0; i < kBlockSize; i++) {
1590         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1591       }
1592       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1593       for (uint32_t i = 0; i < kBlockSize; i++) {
1594         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1595         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1596           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1597           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1598           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1599       }
1600     }
1601   }
1602 
TEST(ROUNDD__NEONV8,negative_snan_to_qnan)1603   TEST(ROUNDD__NEONV8, negative_snan_to_qnan) {
1604     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1605     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1606     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1607       for (uint32_t i = 0; i < kBlockSize; i++) {
1608         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1609       }
1610       xnn_math_f32_roundd__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1611       for (uint32_t i = 0; i < kBlockSize; i++) {
1612         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1613         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1614           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1615           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1616           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1617       }
1618     }
1619   }
1620 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1621 
1622 #if XNN_ARCH_WASMSIMD
TEST(ROUNDD__WASMSIMD_ADDSUB,positive_zero)1623   TEST(ROUNDD__WASMSIMD_ADDSUB, positive_zero) {
1624     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1625     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1626     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x00000000));
1627     xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1628     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1629     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1630       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1631       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1632       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1633   }
1634 
TEST(ROUNDD__WASMSIMD_ADDSUB,negative_zero)1635   TEST(ROUNDD__WASMSIMD_ADDSUB, negative_zero) {
1636     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1637     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1638     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x80000000));
1639     xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1640     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1641     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1642       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1643       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1644       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1645   }
1646 
TEST(ROUNDD__WASMSIMD_ADDSUB,positive_subnormal)1647   TEST(ROUNDD__WASMSIMD_ADDSUB, positive_subnormal) {
1648     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1649     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1650     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x00800000); n += kBlockSize) {
1651       for (uint32_t i = 0; i < kBlockSize; i++) {
1652         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x00000001)));
1653       }
1654       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1655       for (uint32_t i = 0; i < kBlockSize; i++) {
1656         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1657         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1658           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1659           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1660           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1661       }
1662     }
1663   }
1664 
TEST(ROUNDD__WASMSIMD_ADDSUB,negative_subnormal)1665   TEST(ROUNDD__WASMSIMD_ADDSUB, negative_subnormal) {
1666     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1667     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1668     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0x80800000); n += kBlockSize) {
1669       for (uint32_t i = 0; i < kBlockSize; i++) {
1670         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x80000001)));
1671       }
1672       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1673       for (uint32_t i = 0; i < kBlockSize; i++) {
1674         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1675         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1676           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1677           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1678           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1679       }
1680     }
1681   }
1682 
TEST(ROUNDD__WASMSIMD_ADDSUB,positive_normal)1683   TEST(ROUNDD__WASMSIMD_ADDSUB, positive_normal) {
1684     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1685     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1686     for (uint32_t n = UINT32_C(0x00800000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1687       for (uint32_t i = 0; i < kBlockSize; i++) {
1688         inputs[i] = fp32_from_bits(n + i);
1689       }
1690       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1691       for (uint32_t i = 0; i < kBlockSize; i++) {
1692         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1693         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1694           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1695           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1696           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1697       }
1698     }
1699   }
1700 
TEST(ROUNDD__WASMSIMD_ADDSUB,negative_normal)1701   TEST(ROUNDD__WASMSIMD_ADDSUB, negative_normal) {
1702     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1703     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1704     for (uint32_t n = UINT32_C(0x80800000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1705       for (uint32_t i = 0; i < kBlockSize; i++) {
1706         inputs[i] = fp32_from_bits(n + i);
1707       }
1708       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1709       for (uint32_t i = 0; i < kBlockSize; i++) {
1710         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1711         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1712           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1713           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1714           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1715       }
1716     }
1717   }
1718 
TEST(ROUNDD__WASMSIMD_ADDSUB,positive_integral)1719   TEST(ROUNDD__WASMSIMD_ADDSUB, positive_integral) {
1720     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1721     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1722     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1723       for (uint32_t i = 0; i < kBlockSize; i++) {
1724         inputs[i] = fp32_from_bits(n + i);
1725       }
1726       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1727       for (uint32_t i = 0; i < kBlockSize; i++) {
1728         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1729         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1730           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1731           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1732           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1733       }
1734     }
1735   }
1736 
TEST(ROUNDD__WASMSIMD_ADDSUB,negative_integral)1737   TEST(ROUNDD__WASMSIMD_ADDSUB, negative_integral) {
1738     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1739     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1740     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1741       for (uint32_t i = 0; i < kBlockSize; i++) {
1742         inputs[i] = fp32_from_bits(n + i);
1743       }
1744       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1745       for (uint32_t i = 0; i < kBlockSize; i++) {
1746         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1747         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1748           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1749           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1750           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1751       }
1752     }
1753   }
1754 
TEST(ROUNDD__WASMSIMD_ADDSUB,positive_infinity)1755   TEST(ROUNDD__WASMSIMD_ADDSUB, positive_infinity) {
1756     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1757     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1758     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1759     xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1760     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1761     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1762       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1763       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1764       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1765   }
1766 
TEST(ROUNDD__WASMSIMD_ADDSUB,negative_infinity)1767   TEST(ROUNDD__WASMSIMD_ADDSUB, negative_infinity) {
1768     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1769     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1770     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1771     xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1772     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1773     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1774       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1775       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1776       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1777   }
1778 
TEST(ROUNDD__WASMSIMD_ADDSUB,positive_qnan)1779   TEST(ROUNDD__WASMSIMD_ADDSUB, positive_qnan) {
1780     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1781     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1782     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1783       for (uint32_t i = 0; i < kBlockSize; i++) {
1784         inputs[i] = fp32_from_bits(n + i);
1785       }
1786       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1787       for (uint32_t i = 0; i < kBlockSize; i++) {
1788         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1789         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1790           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1791           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1792           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1793       }
1794     }
1795   }
1796 
TEST(ROUNDD__WASMSIMD_ADDSUB,negative_qnan)1797   TEST(ROUNDD__WASMSIMD_ADDSUB, negative_qnan) {
1798     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1799     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1800     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1801       for (uint32_t i = 0; i < kBlockSize; i++) {
1802         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1803       }
1804       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1805       for (uint32_t i = 0; i < kBlockSize; i++) {
1806         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1807         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1808           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1809           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1810           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1811       }
1812     }
1813   }
1814 
TEST(ROUNDD__WASMSIMD_ADDSUB,positive_snan)1815   TEST(ROUNDD__WASMSIMD_ADDSUB, positive_snan) {
1816     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1817     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1818     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1819       for (uint32_t i = 0; i < kBlockSize; i++) {
1820         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1821       }
1822       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1823       for (uint32_t i = 0; i < kBlockSize; i++) {
1824         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1825         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1826           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1827           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1828           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1829       }
1830     }
1831   }
1832 
TEST(ROUNDD__WASMSIMD_ADDSUB,negative_snan)1833   TEST(ROUNDD__WASMSIMD_ADDSUB, negative_snan) {
1834     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1835     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1836     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1837       for (uint32_t i = 0; i < kBlockSize; i++) {
1838         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1839       }
1840       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1841       for (uint32_t i = 0; i < kBlockSize; i++) {
1842         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1843         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1844           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1845           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1846           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1847       }
1848     }
1849   }
1850 
TEST(ROUNDD__WASMSIMD_ADDSUB,positive_snan_to_qnan)1851   TEST(ROUNDD__WASMSIMD_ADDSUB, positive_snan_to_qnan) {
1852     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1853     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1854     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1855       for (uint32_t i = 0; i < kBlockSize; i++) {
1856         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1857       }
1858       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1859       for (uint32_t i = 0; i < kBlockSize; i++) {
1860         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1861         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1862           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1863           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1864           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1865       }
1866     }
1867   }
1868 
TEST(ROUNDD__WASMSIMD_ADDSUB,negative_snan_to_qnan)1869   TEST(ROUNDD__WASMSIMD_ADDSUB, negative_snan_to_qnan) {
1870     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1871     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1872     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1873       for (uint32_t i = 0; i < kBlockSize; i++) {
1874         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1875       }
1876       xnn_math_f32_roundd__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1877       for (uint32_t i = 0; i < kBlockSize; i++) {
1878         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1879         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1880           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1881           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1882           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1883       }
1884     }
1885   }
1886 #endif  // XNN_ARCH_WASMSIMD
1887 
1888 #if XNN_ARCH_WASMSIMD
TEST(ROUNDD__WASMSIMD_CVT,positive_zero)1889   TEST(ROUNDD__WASMSIMD_CVT, positive_zero) {
1890     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1891     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1892     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x00000000));
1893     xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1894     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1895     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1896       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1897       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1898       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1899   }
1900 
TEST(ROUNDD__WASMSIMD_CVT,negative_zero)1901   TEST(ROUNDD__WASMSIMD_CVT, negative_zero) {
1902     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1903     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1904     std::fill(inputs.begin(), inputs.end(), UINT32_C(0x80000000));
1905     xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1906     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
1907     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1908       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1909       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1910       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1911   }
1912 
TEST(ROUNDD__WASMSIMD_CVT,positive_subnormal)1913   TEST(ROUNDD__WASMSIMD_CVT, positive_subnormal) {
1914     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1915     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1916     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x00800000); n += kBlockSize) {
1917       for (uint32_t i = 0; i < kBlockSize; i++) {
1918         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x00000001)));
1919       }
1920       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1921       for (uint32_t i = 0; i < kBlockSize; i++) {
1922         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1923         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1924           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1925           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1926           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1927       }
1928     }
1929   }
1930 
TEST(ROUNDD__WASMSIMD_CVT,negative_subnormal)1931   TEST(ROUNDD__WASMSIMD_CVT, negative_subnormal) {
1932     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1933     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1934     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0x80800000); n += kBlockSize) {
1935       for (uint32_t i = 0; i < kBlockSize; i++) {
1936         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x80000001)));
1937       }
1938       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1939       for (uint32_t i = 0; i < kBlockSize; i++) {
1940         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1941         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1942           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1943           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1944           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1945       }
1946     }
1947   }
1948 
TEST(ROUNDD__WASMSIMD_CVT,positive_normal)1949   TEST(ROUNDD__WASMSIMD_CVT, positive_normal) {
1950     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1951     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1952     for (uint32_t n = UINT32_C(0x00800000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1953       for (uint32_t i = 0; i < kBlockSize; i++) {
1954         inputs[i] = fp32_from_bits(n + i);
1955       }
1956       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1957       for (uint32_t i = 0; i < kBlockSize; i++) {
1958         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1959         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1960           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1961           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1962           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1963       }
1964     }
1965   }
1966 
TEST(ROUNDD__WASMSIMD_CVT,negative_normal)1967   TEST(ROUNDD__WASMSIMD_CVT, negative_normal) {
1968     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1969     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1970     for (uint32_t n = UINT32_C(0x80800000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1971       for (uint32_t i = 0; i < kBlockSize; i++) {
1972         inputs[i] = fp32_from_bits(n + i);
1973       }
1974       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1975       for (uint32_t i = 0; i < kBlockSize; i++) {
1976         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1977         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1978           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1979           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1980           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1981       }
1982     }
1983   }
1984 
TEST(ROUNDD__WASMSIMD_CVT,positive_integral)1985   TEST(ROUNDD__WASMSIMD_CVT, positive_integral) {
1986     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1987     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1988     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1989       for (uint32_t i = 0; i < kBlockSize; i++) {
1990         inputs[i] = fp32_from_bits(n + i);
1991       }
1992       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1993       for (uint32_t i = 0; i < kBlockSize; i++) {
1994         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
1995         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1996           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1997           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1998           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1999       }
2000     }
2001   }
2002 
TEST(ROUNDD__WASMSIMD_CVT,negative_integral)2003   TEST(ROUNDD__WASMSIMD_CVT, negative_integral) {
2004     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2005     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2006     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
2007       for (uint32_t i = 0; i < kBlockSize; i++) {
2008         inputs[i] = fp32_from_bits(n + i);
2009       }
2010       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2011       for (uint32_t i = 0; i < kBlockSize; i++) {
2012         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
2013         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
2014           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
2015           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2016           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
2017       }
2018     }
2019   }
2020 
TEST(ROUNDD__WASMSIMD_CVT,positive_infinity)2021   TEST(ROUNDD__WASMSIMD_CVT, positive_infinity) {
2022     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2023     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2024     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
2025     xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2026     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
2027     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
2028       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
2029       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2030       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
2031   }
2032 
TEST(ROUNDD__WASMSIMD_CVT,negative_infinity)2033   TEST(ROUNDD__WASMSIMD_CVT, negative_infinity) {
2034     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2035     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2036     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
2037     xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2038     const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
2039     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
2040       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
2041       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2042       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
2043   }
2044 
TEST(ROUNDD__WASMSIMD_CVT,positive_qnan)2045   TEST(ROUNDD__WASMSIMD_CVT, positive_qnan) {
2046     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2047     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2048     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
2049       for (uint32_t i = 0; i < kBlockSize; i++) {
2050         inputs[i] = fp32_from_bits(n + i);
2051       }
2052       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2053       for (uint32_t i = 0; i < kBlockSize; i++) {
2054         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
2055         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
2056           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
2057           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2058           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
2059       }
2060     }
2061   }
2062 
TEST(ROUNDD__WASMSIMD_CVT,negative_qnan)2063   TEST(ROUNDD__WASMSIMD_CVT, negative_qnan) {
2064     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2065     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2066     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
2067       for (uint32_t i = 0; i < kBlockSize; i++) {
2068         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
2069       }
2070       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2071       for (uint32_t i = 0; i < kBlockSize; i++) {
2072         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
2073         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
2074           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
2075           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2076           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
2077       }
2078     }
2079   }
2080 
TEST(ROUNDD__WASMSIMD_CVT,positive_snan)2081   TEST(ROUNDD__WASMSIMD_CVT, positive_snan) {
2082     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2083     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2084     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
2085       for (uint32_t i = 0; i < kBlockSize; i++) {
2086         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
2087       }
2088       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2089       for (uint32_t i = 0; i < kBlockSize; i++) {
2090         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
2091         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
2092           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
2093           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2094           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
2095       }
2096     }
2097   }
2098 
TEST(ROUNDD__WASMSIMD_CVT,negative_snan)2099   TEST(ROUNDD__WASMSIMD_CVT, negative_snan) {
2100     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2101     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2102     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
2103       for (uint32_t i = 0; i < kBlockSize; i++) {
2104         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
2105       }
2106       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2107       for (uint32_t i = 0; i < kBlockSize; i++) {
2108         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
2109         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
2110           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
2111           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2112           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
2113       }
2114     }
2115   }
2116 
TEST(ROUNDD__WASMSIMD_CVT,positive_snan_to_qnan)2117   TEST(ROUNDD__WASMSIMD_CVT, positive_snan_to_qnan) {
2118     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2119     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2120     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
2121       for (uint32_t i = 0; i < kBlockSize; i++) {
2122         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
2123       }
2124       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2125       for (uint32_t i = 0; i < kBlockSize; i++) {
2126         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
2127         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
2128           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
2129           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2130           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
2131       }
2132     }
2133   }
2134 
TEST(ROUNDD__WASMSIMD_CVT,negative_snan_to_qnan)2135   TEST(ROUNDD__WASMSIMD_CVT, negative_snan_to_qnan) {
2136     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
2137     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
2138     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
2139       for (uint32_t i = 0; i < kBlockSize; i++) {
2140         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
2141       }
2142       xnn_math_f32_roundd__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
2143       for (uint32_t i = 0; i < kBlockSize; i++) {
2144         const uint32_t reference_output = fp32_to_bits(std::floor(inputs[i]));
2145         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
2146           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
2147           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
2148           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
2149       }
2150     }
2151   }
2152 #endif  // XNN_ARCH_WASMSIMD
2153 
TEST(ROUNDD__SCALAR_ADDSUB, positive_zero) {
  // floor(+0.0f) must be +0.0f with the sign bit preserved.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  // Fix: fill with the float value +0.0f instead of the integer literal
  // UINT32_C(0x00000000), which only produced +0.0f via implicit int->float
  // conversion rather than as the intended bit pattern.
  std::fill(inputs.begin(), inputs.end(), +0.0f);
  xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
  ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
}
2165 
TEST(ROUNDD__SCALAR_ADDSUB, negative_zero) {
  // floor(-0.0f) must be -0.0f with the sign bit preserved.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  // Bug fix: std::fill with UINT32_C(0x80000000) implicitly converted the
  // integer to 2147483648.0f, NOT to the -0.0f bit pattern, so this test
  // never actually exercised negative zero. Fill with -0.0f directly.
  std::fill(inputs.begin(), inputs.end(), -0.0f);
  xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
  ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
}
2177 
TEST(ROUNDD__SCALAR_ADDSUB, positive_subnormal) {
  // Sweep every positive subnormal bit pattern and check roundd against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x00000000); block < UINT32_C(0x00800000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      // Clamp to 0x00000001 so the +0.0f slot still holds a subnormal input.
      inputs[offset] = fp32_from_bits(std::max<uint32_t>(block + offset, UINT32_C(0x00000001)));
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2195 
TEST(ROUNDD__SCALAR_ADDSUB, negative_subnormal) {
  // Sweep every negative subnormal bit pattern and check roundd against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x80000000); block < UINT32_C(0x80800000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      // Clamp to 0x80000001 so the -0.0f slot still holds a subnormal input.
      inputs[offset] = fp32_from_bits(std::max<uint32_t>(block + offset, UINT32_C(0x80000001)));
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2213 
TEST(ROUNDD__SCALAR_ADDSUB, positive_normal) {
  // Sweep positive normal values below 2**24 and check roundd against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x00800000); block < UINT32_C(0x4B800000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      inputs[offset] = fp32_from_bits(block + offset);
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2231 
TEST(ROUNDD__SCALAR_ADDSUB, negative_normal) {
  // Sweep negative normal values above -2**24 and check roundd against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x80800000); block < UINT32_C(0xCB800000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      inputs[offset] = fp32_from_bits(block + offset);
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2249 
TEST(ROUNDD__SCALAR_ADDSUB, positive_integral) {
  // Values >= 2**24 are already integral; roundd must return them unchanged (match std::floor).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x4B800000); block < UINT32_C(0x7F800000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      inputs[offset] = fp32_from_bits(block + offset);
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2267 
TEST(ROUNDD__SCALAR_ADDSUB, negative_integral) {
  // Values <= -2**24 are already integral; roundd must return them unchanged (match std::floor).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0xCB800000); block < UINT32_C(0xFF800000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      inputs[offset] = fp32_from_bits(block + offset);
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2285 
TEST(ROUNDD__SCALAR_ADDSUB, positive_infinity) {
  // floor(+inf) is +inf; verify the exact bit pattern is preserved.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
  xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t expected = fp32_to_bits(std::floor(inputs[0]));
  const uint32_t actual = fp32_to_bits(outputs[0]);
  ASSERT_EQ(expected, actual)
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
}
2297 
TEST(ROUNDD__SCALAR_ADDSUB, negative_infinity) {
  // floor(-inf) is -inf; verify the exact bit pattern is preserved.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
  xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t expected = fp32_to_bits(std::floor(inputs[0]));
  const uint32_t actual = fp32_to_bits(outputs[0]);
  ASSERT_EQ(expected, actual)
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
}
2309 
TEST(ROUNDD__SCALAR_ADDSUB, positive_qnan) {
  // Quiet NaNs must pass through roundd bit-exactly (same as std::floor).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x7FC00000); block < UINT32_C(0x80000000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      inputs[offset] = fp32_from_bits(block + offset);
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2327 
TEST(ROUNDD__SCALAR_ADDSUB, negative_qnan) {
  // Negative quiet NaNs must pass through roundd bit-exactly (same as std::floor).
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x7FC00000); block < UINT32_C(0x80000000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      // Set the sign bit on top of the positive qNaN pattern.
      inputs[offset] = fp32_from_bits(UINT32_C(0x80000000) | (block + offset));
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2345 
TEST(ROUNDD__SCALAR_ADDSUB, positive_snan) {
  // Signaling NaNs must stay NaN; the quiet bit (0x00400000) is masked out of the comparison.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x7F800000); block < UINT32_C(0x7FC00000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      // Clamp to 0x7F800001 so the +inf slot still holds an sNaN input.
      inputs[offset] = fp32_from_bits(std::max<uint32_t>(block + offset, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected & UINT32_C(0xFFBFFFFF), actual & UINT32_C(0xFFBFFFFF))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2363 
TEST(ROUNDD__SCALAR_ADDSUB, negative_snan) {
  // Negative signaling NaNs must stay NaN; the quiet bit is masked out of the comparison.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x7F800000); block < UINT32_C(0x7FC00000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      inputs[offset] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(block + offset, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected & UINT32_C(0xFFBFFFFF), actual & UINT32_C(0xFFBFFFFF))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2381 
TEST(ROUNDD__SCALAR_ADDSUB, positive_snan_to_qnan) {
  // Unlike positive_snan, this compares all bits: sNaN inputs must quiet exactly like std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x7F800000); block < UINT32_C(0x7FC00000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      inputs[offset] = fp32_from_bits(std::max<uint32_t>(block + offset, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2399 
TEST(ROUNDD__SCALAR_ADDSUB, negative_snan_to_qnan) {
  // Full-bit comparison: negative sNaN inputs must quiet exactly like std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x7F800000); block < UINT32_C(0x7FC00000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      inputs[offset] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(block + offset, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundd__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2417 
TEST(ROUNDD__SCALAR_CVT, positive_zero) {
  // floor(+0.0f) must be +0.0f with the sign bit preserved.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  // Fix: fill with the float value +0.0f instead of the integer literal
  // UINT32_C(0x00000000), which only produced +0.0f via implicit int->float
  // conversion rather than as the intended bit pattern.
  std::fill(inputs.begin(), inputs.end(), +0.0f);
  xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
  ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
}
2429 
TEST(ROUNDD__SCALAR_CVT, negative_zero) {
  // floor(-0.0f) must be -0.0f with the sign bit preserved.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  // Bug fix: std::fill with UINT32_C(0x80000000) implicitly converted the
  // integer to 2147483648.0f, NOT to the -0.0f bit pattern, so this test
  // never actually exercised negative zero. Fill with -0.0f directly.
  std::fill(inputs.begin(), inputs.end(), -0.0f);
  xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t reference_output = fp32_to_bits(std::floor(inputs[0]));
  ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
}
2441 
TEST(ROUNDD__SCALAR_CVT, positive_subnormal) {
  // Sweep every positive subnormal bit pattern and check roundd against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block = UINT32_C(0x00000000); block < UINT32_C(0x00800000); block += kBlockSize) {
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      // Clamp to 0x00000001 so the +0.0f slot still holds a subnormal input.
      inputs[offset] = fp32_from_bits(std::max<uint32_t>(block + offset, UINT32_C(0x00000001)));
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t offset = 0; offset < kBlockSize; offset++) {
      const uint32_t expected = fp32_to_bits(std::floor(inputs[offset]));
      const uint32_t actual = fp32_to_bits(outputs[offset]);
      ASSERT_EQ(expected, actual)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[offset])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << actual;
    }
  }
}
2459 
TEST(ROUNDD__SCALAR_CVT, negative_subnormal) {
  // Sweep every negative subnormal encoding [0x80000001, 0x80800000) in
  // kBlockSize-sized batches and compare the kernel bit-for-bit against
  // std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x80000000); batch < UINT32_C(0x80800000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp to 0x80000001 so the very first element is a subnormal, not -0.
      inputs[idx] = fp32_from_bits(std::max<uint32_t>(batch + idx, UINT32_C(0x80000001)));
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2477 
TEST(ROUNDD__SCALAR_CVT, positive_normal) {
  // Sweep every positive normal encoding in [0x00800000, 0x4B800000) — i.e.
  // values in [2^-126, 2^24), where rounding down is non-trivial — and
  // compare the kernel bit-for-bit against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x00800000); batch < UINT32_C(0x4B800000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(batch + idx);
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2495 
TEST(ROUNDD__SCALAR_CVT, negative_normal) {
  // Sweep every negative normal encoding in [0x80800000, 0xCB800000) — i.e.
  // values in (-2^24, -2^-126], where rounding down is non-trivial — and
  // compare the kernel bit-for-bit against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x80800000); batch < UINT32_C(0xCB800000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(batch + idx);
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2513 
TEST(ROUNDD__SCALAR_CVT, positive_integral) {
  // Sweep every encoding in [0x4B800000, 0x7F800000) — finite values >= 2^24,
  // which are already integral in fp32 and must pass through unchanged — and
  // compare the kernel bit-for-bit against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x4B800000); batch < UINT32_C(0x7F800000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(batch + idx);
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2531 
TEST(ROUNDD__SCALAR_CVT, negative_integral) {
  // Sweep every encoding in [0xCB800000, 0xFF800000) — finite values
  // <= -2^24, which are already integral in fp32 and must pass through
  // unchanged — and compare the kernel bit-for-bit against std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0xCB800000); batch < UINT32_C(0xFF800000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(batch + idx);
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2549 
TEST(ROUNDD__SCALAR_CVT, positive_infinity) {
  // +inf rounds down to itself; compare the kernel bit-for-bit against
  // std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  const float pos_inf = +std::numeric_limits<float>::infinity();
  std::fill(inputs.begin(), inputs.end(), pos_inf);
  xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[0]));
  ASSERT_EQ(expected_bits, fp32_to_bits(outputs[0]))
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
}
2561 
TEST(ROUNDD__SCALAR_CVT, negative_infinity) {
  // -inf rounds down to itself; compare the kernel bit-for-bit against
  // std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  const float neg_inf = -std::numeric_limits<float>::infinity();
  std::fill(inputs.begin(), inputs.end(), neg_inf);
  xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[0]));
  ASSERT_EQ(expected_bits, fp32_to_bits(outputs[0]))
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
}
2573 
TEST(ROUNDD__SCALAR_CVT, positive_qnan) {
  // Sweep every positive quiet-NaN encoding [0x7FC00000, 0x80000000) and
  // check the kernel produces the same bit pattern as std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x7FC00000); batch < UINT32_C(0x80000000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(batch + idx);
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2591 
TEST(ROUNDD__SCALAR_CVT, negative_qnan) {
  // Sweep every negative quiet-NaN encoding (positive qNaN range with the
  // sign bit ORed in) and check the kernel produces the same bit pattern as
  // std::floor.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x7FC00000); batch < UINT32_C(0x80000000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(UINT32_C(0x80000000) | (batch + idx));
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2609 
TEST(ROUNDD__SCALAR_CVT, positive_snan) {
  // Sweep every positive signaling-NaN encoding [0x7F800001, 0x7FC00000).
  // The comparison masks out bit 22 (0x00400000, the quiet bit), so the
  // check passes whether or not the sNaN was quieted, as long as sign and
  // payload match the reference.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x7F800000); batch < UINT32_C(0x7FC00000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp to 0x7F800001 so the very first element is an sNaN, not +inf.
      inputs[idx] = fp32_from_bits(std::max<uint32_t>(batch + idx, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[idx]) & UINT32_C(0xFFBFFFFF))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2627 
TEST(ROUNDD__SCALAR_CVT, negative_snan) {
  // Sweep every negative signaling-NaN encoding (positive sNaN range with
  // the sign bit ORed in). The comparison masks out bit 22 (0x00400000, the
  // quiet bit), so the check passes whether or not the sNaN was quieted, as
  // long as sign and payload match the reference.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x7F800000); batch < UINT32_C(0x7FC00000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp to 0x7F800001 so the very first element is an sNaN, not inf.
      inputs[idx] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(batch + idx, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[idx]) & UINT32_C(0xFFBFFFFF))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2645 
TEST(ROUNDD__SCALAR_CVT, positive_snan_to_qnan) {
  // Sweep every positive signaling-NaN encoding [0x7F800001, 0x7FC00000) and
  // require exact bit equality with std::floor — unlike positive_snan, the
  // quiet bit is NOT masked, so the kernel must quiet sNaNs exactly as the
  // reference does.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x7F800000); batch < UINT32_C(0x7FC00000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp to 0x7F800001 so the very first element is an sNaN, not +inf.
      inputs[idx] = fp32_from_bits(std::max<uint32_t>(batch + idx, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2663 
TEST(ROUNDD__SCALAR_CVT, negative_snan_to_qnan) {
  // Sweep every negative signaling-NaN encoding (positive sNaN range with
  // the sign bit ORed in) and require exact bit equality with std::floor —
  // unlike negative_snan, the quiet bit is NOT masked, so the kernel must
  // quiet sNaNs exactly as the reference does.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t batch = UINT32_C(0x7F800000); batch < UINT32_C(0x7FC00000); batch += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp to 0x7F800001 so the very first element is an sNaN, not inf.
      inputs[idx] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(batch + idx, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundd__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected_bits = fp32_to_bits(std::floor(inputs[idx]));
      ASSERT_EQ(expected_bits, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected_bits
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2681