1 // Copyright 2020 Google LLC 2 // 3 // This source code is licensed under the BSD-style license found in the 4 // LICENSE file in the root directory of this source tree. 5 6 #include <algorithm> 7 #include <cmath> 8 #include <cstddef> 9 #include <cstdint> 10 #include <cstdlib> 11 #include <iomanip> 12 #include <ios> 13 #include <vector> 14 15 #include <gtest/gtest.h> 16 17 #include <fp16.h> 18 19 #include <xnnpack/AlignedAllocator.h> 20 #include <xnnpack/common.h> 21 #include <xnnpack/isa-checks.h> 22 #include <xnnpack/math-stubs.h> 23 24 25 constexpr int kBlockSize = 1024; 26 27 28 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(EXP__NEONFMA_RR2_LUT64_P2,negative_zero)29 TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_zero) { 30 TEST_REQUIRES_ARM_NEON_FMA; 31 32 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 33 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 34 std::fill(inputs.begin(), inputs.end(), -0.0f); 35 xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 36 const float reference_output = 1.0f; 37 ASSERT_EQ(reference_output, outputs[0]) 38 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 39 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 40 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 41 } 42 TEST(EXP__NEONFMA_RR2_LUT64_P2,positive_zero)43 TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_zero) { 44 TEST_REQUIRES_ARM_NEON_FMA; 45 46 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 47 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 48 std::fill(inputs.begin(), inputs.end(), +0.0f); 49 xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 50 const float reference_output = 1.0f; 51 ASSERT_EQ(reference_output, outputs[0]) 52 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 53 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 54 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 55 } 56 TEST(EXP__NEONFMA_RR2_LUT64_P2,negative_saturation)57 TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_saturation) { 58 TEST_REQUIRES_ARM_NEON_FMA; 59 60 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 61 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 62 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 63 for (uint32_t i = 0; i < kBlockSize; i++) { 64 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 65 } 66 xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 67 for (uint32_t i = 0; i < kBlockSize; i++) { 68 const uint32_t reference_output = UINT32_C(0x00000000); 69 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 70 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 71 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 72 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 73 } 74 } 75 } 76 TEST(EXP__NEONFMA_RR2_LUT64_P2,positive_overflow)77 TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_overflow) { 78 TEST_REQUIRES_ARM_NEON_FMA; 79 80 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 81 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 82 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 83 for (uint32_t i = 0; i < kBlockSize; i++) { 84 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 85 } 86 xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 87 for (uint32_t i = 0; i < kBlockSize; i++) { 88 const uint32_t reference_output = UINT32_C(0x7F800000); 89 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 90 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 91 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 92 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 93 } 94 } 95 } 96 TEST(EXP__NEONFMA_RR2_LUT64_P2,positive_nan)97 TEST(EXP__NEONFMA_RR2_LUT64_P2, positive_nan) { 98 TEST_REQUIRES_ARM_NEON_FMA; 99 100 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 101 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 102 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 103 for (uint32_t i = 0; i < kBlockSize; i++) { 104 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 105 } 106 xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 107 for (uint32_t i = 0; i < kBlockSize; i++) { 108 ASSERT_TRUE(std::isnan(outputs[i])) 109 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 110 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 111 } 112 } 113 } 114 TEST(EXP__NEONFMA_RR2_LUT64_P2,negative_nan)115 TEST(EXP__NEONFMA_RR2_LUT64_P2, negative_nan) { 116 TEST_REQUIRES_ARM_NEON_FMA; 117 118 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 119 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 120 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 121 for (uint32_t i = 0; i < kBlockSize; i++) { 122 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 123 } 124 xnn_math_f32_exp__neonfma_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 125 for (uint32_t i = 0; i < kBlockSize; i++) { 126 ASSERT_TRUE(std::isnan(outputs[i])) 127 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 128 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 129 } 130 } 131 } 132 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 133 134 135 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(EXP__NEONFMA_RR2_P5,negative_zero)136 TEST(EXP__NEONFMA_RR2_P5, negative_zero) { 137 TEST_REQUIRES_ARM_NEON_FMA; 138 139 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 140 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 141 std::fill(inputs.begin(), inputs.end(), -0.0f); 142 xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 143 const float reference_output = 1.0f; 144 ASSERT_EQ(reference_output, outputs[0]) 145 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 146 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 147 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 148 } 149 TEST(EXP__NEONFMA_RR2_P5,positive_zero)150 TEST(EXP__NEONFMA_RR2_P5, positive_zero) { 151 TEST_REQUIRES_ARM_NEON_FMA; 152 153 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 154 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 155 std::fill(inputs.begin(), inputs.end(), +0.0f); 156 xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 157 const float reference_output = 1.0f; 158 ASSERT_EQ(reference_output, outputs[0]) 159 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 160 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 161 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 162 } 163 TEST(EXP__NEONFMA_RR2_P5,negative_saturation)164 TEST(EXP__NEONFMA_RR2_P5, negative_saturation) { 165 TEST_REQUIRES_ARM_NEON_FMA; 166 167 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 168 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 169 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 170 for (uint32_t i = 0; i < kBlockSize; i++) { 171 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 172 } 173 xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 174 for (uint32_t i = 0; i < kBlockSize; i++) { 175 const uint32_t reference_output = UINT32_C(0x00000000); 176 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 177 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 178 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 179 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 180 } 181 } 182 } 183 TEST(EXP__NEONFMA_RR2_P5,positive_overflow)184 TEST(EXP__NEONFMA_RR2_P5, positive_overflow) { 185 TEST_REQUIRES_ARM_NEON_FMA; 186 187 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 188 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 189 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 190 for (uint32_t i = 0; i < kBlockSize; i++) { 191 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 192 } 193 xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 194 for (uint32_t i = 0; i < kBlockSize; i++) { 195 const uint32_t reference_output = UINT32_C(0x7F800000); 196 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 197 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 198 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 199 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 200 } 201 } 202 } 203 TEST(EXP__NEONFMA_RR2_P5,positive_nan)204 TEST(EXP__NEONFMA_RR2_P5, positive_nan) { 205 TEST_REQUIRES_ARM_NEON_FMA; 206 207 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 208 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 209 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 210 for (uint32_t i = 0; i < kBlockSize; i++) { 211 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 212 } 213 xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 214 for (uint32_t i = 0; i < kBlockSize; i++) { 215 ASSERT_TRUE(std::isnan(outputs[i])) 216 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 217 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 218 } 219 } 220 } 221 TEST(EXP__NEONFMA_RR2_P5,negative_nan)222 TEST(EXP__NEONFMA_RR2_P5, negative_nan) { 223 TEST_REQUIRES_ARM_NEON_FMA; 224 225 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 226 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 227 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 228 for (uint32_t i = 0; i < kBlockSize; i++) { 229 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 230 } 231 xnn_math_f32_exp__neonfma_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 232 for (uint32_t i = 0; i < kBlockSize; i++) { 233 ASSERT_TRUE(std::isnan(outputs[i])) 234 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 235 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 236 } 237 } 238 } 239 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 240 241 242 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM,negative_zero)243 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_zero) { 244 TEST_REQUIRES_X86_AVX512F; 245 246 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 247 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 248 std::fill(inputs.begin(), inputs.end(), -0.0f); 249 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 250 const float reference_output = 1.0f; 251 ASSERT_EQ(reference_output, outputs[0]) 252 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 253 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 254 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 255 } 256 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM,positive_zero)257 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_zero) { 258 TEST_REQUIRES_X86_AVX512F; 259 260 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 261 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 262 std::fill(inputs.begin(), inputs.end(), +0.0f); 263 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 264 const float reference_output = 1.0f; 265 ASSERT_EQ(reference_output, outputs[0]) 266 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 267 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 268 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 269 } 270 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM,negative_saturation)271 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_saturation) { 272 TEST_REQUIRES_X86_AVX512F; 273 274 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 275 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 276 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 277 for (uint32_t i = 0; i < kBlockSize; i++) { 278 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 279 } 280 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 281 for (uint32_t i = 0; i < kBlockSize; i++) { 282 const uint32_t reference_output = UINT32_C(0x00000000); 283 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 284 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 285 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 286 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 287 } 288 } 289 } 290 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM,positive_overflow)291 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_overflow) { 292 TEST_REQUIRES_X86_AVX512F; 293 294 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 295 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 296 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 297 for (uint32_t i = 0; i < kBlockSize; i++) { 298 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 299 } 300 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 301 for (uint32_t i = 0; i < kBlockSize; i++) { 302 const uint32_t reference_output = UINT32_C(0x7F800000); 303 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 304 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 305 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 306 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 307 } 308 } 309 } 310 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM,positive_nan)311 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, positive_nan) { 312 TEST_REQUIRES_X86_AVX512F; 313 314 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 315 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 316 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 317 for (uint32_t i = 0; i < kBlockSize; i++) { 318 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 319 } 320 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 321 for (uint32_t i = 0; i < kBlockSize; i++) { 322 ASSERT_TRUE(std::isnan(outputs[i])) 323 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 324 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 325 } 326 } 327 } 328 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM,negative_nan)329 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM, negative_nan) { 330 TEST_REQUIRES_X86_AVX512F; 331 332 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 333 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 334 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 335 for (uint32_t i = 0; i < kBlockSize; i++) { 336 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 337 } 338 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 339 for (uint32_t i = 0; i < kBlockSize; i++) { 340 ASSERT_TRUE(std::isnan(outputs[i])) 341 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 342 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 343 } 344 } 345 } 346 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 347 348 349 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF,negative_zero)350 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_zero) { 351 TEST_REQUIRES_X86_AVX512F; 352 353 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 354 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 355 std::fill(inputs.begin(), inputs.end(), -0.0f); 356 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 357 const float reference_output = 1.0f; 358 ASSERT_EQ(reference_output, outputs[0]) 359 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 360 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 361 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 362 } 363 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF,positive_zero)364 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_zero) { 365 TEST_REQUIRES_X86_AVX512F; 366 367 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 368 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 369 std::fill(inputs.begin(), inputs.end(), +0.0f); 370 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 371 const float reference_output = 1.0f; 372 ASSERT_EQ(reference_output, outputs[0]) 373 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 374 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 375 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 376 } 377 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF,negative_saturation)378 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_saturation) { 379 TEST_REQUIRES_X86_AVX512F; 380 381 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 382 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 383 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 384 for (uint32_t i = 0; i < kBlockSize; i++) { 385 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 386 } 387 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 388 for (uint32_t i = 0; i < kBlockSize; i++) { 389 const uint32_t reference_output = UINT32_C(0x00000000); 390 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 391 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 392 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 393 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 394 } 395 } 396 } 397 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF,positive_overflow)398 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_overflow) { 399 TEST_REQUIRES_X86_AVX512F; 400 401 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 402 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 403 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 404 for (uint32_t i = 0; i < kBlockSize; i++) { 405 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 406 } 407 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 408 for (uint32_t i = 0; i < kBlockSize; i++) { 409 const uint32_t reference_output = UINT32_C(0x7F800000); 410 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 411 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 412 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 413 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 414 } 415 } 416 } 417 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF,positive_nan)418 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, positive_nan) { 419 TEST_REQUIRES_X86_AVX512F; 420 421 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 422 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 423 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 424 for (uint32_t i = 0; i < kBlockSize; i++) { 425 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 426 } 427 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 428 for (uint32_t i = 0; i < kBlockSize; i++) { 429 ASSERT_TRUE(std::isnan(outputs[i])) 430 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 431 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 432 } 433 } 434 } 435 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF,negative_nan)436 TEST(EXP__AVX512F_RR2_LUT16_P3_PERM_SCALEF, negative_nan) { 437 TEST_REQUIRES_X86_AVX512F; 438 439 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 440 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 441 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 442 for (uint32_t i = 0; i < kBlockSize; i++) { 443 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 444 } 445 xnn_math_f32_exp__avx512f_rr2_lut16_p3_perm_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 446 for (uint32_t i = 0; i < kBlockSize; i++) { 447 ASSERT_TRUE(std::isnan(outputs[i])) 448 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 449 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 450 } 451 } 452 } 453 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 454 455 456 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2,negative_zero)457 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_zero) { 458 TEST_REQUIRES_X86_AVX512F; 459 460 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 461 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 462 std::fill(inputs.begin(), inputs.end(), -0.0f); 463 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 464 const float reference_output = 1.0f; 465 ASSERT_EQ(reference_output, outputs[0]) 466 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 467 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 468 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 469 } 470 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2,positive_zero)471 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_zero) { 472 TEST_REQUIRES_X86_AVX512F; 473 474 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 475 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 476 std::fill(inputs.begin(), inputs.end(), +0.0f); 477 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 478 const float reference_output = 1.0f; 479 ASSERT_EQ(reference_output, outputs[0]) 480 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 481 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 482 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 483 } 484 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2,negative_saturation)485 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_saturation) { 486 TEST_REQUIRES_X86_AVX512F; 487 488 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 489 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 490 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 491 for (uint32_t i = 0; i < kBlockSize; i++) { 492 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 493 } 494 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 495 for (uint32_t i = 0; i < kBlockSize; i++) { 496 const uint32_t reference_output = UINT32_C(0x00000000); 497 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 498 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 499 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 500 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 501 } 502 } 503 } 504 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2,positive_overflow)505 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_overflow) { 506 TEST_REQUIRES_X86_AVX512F; 507 508 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 509 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 510 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 511 for (uint32_t i = 0; i < kBlockSize; i++) { 512 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 513 } 514 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 515 for (uint32_t i = 0; i < kBlockSize; i++) { 516 const uint32_t reference_output = UINT32_C(0x7F800000); 517 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 518 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 519 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 520 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 521 } 522 } 523 } 524 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2,positive_nan)525 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, positive_nan) { 526 TEST_REQUIRES_X86_AVX512F; 527 528 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 529 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 530 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 531 for (uint32_t i = 0; i < kBlockSize; i++) { 532 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 533 } 534 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 535 for (uint32_t i = 0; i < kBlockSize; i++) { 536 ASSERT_TRUE(std::isnan(outputs[i])) 537 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 538 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 539 } 540 } 541 } 542 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2,negative_nan)543 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2, negative_nan) { 544 TEST_REQUIRES_X86_AVX512F; 545 546 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 547 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 548 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 549 for (uint32_t i = 0; i < kBlockSize; i++) { 550 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 551 } 552 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 553 for (uint32_t i = 0; i < kBlockSize; i++) { 554 ASSERT_TRUE(std::isnan(outputs[i])) 555 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 556 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 557 } 558 } 559 } 560 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 561 562 563 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF,negative_zero)564 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_zero) { 565 TEST_REQUIRES_X86_AVX512F; 566 567 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 568 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 569 std::fill(inputs.begin(), inputs.end(), -0.0f); 570 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 571 const float reference_output = 1.0f; 572 ASSERT_EQ(reference_output, outputs[0]) 573 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 574 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 575 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 576 } 577 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF,positive_zero)578 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_zero) { 579 TEST_REQUIRES_X86_AVX512F; 580 581 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 582 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 583 std::fill(inputs.begin(), inputs.end(), +0.0f); 584 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 585 const float reference_output = 1.0f; 586 ASSERT_EQ(reference_output, outputs[0]) 587 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 588 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 589 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 590 } 591 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF,negative_saturation)592 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_saturation) { 593 TEST_REQUIRES_X86_AVX512F; 594 595 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 596 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 597 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 598 for (uint32_t i = 0; i < kBlockSize; i++) { 599 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 600 } 601 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 602 for (uint32_t i = 0; i < kBlockSize; i++) { 603 const uint32_t reference_output = UINT32_C(0x00000000); 604 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 605 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 606 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 607 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 608 } 609 } 610 } 611 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF,positive_overflow)612 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_overflow) { 613 TEST_REQUIRES_X86_AVX512F; 614 615 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 616 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 617 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 618 for (uint32_t i = 0; i < kBlockSize; i++) { 619 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 620 } 621 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 622 for (uint32_t i = 0; i < kBlockSize; i++) { 623 const uint32_t reference_output = UINT32_C(0x7F800000); 624 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 625 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 626 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 627 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 628 } 629 } 630 } 631 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF,positive_nan)632 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, positive_nan) { 633 TEST_REQUIRES_X86_AVX512F; 634 635 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 636 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 637 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 638 for (uint32_t i = 0; i < kBlockSize; i++) { 639 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 640 } 641 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 642 for (uint32_t i = 0; i < kBlockSize; i++) { 643 ASSERT_TRUE(std::isnan(outputs[i])) 644 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 645 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 646 } 647 } 648 } 649 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF,negative_nan)650 TEST(EXP__AVX512F_RR2_LUT32_P2_PERM2_SCALEF, negative_nan) { 651 TEST_REQUIRES_X86_AVX512F; 652 653 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 654 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 655 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 656 for (uint32_t i = 0; i < kBlockSize; i++) { 657 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 658 } 659 xnn_math_f32_exp__avx512f_rr2_lut32_p2_perm2_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 660 for (uint32_t i = 0; i < kBlockSize; i++) { 661 ASSERT_TRUE(std::isnan(outputs[i])) 662 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 663 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 664 } 665 } 666 } 667 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 668 669 670 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX512F_RR2_P5,negative_zero)671 TEST(EXP__AVX512F_RR2_P5, negative_zero) { 672 TEST_REQUIRES_X86_AVX512F; 673 674 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 675 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 676 std::fill(inputs.begin(), inputs.end(), -0.0f); 677 xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 678 const float reference_output = 1.0f; 679 ASSERT_EQ(reference_output, outputs[0]) 680 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 681 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 682 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 683 } 684 TEST(EXP__AVX512F_RR2_P5,positive_zero)685 TEST(EXP__AVX512F_RR2_P5, positive_zero) { 686 TEST_REQUIRES_X86_AVX512F; 687 688 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 689 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 690 std::fill(inputs.begin(), inputs.end(), +0.0f); 691 xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 692 const float reference_output = 1.0f; 693 ASSERT_EQ(reference_output, outputs[0]) 694 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 695 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 696 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 697 } 698 TEST(EXP__AVX512F_RR2_P5,negative_saturation)699 TEST(EXP__AVX512F_RR2_P5, negative_saturation) { 700 TEST_REQUIRES_X86_AVX512F; 701 702 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 703 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 704 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 705 for (uint32_t i = 0; i < kBlockSize; i++) { 706 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 707 } 708 xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 709 for (uint32_t i = 0; i < kBlockSize; i++) { 710 const uint32_t reference_output = UINT32_C(0x00000000); 711 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 712 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 713 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 714 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 715 } 716 } 717 } 718 TEST(EXP__AVX512F_RR2_P5,positive_overflow)719 TEST(EXP__AVX512F_RR2_P5, positive_overflow) { 720 TEST_REQUIRES_X86_AVX512F; 721 722 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 723 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 724 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 725 for (uint32_t i = 0; i < kBlockSize; i++) { 726 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 727 } 728 xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 729 for (uint32_t i = 0; i < kBlockSize; i++) { 730 const uint32_t reference_output = UINT32_C(0x7F800000); 731 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 732 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 733 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 734 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 735 } 736 } 737 } 738 TEST(EXP__AVX512F_RR2_P5,positive_nan)739 TEST(EXP__AVX512F_RR2_P5, positive_nan) { 740 TEST_REQUIRES_X86_AVX512F; 741 742 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 743 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 744 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 745 for (uint32_t i = 0; i < kBlockSize; i++) { 746 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 747 } 748 xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 749 for (uint32_t i = 0; i < kBlockSize; i++) { 750 ASSERT_TRUE(std::isnan(outputs[i])) 751 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 752 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 753 } 754 } 755 } 756 TEST(EXP__AVX512F_RR2_P5,negative_nan)757 TEST(EXP__AVX512F_RR2_P5, negative_nan) { 758 TEST_REQUIRES_X86_AVX512F; 759 760 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 761 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 762 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 763 for (uint32_t i = 0; i < kBlockSize; i++) { 764 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 765 } 766 xnn_math_f32_exp__avx512f_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 767 for (uint32_t i = 0; i < kBlockSize; i++) { 768 ASSERT_TRUE(std::isnan(outputs[i])) 769 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 770 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 771 } 772 } 773 } 774 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 775 776 777 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX512F_RR2_P5_SCALEF,negative_zero)778 TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_zero) { 779 TEST_REQUIRES_X86_AVX512F; 780 781 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 782 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 783 std::fill(inputs.begin(), inputs.end(), -0.0f); 784 xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 785 const float reference_output = 1.0f; 786 ASSERT_EQ(reference_output, outputs[0]) 787 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 788 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 789 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 790 } 791 TEST(EXP__AVX512F_RR2_P5_SCALEF,positive_zero)792 TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_zero) { 793 TEST_REQUIRES_X86_AVX512F; 794 795 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 796 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 797 std::fill(inputs.begin(), inputs.end(), +0.0f); 798 xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 799 const float reference_output = 1.0f; 800 ASSERT_EQ(reference_output, outputs[0]) 801 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 802 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 803 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 804 } 805 TEST(EXP__AVX512F_RR2_P5_SCALEF,negative_saturation)806 TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_saturation) { 807 TEST_REQUIRES_X86_AVX512F; 808 809 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 810 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 811 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 812 for (uint32_t i = 0; i < kBlockSize; i++) { 813 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 814 } 815 xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 816 for (uint32_t i = 0; i < kBlockSize; i++) { 817 const uint32_t reference_output = UINT32_C(0x00000000); 818 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 819 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 820 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 821 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 822 } 823 } 824 } 825 TEST(EXP__AVX512F_RR2_P5_SCALEF,positive_overflow)826 TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_overflow) { 827 TEST_REQUIRES_X86_AVX512F; 828 829 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 830 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 831 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 832 for (uint32_t i = 0; i < kBlockSize; i++) { 833 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 834 } 835 xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 836 for (uint32_t i = 0; i < kBlockSize; i++) { 837 const uint32_t reference_output = UINT32_C(0x7F800000); 838 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 839 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 840 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 841 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 842 } 843 } 844 } 845 TEST(EXP__AVX512F_RR2_P5_SCALEF,positive_nan)846 TEST(EXP__AVX512F_RR2_P5_SCALEF, positive_nan) { 847 TEST_REQUIRES_X86_AVX512F; 848 849 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 850 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 851 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 852 for (uint32_t i = 0; i < kBlockSize; i++) { 853 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 854 } 855 xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 856 for (uint32_t i = 0; i < kBlockSize; i++) { 857 ASSERT_TRUE(std::isnan(outputs[i])) 858 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 859 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 860 } 861 } 862 } 863 TEST(EXP__AVX512F_RR2_P5_SCALEF,negative_nan)864 TEST(EXP__AVX512F_RR2_P5_SCALEF, negative_nan) { 865 TEST_REQUIRES_X86_AVX512F; 866 867 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 868 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 869 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 870 for (uint32_t i = 0; i < kBlockSize; i++) { 871 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 872 } 873 xnn_math_f32_exp__avx512f_rr2_p5_scalef(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 874 for (uint32_t i = 0; i < kBlockSize; i++) { 875 ASSERT_TRUE(std::isnan(outputs[i])) 876 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 877 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 878 } 879 } 880 } 881 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 882 883 884 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX2_RR2_LUT8_P3_PERM,negative_zero)885 TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_zero) { 886 TEST_REQUIRES_X86_AVX2; 887 888 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 889 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 890 std::fill(inputs.begin(), inputs.end(), -0.0f); 891 xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 892 const float reference_output = 1.0f; 893 ASSERT_EQ(reference_output, outputs[0]) 894 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 895 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 896 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 897 } 898 TEST(EXP__AVX2_RR2_LUT8_P3_PERM,positive_zero)899 TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_zero) { 900 TEST_REQUIRES_X86_AVX2; 901 902 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 903 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 904 std::fill(inputs.begin(), inputs.end(), +0.0f); 905 xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 906 const float reference_output = 1.0f; 907 ASSERT_EQ(reference_output, outputs[0]) 908 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 909 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 910 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 911 } 912 TEST(EXP__AVX2_RR2_LUT8_P3_PERM,negative_saturation)913 TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_saturation) { 914 TEST_REQUIRES_X86_AVX2; 915 916 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 917 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 918 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 919 for (uint32_t i = 0; i < kBlockSize; i++) { 920 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 921 } 922 xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 923 for (uint32_t i = 0; i < kBlockSize; i++) { 924 const uint32_t reference_output = UINT32_C(0x00000000); 925 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 926 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 927 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 928 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 929 } 930 } 931 } 932 TEST(EXP__AVX2_RR2_LUT8_P3_PERM,positive_overflow)933 TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_overflow) { 934 TEST_REQUIRES_X86_AVX2; 935 936 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 937 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 938 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 939 for (uint32_t i = 0; i < kBlockSize; i++) { 940 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 941 } 942 xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 943 for (uint32_t i = 0; i < kBlockSize; i++) { 944 const uint32_t reference_output = UINT32_C(0x7F800000); 945 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 946 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 947 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 948 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 949 } 950 } 951 } 952 TEST(EXP__AVX2_RR2_LUT8_P3_PERM,positive_nan)953 TEST(EXP__AVX2_RR2_LUT8_P3_PERM, positive_nan) { 954 TEST_REQUIRES_X86_AVX2; 955 956 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 957 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 958 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 959 for (uint32_t i = 0; i < kBlockSize; i++) { 960 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 961 } 962 xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 963 for (uint32_t i = 0; i < kBlockSize; i++) { 964 ASSERT_TRUE(std::isnan(outputs[i])) 965 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 966 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 967 } 968 } 969 } 970 TEST(EXP__AVX2_RR2_LUT8_P3_PERM,negative_nan)971 TEST(EXP__AVX2_RR2_LUT8_P3_PERM, negative_nan) { 972 TEST_REQUIRES_X86_AVX2; 973 974 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 975 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 976 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 977 for (uint32_t i = 0; i < kBlockSize; i++) { 978 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 979 } 980 xnn_math_f32_exp__avx2_rr2_lut8_p3_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 981 for (uint32_t i = 0; i < kBlockSize; i++) { 982 ASSERT_TRUE(std::isnan(outputs[i])) 983 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 984 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 985 } 986 } 987 } 988 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 989 990 991 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX2_RR2_LUT8_P4_PERM,negative_zero)992 TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_zero) { 993 TEST_REQUIRES_X86_AVX2; 994 995 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 996 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 997 std::fill(inputs.begin(), inputs.end(), -0.0f); 998 xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 999 const float reference_output = 1.0f; 1000 ASSERT_EQ(reference_output, outputs[0]) 1001 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1002 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1003 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1004 } 1005 TEST(EXP__AVX2_RR2_LUT8_P4_PERM,positive_zero)1006 TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_zero) { 1007 TEST_REQUIRES_X86_AVX2; 1008 1009 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1010 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1011 std::fill(inputs.begin(), inputs.end(), +0.0f); 1012 xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1013 const float reference_output = 1.0f; 1014 ASSERT_EQ(reference_output, outputs[0]) 1015 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1016 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1017 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1018 } 1019 TEST(EXP__AVX2_RR2_LUT8_P4_PERM,negative_saturation)1020 TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_saturation) { 1021 TEST_REQUIRES_X86_AVX2; 1022 1023 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1024 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1025 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 1026 for (uint32_t i = 0; i < kBlockSize; i++) { 1027 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 1028 } 1029 xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1030 for (uint32_t i = 0; i < kBlockSize; i++) { 1031 const uint32_t reference_output = UINT32_C(0x00000000); 1032 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1033 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1034 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1035 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1036 } 1037 } 1038 } 1039 TEST(EXP__AVX2_RR2_LUT8_P4_PERM,positive_overflow)1040 TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_overflow) { 1041 TEST_REQUIRES_X86_AVX2; 1042 1043 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1044 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1045 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 1046 for (uint32_t i = 0; i < kBlockSize; i++) { 1047 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 1048 } 1049 xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1050 for (uint32_t i = 0; i < kBlockSize; i++) { 1051 const uint32_t reference_output = UINT32_C(0x7F800000); 1052 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1053 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1054 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1055 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1056 } 1057 } 1058 } 1059 TEST(EXP__AVX2_RR2_LUT8_P4_PERM,positive_nan)1060 TEST(EXP__AVX2_RR2_LUT8_P4_PERM, positive_nan) { 1061 TEST_REQUIRES_X86_AVX2; 1062 1063 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1064 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1065 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1066 for (uint32_t i = 0; i < kBlockSize; i++) { 1067 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 1068 } 1069 xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1070 for (uint32_t i = 0; i < kBlockSize; i++) { 1071 ASSERT_TRUE(std::isnan(outputs[i])) 1072 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1073 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1074 } 1075 } 1076 } 1077 TEST(EXP__AVX2_RR2_LUT8_P4_PERM,negative_nan)1078 TEST(EXP__AVX2_RR2_LUT8_P4_PERM, negative_nan) { 1079 TEST_REQUIRES_X86_AVX2; 1080 1081 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1082 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1083 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1084 for (uint32_t i = 0; i < kBlockSize; i++) { 1085 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 1086 } 1087 xnn_math_f32_exp__avx2_rr2_lut8_p4_perm(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1088 for (uint32_t i = 0; i < kBlockSize; i++) { 1089 ASSERT_TRUE(std::isnan(outputs[i])) 1090 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1091 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1092 } 1093 } 1094 } 1095 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1096 1097 1098 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX2_RR2_P5,negative_zero)1099 TEST(EXP__AVX2_RR2_P5, negative_zero) { 1100 TEST_REQUIRES_X86_AVX2; 1101 1102 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1103 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1104 std::fill(inputs.begin(), inputs.end(), -0.0f); 1105 xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1106 const float reference_output = 1.0f; 1107 ASSERT_EQ(reference_output, outputs[0]) 1108 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1109 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1110 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1111 } 1112 TEST(EXP__AVX2_RR2_P5,positive_zero)1113 TEST(EXP__AVX2_RR2_P5, positive_zero) { 1114 TEST_REQUIRES_X86_AVX2; 1115 1116 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1117 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1118 std::fill(inputs.begin(), inputs.end(), +0.0f); 1119 xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1120 const float reference_output = 1.0f; 1121 ASSERT_EQ(reference_output, outputs[0]) 1122 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1123 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1124 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1125 } 1126 TEST(EXP__AVX2_RR2_P5,negative_saturation)1127 TEST(EXP__AVX2_RR2_P5, negative_saturation) { 1128 TEST_REQUIRES_X86_AVX2; 1129 1130 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1131 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1132 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 1133 for (uint32_t i = 0; i < kBlockSize; i++) { 1134 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 1135 } 1136 xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1137 for (uint32_t i = 0; i < kBlockSize; i++) { 1138 const uint32_t reference_output = UINT32_C(0x00000000); 1139 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1140 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1141 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1142 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1143 } 1144 } 1145 } 1146 TEST(EXP__AVX2_RR2_P5,positive_overflow)1147 TEST(EXP__AVX2_RR2_P5, positive_overflow) { 1148 TEST_REQUIRES_X86_AVX2; 1149 1150 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1151 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1152 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 1153 for (uint32_t i = 0; i < kBlockSize; i++) { 1154 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 1155 } 1156 xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1157 for (uint32_t i = 0; i < kBlockSize; i++) { 1158 const uint32_t reference_output = UINT32_C(0x7F800000); 1159 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1160 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1161 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1162 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1163 } 1164 } 1165 } 1166 TEST(EXP__AVX2_RR2_P5,positive_nan)1167 TEST(EXP__AVX2_RR2_P5, positive_nan) { 1168 TEST_REQUIRES_X86_AVX2; 1169 1170 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1171 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1172 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1173 for (uint32_t i = 0; i < kBlockSize; i++) { 1174 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 1175 } 1176 xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1177 for (uint32_t i = 0; i < kBlockSize; i++) { 1178 ASSERT_TRUE(std::isnan(outputs[i])) 1179 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1180 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1181 } 1182 } 1183 } 1184 TEST(EXP__AVX2_RR2_P5,negative_nan)1185 TEST(EXP__AVX2_RR2_P5, negative_nan) { 1186 TEST_REQUIRES_X86_AVX2; 1187 1188 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1189 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1190 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1191 for (uint32_t i = 0; i < kBlockSize; i++) { 1192 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 1193 } 1194 xnn_math_f32_exp__avx2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1195 for (uint32_t i = 0; i < kBlockSize; i++) { 1196 ASSERT_TRUE(std::isnan(outputs[i])) 1197 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1198 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1199 } 1200 } 1201 } 1202 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1203 1204 1205 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__AVX_RR2_P5,negative_zero)1206 TEST(EXP__AVX_RR2_P5, negative_zero) { 1207 TEST_REQUIRES_X86_AVX; 1208 1209 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1210 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1211 std::fill(inputs.begin(), inputs.end(), -0.0f); 1212 xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1213 const float reference_output = 1.0f; 1214 ASSERT_EQ(reference_output, outputs[0]) 1215 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1216 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1217 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1218 } 1219 TEST(EXP__AVX_RR2_P5,positive_zero)1220 TEST(EXP__AVX_RR2_P5, positive_zero) { 1221 TEST_REQUIRES_X86_AVX; 1222 1223 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1224 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1225 std::fill(inputs.begin(), inputs.end(), +0.0f); 1226 xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1227 const float reference_output = 1.0f; 1228 ASSERT_EQ(reference_output, outputs[0]) 1229 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1230 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1231 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1232 } 1233 TEST(EXP__AVX_RR2_P5,negative_saturation)1234 TEST(EXP__AVX_RR2_P5, negative_saturation) { 1235 TEST_REQUIRES_X86_AVX; 1236 1237 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1238 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1239 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 1240 for (uint32_t i = 0; i < kBlockSize; i++) { 1241 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 1242 } 1243 xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1244 for (uint32_t i = 0; i < kBlockSize; i++) { 1245 const uint32_t reference_output = UINT32_C(0x00000000); 1246 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1247 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1248 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1249 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1250 } 1251 } 1252 } 1253 TEST(EXP__AVX_RR2_P5,positive_overflow)1254 TEST(EXP__AVX_RR2_P5, positive_overflow) { 1255 TEST_REQUIRES_X86_AVX; 1256 1257 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1258 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1259 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 1260 for (uint32_t i = 0; i < kBlockSize; i++) { 1261 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 1262 } 1263 xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1264 for (uint32_t i = 0; i < kBlockSize; i++) { 1265 const uint32_t reference_output = UINT32_C(0x7F800000); 1266 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1267 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1268 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1269 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1270 } 1271 } 1272 } 1273 TEST(EXP__AVX_RR2_P5,positive_nan)1274 TEST(EXP__AVX_RR2_P5, positive_nan) { 1275 TEST_REQUIRES_X86_AVX; 1276 1277 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1278 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1279 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1280 for (uint32_t i = 0; i < kBlockSize; i++) { 1281 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 1282 } 1283 xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1284 for (uint32_t i = 0; i < kBlockSize; i++) { 1285 ASSERT_TRUE(std::isnan(outputs[i])) 1286 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1287 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1288 } 1289 } 1290 } 1291 TEST(EXP__AVX_RR2_P5,negative_nan)1292 TEST(EXP__AVX_RR2_P5, negative_nan) { 1293 TEST_REQUIRES_X86_AVX; 1294 1295 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1296 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1297 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1298 for (uint32_t i = 0; i < kBlockSize; i++) { 1299 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 1300 } 1301 xnn_math_f32_exp__avx_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1302 for (uint32_t i = 0; i < kBlockSize; i++) { 1303 ASSERT_TRUE(std::isnan(outputs[i])) 1304 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1305 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1306 } 1307 } 1308 } 1309 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1310 1311 1312 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__SSE2_RR2_LUT64_P2,negative_zero)1313 TEST(EXP__SSE2_RR2_LUT64_P2, negative_zero) { 1314 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1315 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1316 std::fill(inputs.begin(), inputs.end(), -0.0f); 1317 xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1318 const float reference_output = 1.0f; 1319 ASSERT_EQ(reference_output, outputs[0]) 1320 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1321 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1322 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1323 } 1324 TEST(EXP__SSE2_RR2_LUT64_P2,positive_zero)1325 TEST(EXP__SSE2_RR2_LUT64_P2, positive_zero) { 1326 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1327 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1328 std::fill(inputs.begin(), inputs.end(), +0.0f); 1329 xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1330 const float reference_output = 1.0f; 1331 ASSERT_EQ(reference_output, outputs[0]) 1332 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1333 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1334 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1335 } 1336 TEST(EXP__SSE2_RR2_LUT64_P2,negative_saturation)1337 TEST(EXP__SSE2_RR2_LUT64_P2, negative_saturation) { 1338 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1339 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1340 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 1341 for (uint32_t i = 0; i < kBlockSize; i++) { 1342 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 1343 } 1344 xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1345 for (uint32_t i = 0; i < kBlockSize; i++) { 1346 const uint32_t reference_output = UINT32_C(0x00000000); 1347 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1348 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1349 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1350 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1351 } 1352 } 1353 } 1354 TEST(EXP__SSE2_RR2_LUT64_P2,positive_overflow)1355 TEST(EXP__SSE2_RR2_LUT64_P2, positive_overflow) { 1356 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1357 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1358 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 1359 for (uint32_t i = 0; i < kBlockSize; i++) { 1360 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 1361 } 1362 xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1363 for (uint32_t i = 0; i < kBlockSize; i++) { 1364 const uint32_t reference_output = UINT32_C(0x7F800000); 1365 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1366 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1367 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1368 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1369 } 1370 } 1371 } 1372 TEST(EXP__SSE2_RR2_LUT64_P2,positive_nan)1373 TEST(EXP__SSE2_RR2_LUT64_P2, positive_nan) { 1374 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1375 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1376 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1377 for (uint32_t i = 0; i < kBlockSize; i++) { 1378 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 1379 } 1380 xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1381 for (uint32_t i = 0; i < kBlockSize; i++) { 1382 ASSERT_TRUE(std::isnan(outputs[i])) 1383 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1384 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1385 } 1386 } 1387 } 1388 TEST(EXP__SSE2_RR2_LUT64_P2,negative_nan)1389 TEST(EXP__SSE2_RR2_LUT64_P2, negative_nan) { 1390 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1391 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1392 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1393 for (uint32_t i = 0; i < kBlockSize; i++) { 1394 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 1395 } 1396 xnn_math_f32_exp__sse2_rr2_lut64_p2(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1397 for (uint32_t i = 0; i < kBlockSize; i++) { 1398 ASSERT_TRUE(std::isnan(outputs[i])) 1399 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1400 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1401 } 1402 } 1403 } 1404 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1405 1406 1407 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(EXP__SSE2_RR2_P5,negative_zero)1408 TEST(EXP__SSE2_RR2_P5, negative_zero) { 1409 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1410 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1411 std::fill(inputs.begin(), inputs.end(), -0.0f); 1412 xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1413 const float reference_output = 1.0f; 1414 ASSERT_EQ(reference_output, outputs[0]) 1415 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1416 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1417 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1418 } 1419 TEST(EXP__SSE2_RR2_P5,positive_zero)1420 TEST(EXP__SSE2_RR2_P5, positive_zero) { 1421 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1422 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1423 std::fill(inputs.begin(), inputs.end(), +0.0f); 1424 xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1425 const float reference_output = 1.0f; 1426 ASSERT_EQ(reference_output, outputs[0]) 1427 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0]) 1428 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(reference_output) 1429 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]); 1430 } 1431 TEST(EXP__SSE2_RR2_P5,negative_saturation)1432 TEST(EXP__SSE2_RR2_P5, negative_saturation) { 1433 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1434 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1435 for (uint32_t n = UINT32_C(0xC2CFF1B5); n <= UINT32_C(0xFF800000); n += kBlockSize) { 1436 for (uint32_t i = 0; i < kBlockSize; i++) { 1437 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0xFF800000))); 1438 } 1439 xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1440 for (uint32_t i = 0; i < kBlockSize; i++) { 1441 const uint32_t reference_output = UINT32_C(0x00000000); 1442 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1443 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1444 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1445 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1446 } 1447 } 1448 } 1449 TEST(EXP__SSE2_RR2_P5,positive_overflow)1450 TEST(EXP__SSE2_RR2_P5, positive_overflow) { 1451 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1452 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1453 for (uint32_t n = UINT32_C(0x42B17218); n <= UINT32_C(0x7F800000); n += kBlockSize) { 1454 for (uint32_t i = 0; i < kBlockSize; i++) { 1455 inputs[i] = fp32_from_bits(std::min(n + i, UINT32_C(0x7F800000))); 1456 } 1457 xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1458 for (uint32_t i = 0; i < kBlockSize; i++) { 1459 const uint32_t reference_output = UINT32_C(0x7F800000); 1460 ASSERT_EQ(reference_output, fp32_to_bits(outputs[i])) 1461 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1462 << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output 1463 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1464 } 1465 } 1466 } 1467 TEST(EXP__SSE2_RR2_P5,positive_nan)1468 TEST(EXP__SSE2_RR2_P5, positive_nan) { 1469 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1470 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1471 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1472 for (uint32_t i = 0; i < kBlockSize; i++) { 1473 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), n + i)); 1474 } 1475 xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1476 for (uint32_t i = 0; i < kBlockSize; i++) { 1477 ASSERT_TRUE(std::isnan(outputs[i])) 1478 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1479 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1480 } 1481 } 1482 } 1483 TEST(EXP__SSE2_RR2_P5,negative_nan)1484 TEST(EXP__SSE2_RR2_P5, negative_nan) { 1485 std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize); 1486 std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize); 1487 for (uint32_t n = UINT32_C(0x7F800001); n < UINT32_C(0x80000000); n += kBlockSize) { 1488 for (uint32_t i = 0; i < kBlockSize; i++) { 1489 inputs[i] = fp32_from_bits(std::min(UINT32_C(0x7FFFFFFF), UINT32_C(0x80000000) | (n + i))); 1490 } 1491 xnn_math_f32_exp__sse2_rr2_p5(kBlockSize * sizeof(float), inputs.data(), outputs.data()); 1492 for (uint32_t i = 0; i < kBlockSize; i++) { 1493 ASSERT_TRUE(std::isnan(outputs[i])) 1494 << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i]) 1495 << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]); 1496 } 1497 } 1498 } 1499 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1500