1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file formats.h 24 * 25 * @brief Definitions for SWR_FORMAT functions. 
26 * 27 ******************************************************************************/ 28 #pragma once 29 30 #include "utils.h" 31 #include "common/simdintrin.h" 32 33 ////////////////////////////////////////////////////////////////////////// 34 /// PackTraits - Helpers for packing / unpacking same pixel sizes 35 ////////////////////////////////////////////////////////////////////////// 36 template <uint32_t NumBits, bool Signed = false> 37 struct PackTraits 38 { 39 static const uint32_t MyNumBits = NumBits; 40 41 static simdscalar loadSOA(const uint8_t* pSrc) = delete; 42 static void storeSOA(uint8_t* pDst, simdscalar const& src) = delete; 43 static simdscalar unpack(simdscalar& in) = delete; 44 static simdscalar pack(simdscalar& in) = delete; 45 46 static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete; 47 static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete; 48 static simd16scalar unpack(simd16scalar& in) = delete; 49 static simd16scalar pack(simd16scalar& in) = delete; 50 }; 51 52 ////////////////////////////////////////////////////////////////////////// 53 /// PackTraits - Helpers for packing / unpacking unused channels 54 ////////////////////////////////////////////////////////////////////////// 55 template <> 56 struct PackTraits<0, false> 57 { 58 static const uint32_t MyNumBits = 0; 59 60 static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_setzero_ps(); } 61 static void storeSOA(uint8_t* pDst, simdscalar const& src) { return; } 62 static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); } 63 static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); } 64 65 static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); } 66 static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; } 67 static simd16scalar unpack(simd16scalar& in) { return _simd16_setzero_ps(); } 68 static simd16scalar pack(simd16scalar& in) { return _simd16_setzero_ps(); } 69 }; 70 71 
////////////////////////////////////////////////////////////////////////// 72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels 73 ////////////////////////////////////////////////////////////////////////// 74 template <> 75 struct PackTraits<8, false> 76 { 77 static const uint32_t MyNumBits = 8; 78 79 static simdscalar loadSOA(const uint8_t* pSrc) 80 { 81 #if KNOB_SIMD_WIDTH == 8 82 __m256 result = _mm256_setzero_ps(); 83 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); 84 return _mm256_insertf128_ps(result, vLo, 0); 85 #else 86 #error Unsupported vector width 87 #endif 88 } 89 90 static void storeSOA(uint8_t* pDst, simdscalar const& src) 91 { 92 // store simd bytes 93 #if KNOB_SIMD_WIDTH == 8 94 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); 95 #else 96 #error Unsupported vector width 97 #endif 98 } 99 100 static simdscalar unpack(simdscalar& in) 101 { 102 #if KNOB_SIMD_WIDTH == 8 103 #if KNOB_ARCH <= KNOB_ARCH_AVX 104 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); 105 __m128i resLo = _mm_cvtepu8_epi32(src); 106 __m128i resHi = 107 _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); 108 109 __m256i result = _mm256_castsi128_si256(resLo); 110 result = _mm256_insertf128_si256(result, resHi, 1); 111 return simdscalar{_mm256_castsi256_ps(result)}; 112 #else 113 return _mm256_castsi256_ps( 114 _mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); 115 #endif 116 #else 117 #error Unsupported vector width 118 #endif 119 } 120 121 static simdscalar pack(simdscalar& in) 122 { 123 #if KNOB_SIMD_WIDTH == 8 124 simdscalari src = _simd_castps_si(in); 125 __m128i res16 = 126 _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); 127 __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128()); 128 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); 129 #else 130 #error Unsupported vector width 131 #endif 132 } 133 134 static 
simd16scalar loadSOA_16(const uint8_t* pSrc) 135 { 136 simd16scalar result = _simd16_setzero_ps(); 137 simdscalar resultlo = _simd_setzero_ps(); 138 139 const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc)); 140 141 resultlo = _mm256_insertf128_ps(resultlo, src, 0); 142 result = _simd16_insert_ps(result, resultlo, 0); 143 144 return result; 145 } 146 147 static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) 148 { 149 // store simd16 bytes 150 _mm_store_ps(reinterpret_cast<float*>(pDst), 151 _mm256_castps256_ps128(_simd16_extract_ps(src, 0))); 152 } 153 154 static simd16scalar unpack(simd16scalar& in) 155 { 156 simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))); 157 simd16scalari result = _simd16_cvtepu8_epi32(tmp); 158 159 return _simd16_castsi_ps(result); 160 } 161 162 static simd16scalar pack(simd16scalar& in) 163 { 164 // clang-format off 165 166 simd16scalari result = _simd16_setzero_si(); 167 168 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) 169 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF 170 171 simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) 172 simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) 173 174 simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) 175 176 const simdscalari zero = _simd_setzero_si(); 177 178 permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) 179 permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) 180 181 pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) 182 183 result = _simd16_insert_si(result, pack, 
0); 184 185 return _simd16_castsi_ps(result); 186 187 // clang-format on 188 } 189 }; 190 191 ////////////////////////////////////////////////////////////////////////// 192 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels 193 ////////////////////////////////////////////////////////////////////////// 194 template <> 195 struct PackTraits<8, true> 196 { 197 static const uint32_t MyNumBits = 8; 198 199 static simdscalar loadSOA(const uint8_t* pSrc) 200 { 201 #if KNOB_SIMD_WIDTH == 8 202 __m256 result = _mm256_setzero_ps(); 203 __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc)); 204 return _mm256_insertf128_ps(result, vLo, 0); 205 #else 206 #error Unsupported vector width 207 #endif 208 } 209 210 static void storeSOA(uint8_t* pDst, simdscalar const& src) 211 { 212 // store simd bytes 213 #if KNOB_SIMD_WIDTH == 8 214 _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src))); 215 #else 216 #error Unsupported vector width 217 #endif 218 } 219 220 static simdscalar unpack(simdscalar& in) 221 { 222 #if KNOB_SIMD_WIDTH == 8 223 #if KNOB_ARCH <= KNOB_ARCH_AVX 224 SWR_INVALID("I think this may be incorrect."); 225 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); 226 __m128i resLo = _mm_cvtepi8_epi32(src); 227 __m128i resHi = 228 _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004)); 229 230 __m256i result = _mm256_castsi128_si256(resLo); 231 result = _mm256_insertf128_si256(result, resHi, 1); 232 return _mm256_castsi256_ps(result); 233 #else 234 return _mm256_castsi256_ps( 235 _mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); 236 #endif 237 #else 238 #error Unsupported vector width 239 #endif 240 } 241 242 static simdscalar pack(simdscalar& in) 243 { 244 #if KNOB_SIMD_WIDTH == 8 245 simdscalari src = _simd_castps_si(in); 246 __m128i res16 = 247 _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)); 248 __m128i res8 = _mm_packs_epi16(res16, 
_mm_undefined_si128()); 249 return _mm256_castsi256_ps(_mm256_castsi128_si256(res8)); 250 #else 251 #error Unsupported vector width 252 #endif 253 } 254 255 static simd16scalar loadSOA_16(const uint8_t* pSrc) 256 { 257 simd16scalar result = _simd16_setzero_ps(); 258 simdscalar resultlo = _simd_setzero_ps(); 259 260 const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc)); 261 262 resultlo = _mm256_insertf128_ps(resultlo, src, 0); 263 result = _simd16_insert_ps(result, resultlo, 0); 264 265 return result; 266 } 267 268 static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) 269 { 270 // store simd16 bytes 271 _mm_store_ps(reinterpret_cast<float*>(pDst), 272 _mm256_castps256_ps128(_simd16_extract_ps(src, 0))); 273 } 274 275 static simd16scalar unpack(simd16scalar& in) 276 { 277 simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))); 278 simd16scalari result = _simd16_cvtepu8_epi32(tmp); 279 280 return _simd16_castsi_ps(result); 281 } 282 283 static simd16scalar pack(simd16scalar& in) 284 { 285 // clang-format off 286 287 simd16scalari result = _simd16_setzero_si(); 288 289 simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) 290 simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF 291 292 simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) 293 simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) 294 295 simdscalari pack = _simd_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) 296 297 const simdscalari zero = _simd_setzero_si(); 298 299 permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) 300 permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) 301 302 pack = _simd_packs_epi16(permlo, 
permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) 303 304 result = _simd16_insert_si(result, pack, 0); 305 306 return _simd16_castsi_ps(result); 307 308 // clang-format on 309 } 310 }; 311 312 ////////////////////////////////////////////////////////////////////////// 313 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels 314 ////////////////////////////////////////////////////////////////////////// 315 template <> 316 struct PackTraits<16, false> 317 { 318 static const uint32_t MyNumBits = 16; 319 320 static simdscalar loadSOA(const uint8_t* pSrc) 321 { 322 #if KNOB_SIMD_WIDTH == 8 323 __m256 result = _mm256_setzero_ps(); 324 __m128 vLo = _mm_load_ps((const float*)pSrc); 325 return _mm256_insertf128_ps(result, vLo, 0); 326 #else 327 #error Unsupported vector width 328 #endif 329 } 330 331 static void storeSOA(uint8_t* pDst, simdscalar const& src) 332 { 333 #if KNOB_SIMD_WIDTH == 8 334 // store 16B (2B * 8) 335 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); 336 #else 337 #error Unsupported vector width 338 #endif 339 } 340 341 static simdscalar unpack(simdscalar& in) 342 { 343 #if KNOB_SIMD_WIDTH == 8 344 #if KNOB_ARCH <= KNOB_ARCH_AVX 345 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); 346 __m128i resLo = _mm_cvtepu16_epi32(src); 347 __m128i resHi = 348 _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); 349 350 __m256i result = _mm256_castsi128_si256(resLo); 351 result = _mm256_insertf128_si256(result, resHi, 1); 352 return _mm256_castsi256_ps(result); 353 #else 354 return _mm256_castsi256_ps( 355 _mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); 356 #endif 357 #else 358 #error Unsupported vector width 359 #endif 360 } 361 362 static simdscalar pack(simdscalar& in) 363 { 364 #if KNOB_SIMD_WIDTH == 8 365 simdscalari src = _simd_castps_si(in); 366 __m256i res = _mm256_castsi128_si256( 367 
_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); 368 return _mm256_castsi256_ps(res); 369 #else 370 #error Unsupported vector width 371 #endif 372 } 373 374 static simd16scalar loadSOA_16(const uint8_t* pSrc) 375 { 376 simd16scalar result = _simd16_setzero_ps(); 377 378 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); 379 380 result = _simd16_insert_ps(result, resultlo, 0); 381 382 return result; 383 } 384 385 static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) 386 { 387 _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0)); 388 } 389 390 static simd16scalar unpack(simd16scalar& in) 391 { 392 simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0))); 393 394 return _simd16_castsi_ps(result); 395 } 396 397 static simd16scalar pack(simd16scalar& in) 398 { 399 // clang-format off 400 401 const simd16scalari zero = _simd16_setzero_si(); 402 403 simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) 404 simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 405 406 simd16scalari result = _simd16_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) 407 408 return _simd16_castsi_ps(result); 409 410 // clang-format on 411 } 412 }; 413 414 ////////////////////////////////////////////////////////////////////////// 415 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels 416 ////////////////////////////////////////////////////////////////////////// 417 template <> 418 struct PackTraits<16, true> 419 { 420 static const uint32_t MyNumBits = 16; 421 422 static simdscalar loadSOA(const uint8_t* pSrc) 423 { 424 #if KNOB_SIMD_WIDTH == 8 425 __m256 result = 
_mm256_setzero_ps(); 426 __m128 vLo = _mm_load_ps((const float*)pSrc); 427 return _mm256_insertf128_ps(result, vLo, 0); 428 #else 429 #error Unsupported vector width 430 #endif 431 } 432 433 static void storeSOA(uint8_t* pDst, simdscalar const& src) 434 { 435 #if KNOB_SIMD_WIDTH == 8 436 // store 16B (2B * 8) 437 _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src)); 438 #else 439 #error Unsupported vector width 440 #endif 441 } 442 443 static simdscalar unpack(simdscalar& in) 444 { 445 #if KNOB_SIMD_WIDTH == 8 446 #if KNOB_ARCH <= KNOB_ARCH_AVX 447 SWR_INVALID("I think this may be incorrect."); 448 __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in)); 449 __m128i resLo = _mm_cvtepi16_epi32(src); 450 __m128i resHi = 451 _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908)); 452 453 __m256i result = _mm256_castsi128_si256(resLo); 454 result = _mm256_insertf128_si256(result, resHi, 1); 455 return _mm256_castsi256_ps(result); 456 #else 457 return _mm256_castsi256_ps( 458 _mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in)))); 459 #endif 460 #else 461 #error Unsupported vector width 462 #endif 463 } 464 465 static simdscalar pack(simdscalar& in) 466 { 467 #if KNOB_SIMD_WIDTH == 8 468 simdscalari src = _simd_castps_si(in); 469 __m256i res = _mm256_castsi128_si256( 470 _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1))); 471 return _mm256_castsi256_ps(res); 472 #else 473 #error Unsupported vector width 474 #endif 475 } 476 477 static simd16scalar loadSOA_16(const uint8_t* pSrc) 478 { 479 simd16scalar result = _simd16_setzero_ps(); 480 481 simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc)); 482 483 result = _simd16_insert_ps(result, resultlo, 0); 484 485 return result; 486 } 487 488 static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) 489 { 490 _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0)); 491 } 492 493 static simd16scalar 
unpack(simd16scalar& in) 494 { 495 simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0))); 496 497 return _simd16_castsi_ps(result); 498 } 499 500 static simd16scalar pack(simd16scalar& in) 501 { 502 // clang-format off 503 504 const simd16scalari zero = _simd16_setzero_si(); 505 506 simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) 507 simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 508 509 simd16scalari result = _simd16_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) 510 511 return _simd16_castsi_ps(result); 512 513 // clang-format on 514 } 515 }; 516 517 ////////////////////////////////////////////////////////////////////////// 518 /// PackTraits - Helpers for packing / unpacking 32 bit channels 519 ////////////////////////////////////////////////////////////////////////// 520 template <> 521 struct PackTraits<32, false> 522 { 523 static const uint32_t MyNumBits = 32; 524 525 static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_load_ps((const float*)pSrc); } 526 static void storeSOA(uint8_t* pDst, simdscalar const& src) 527 { 528 _simd_store_ps((float*)pDst, src); 529 } 530 static simdscalar unpack(simdscalar& in) { return in; } 531 static simdscalar pack(simdscalar& in) { return in; } 532 533 static simd16scalar loadSOA_16(const uint8_t* pSrc) 534 { 535 return _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); 536 } 537 538 static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) 539 { 540 _simd16_store_ps(reinterpret_cast<float*>(pDst), src); 541 } 542 543 static simd16scalar unpack(simd16scalar& in) { return in; } 544 545 static simd16scalar pack(simd16scalar& in) { return in; } 546 }; 547 548 
////////////////////////////////////////////////////////////////////////// 549 /// TypeTraits - Format type traits. 550 ////////////////////////////////////////////////////////////////////////// 551 template <SWR_TYPE type, uint32_t NumBits> 552 struct TypeTraits : PackTraits<NumBits> 553 { 554 static const SWR_TYPE MyType = type; 555 static float toFloat() { return 0.0; } 556 static float fromFloat() 557 { 558 SWR_NOT_IMPL; 559 return 0.0; 560 } 561 static simdscalar convertSrgb(simdscalar& in) 562 { 563 SWR_NOT_IMPL; 564 return _simd_setzero_ps(); 565 } 566 }; 567 568 ////////////////////////////////////////////////////////////////////////// 569 /// TypeTraits - Format type traits specialization for UINT8 570 ////////////////////////////////////////////////////////////////////////// 571 template <> 572 struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8> 573 { 574 static const SWR_TYPE MyType = SWR_TYPE_UINT; 575 static float toFloat() { return 0.0; } 576 static float fromFloat() 577 { 578 SWR_NOT_IMPL; 579 return 0.0; 580 } 581 static simdscalar convertSrgb(simdscalar& in) 582 { 583 SWR_NOT_IMPL; 584 return _simd_setzero_ps(); 585 } 586 }; 587 588 ////////////////////////////////////////////////////////////////////////// 589 /// TypeTraits - Format type traits specialization for UINT8 590 ////////////////////////////////////////////////////////////////////////// 591 template <> 592 struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true> 593 { 594 static const SWR_TYPE MyType = SWR_TYPE_SINT; 595 static float toFloat() { return 0.0; } 596 static float fromFloat() 597 { 598 SWR_NOT_IMPL; 599 return 0.0; 600 } 601 static simdscalar convertSrgb(simdscalar& in) 602 { 603 SWR_NOT_IMPL; 604 return _simd_setzero_ps(); 605 } 606 }; 607 608 ////////////////////////////////////////////////////////////////////////// 609 /// TypeTraits - Format type traits specialization for UINT16 610 ////////////////////////////////////////////////////////////////////////// 611 template 
<> 612 struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16> 613 { 614 static const SWR_TYPE MyType = SWR_TYPE_UINT; 615 static float toFloat() { return 0.0; } 616 static float fromFloat() 617 { 618 SWR_NOT_IMPL; 619 return 0.0; 620 } 621 static simdscalar convertSrgb(simdscalar& in) 622 { 623 SWR_NOT_IMPL; 624 return _simd_setzero_ps(); 625 } 626 }; 627 628 ////////////////////////////////////////////////////////////////////////// 629 /// TypeTraits - Format type traits specialization for SINT16 630 ////////////////////////////////////////////////////////////////////////// 631 template <> 632 struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true> 633 { 634 static const SWR_TYPE MyType = SWR_TYPE_SINT; 635 static float toFloat() { return 0.0; } 636 static float fromFloat() 637 { 638 SWR_NOT_IMPL; 639 return 0.0; 640 } 641 static simdscalar convertSrgb(simdscalar& in) 642 { 643 SWR_NOT_IMPL; 644 return _simd_setzero_ps(); 645 } 646 }; 647 648 ////////////////////////////////////////////////////////////////////////// 649 /// TypeTraits - Format type traits specialization for UINT32 650 ////////////////////////////////////////////////////////////////////////// 651 template <> 652 struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32> 653 { 654 static const SWR_TYPE MyType = SWR_TYPE_UINT; 655 static float toFloat() { return 0.0; } 656 static float fromFloat() 657 { 658 SWR_NOT_IMPL; 659 return 0.0; 660 } 661 static simdscalar convertSrgb(simdscalar& in) 662 { 663 SWR_NOT_IMPL; 664 return _simd_setzero_ps(); 665 } 666 }; 667 668 ////////////////////////////////////////////////////////////////////////// 669 /// TypeTraits - Format type traits specialization for UINT32 670 ////////////////////////////////////////////////////////////////////////// 671 template <> 672 struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32> 673 { 674 static const SWR_TYPE MyType = SWR_TYPE_SINT; 675 static float toFloat() { return 0.0; } 676 static float fromFloat() 677 { 678 
SWR_NOT_IMPL; 679 return 0.0; 680 } 681 static simdscalar convertSrgb(simdscalar& in) 682 { 683 SWR_NOT_IMPL; 684 return _simd_setzero_ps(); 685 } 686 }; 687 688 ////////////////////////////////////////////////////////////////////////// 689 /// TypeTraits - Format type traits specialization for UNORM5 690 ////////////////////////////////////////////////////////////////////////// 691 template <> 692 struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5> 693 { 694 static const SWR_TYPE MyType = SWR_TYPE_UNORM; 695 static float toFloat() { return 1.0f / 31.0f; } 696 static float fromFloat() { return 31.0f; } 697 static simdscalar convertSrgb(simdscalar& in) 698 { 699 SWR_NOT_IMPL; 700 return _simd_setzero_ps(); 701 } 702 }; 703 704 ////////////////////////////////////////////////////////////////////////// 705 /// TypeTraits - Format type traits specialization for UNORM6 706 ////////////////////////////////////////////////////////////////////////// 707 template <> 708 struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6> 709 { 710 static const SWR_TYPE MyType = SWR_TYPE_UNORM; 711 static float toFloat() { return 1.0f / 63.0f; } 712 static float fromFloat() { return 63.0f; } 713 static simdscalar convertSrgb(simdscalar& in) 714 { 715 SWR_NOT_IMPL; 716 return _simd_setzero_ps(); 717 } 718 }; 719 720 ////////////////////////////////////////////////////////////////////////// 721 /// TypeTraits - Format type traits specialization for UNORM8 722 ////////////////////////////////////////////////////////////////////////// 723 template <> 724 struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8> 725 { 726 static const SWR_TYPE MyType = SWR_TYPE_UNORM; 727 static float toFloat() { return 1.0f / 255.0f; } 728 static float fromFloat() { return 255.0f; } 729 static simdscalar convertSrgb(simdscalar& in) 730 { 731 SWR_NOT_IMPL; 732 return _simd_setzero_ps(); 733 } 734 }; 735 736 ////////////////////////////////////////////////////////////////////////// 737 /// TypeTraits - Format type 
traits specialization for UNORM8 738 ////////////////////////////////////////////////////////////////////////// 739 template <> 740 struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true> 741 { 742 static const SWR_TYPE MyType = SWR_TYPE_SNORM; 743 static float toFloat() { return 1.0f / 127.0f; } 744 static float fromFloat() { return 127.0f; } 745 static simdscalar convertSrgb(simdscalar& in) 746 { 747 SWR_NOT_IMPL; 748 return _simd_setzero_ps(); 749 } 750 }; 751 752 ////////////////////////////////////////////////////////////////////////// 753 /// TypeTraits - Format type traits specialization for UNORM16 754 ////////////////////////////////////////////////////////////////////////// 755 template <> 756 struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16> 757 { 758 static const SWR_TYPE MyType = SWR_TYPE_UNORM; 759 static float toFloat() { return 1.0f / 65535.0f; } 760 static float fromFloat() { return 65535.0f; } 761 static simdscalar convertSrgb(simdscalar& in) 762 { 763 SWR_NOT_IMPL; 764 return _simd_setzero_ps(); 765 } 766 }; 767 768 ////////////////////////////////////////////////////////////////////////// 769 /// TypeTraits - Format type traits specialization for SNORM16 770 ////////////////////////////////////////////////////////////////////////// 771 template <> 772 struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true> 773 { 774 static const SWR_TYPE MyType = SWR_TYPE_UNORM; 775 static float toFloat() { return 1.0f / 32767.0f; } 776 static float fromFloat() { return 32767.0f; } 777 static simdscalar convertSrgb(simdscalar& in) 778 { 779 SWR_NOT_IMPL; 780 return _simd_setzero_ps(); 781 } 782 }; 783 784 ////////////////////////////////////////////////////////////////////////// 785 /// TypeTraits - Format type traits specialization for UNORM24 786 ////////////////////////////////////////////////////////////////////////// 787 template <> 788 struct TypeTraits<SWR_TYPE_UNORM, 24> : PackTraits<32> 789 { 790 static const SWR_TYPE MyType = 
SWR_TYPE_UNORM; 791 static float toFloat() { return 1.0f / 16777215.0f; } 792 static float fromFloat() { return 16777215.0f; } 793 static simdscalar convertSrgb(simdscalar& in) 794 { 795 SWR_NOT_IMPL; 796 return _simd_setzero_ps(); 797 } 798 }; 799 800 ////////////////////////////////////////////////////////////////////////// 801 // FLOAT Specializations from here on... 802 ////////////////////////////////////////////////////////////////////////// 803 #define TO_M128i(a) _mm_castps_si128(a) 804 #define TO_M128(a) _mm_castsi128_ps(a) 805 806 #include "math.h" 807 808 template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden> 809 inline static __m128 fastpow(__m128 arg) 810 { 811 __m128 ret = arg; 812 813 static const __m128 factor = 814 _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f) * 815 powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum)); 816 817 // Apply a constant pre-correction factor. 818 ret = _mm_mul_ps(ret, factor); 819 820 // Reinterpret arg as integer to obtain logarithm. 821 // asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret)); 822 ret = _mm_cvtepi32_ps(_mm_castps_si128(ret)); 823 824 // Multiply logarithm by power. 825 ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden)); 826 827 // Convert back to "integer" to exponentiate. 828 // asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret)); 829 ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret)); 830 831 return ret; 832 } 833 834 inline static __m128 pow512_4(__m128 arg) 835 { 836 // 5/12 is too small, so compute the 4th root of 20/12 instead. 837 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. 
838 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 839 __m128 xf = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg); 840 __m128 xover = _mm_mul_ps(arg, xf); 841 842 __m128 xfm1 = _mm_rsqrt_ps(xf); 843 __m128 x2 = _mm_mul_ps(arg, arg); 844 __m128 xunder = _mm_mul_ps(x2, xfm1); 845 846 // sqrt2 * over + 2 * sqrt2 * under 847 __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), 848 _mm_add_ps(xover, xunder)); 849 850 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); 851 xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg)); 852 return xavg; 853 } 854 855 inline static __m128 powf_wrapper(__m128 Base, float Exp) 856 { 857 float* f = (float*)(&Base); 858 859 return _mm_set_ps(powf(f[3], Exp), powf(f[2], Exp), powf(f[1], Exp), powf(f[0], Exp)); 860 } 861 862 static inline __m128 ConvertFloatToSRGB2(__m128& Src) 863 { 864 // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float 865 // value 866 __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src)); 867 868 // squeeze the mask down to 16 bits (4 bits per DWORD) 869 int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask); 870 871 __m128 Result; 872 873 // 874 if (CompareResult == 0xFFFF) 875 { 876 // all DWORDs are <= the threshold 877 Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); 878 } 879 else if (CompareResult == 0x0) 880 { 881 // all DWORDs are > the threshold 882 __m128 fSrc_0RGB = Src; 883 884 // --> 1.055f * c(1.0f/2.4f) - 0.055f 885 #if KNOB_USE_FAST_SRGB == TRUE 886 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. 
887 __m128 f = pow512_4(fSrc_0RGB); 888 #else 889 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); 890 #endif 891 f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); 892 Result = _mm_sub_ps(f, _mm_set1_ps(0.055f)); 893 } 894 else 895 { 896 // some DWORDs are <= the threshold and some are > threshold 897 __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f)); 898 899 __m128 fSrc_0RGB = Src; 900 901 // --> 1.055f * c(1.0f/2.4f) - 0.055f 902 #if KNOB_USE_FAST_SRGB == TRUE 903 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. 904 __m128 f = pow512_4(fSrc_0RGB); 905 #else 906 __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); 907 #endif 908 f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); 909 f = _mm_sub_ps(f, _mm_set1_ps(0.055f)); 910 911 // Clear the alpha (is garbage after the sub) 912 __m128i i = _mm_and_si128(TO_M128i(f), 913 _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)); 914 915 __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm)); 916 __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i); 917 __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart); 918 919 Result = TO_M128(CombinedParts); 920 } 921 922 return Result; 923 } 924 925 template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden> 926 inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value) 927 { 928 static const float factor1 = exp2(127.0f * expden / expnum - 127.0f) * 929 powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum); 930 931 // Apply a constant pre-correction factor. 932 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1)); 933 934 // Reinterpret arg as integer to obtain logarithm. 935 // asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result)); 936 result = _simd16_cvtepi32_ps(_simd16_castps_si(result)); 937 938 // Multiply logarithm by power. 
939 result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden)); 940 941 // Convert back to "integer" to exponentiate. 942 // asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result)); 943 result = _simd16_castsi_ps(_simd16_cvtps_epi32(result)); 944 945 return result; 946 } 947 948 inline static simd16scalar SIMDCALL pow512_4(simd16scalar const& arg) 949 { 950 // 5/12 is too small, so compute the 4th root of 20/12 instead. 951 // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow. 952 // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3 953 simd16scalar xf = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg); 954 simd16scalar xover = _simd16_mul_ps(arg, xf); 955 956 simd16scalar xfm1 = _simd16_rsqrt_ps(xf); 957 simd16scalar x2 = _simd16_mul_ps(arg, arg); 958 simd16scalar xunder = _simd16_mul_ps(x2, xfm1); 959 960 // sqrt2 * over + 2 * sqrt2 * under 961 simd16scalar xavg = 962 _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), 963 _simd16_add_ps(xover, xunder)); 964 965 xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg)); 966 xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg)); 967 968 return xavg; 969 } 970 971 inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar& base, float exp) 972 { 973 const float* f = reinterpret_cast<const float*>(&base); 974 975 return _simd16_set_ps(powf(f[15], exp), 976 powf(f[14], exp), 977 powf(f[13], exp), 978 powf(f[12], exp), 979 powf(f[11], exp), 980 powf(f[10], exp), 981 powf(f[9], exp), 982 powf(f[8], exp), 983 powf(f[7], exp), 984 powf(f[6], exp), 985 powf(f[5], exp), 986 powf(f[4], exp), 987 powf(f[3], exp), 988 powf(f[2], exp), 989 powf(f[1], exp), 990 powf(f[0], exp)); 991 } 992 993 // float to SRGB conversion formula 994 // 995 // if (value < 0.0031308f) 996 // value *= 12.92f; 997 // else 998 // value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f; 999 // 1000 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value) 1001 { 1002 // 
create a mask where the source is < the minimal SRGB float value 1003 const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f)); 1004 1005 // if all elements are < the threshold, result = value * 12.92 1006 simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f)); 1007 1008 if (_simd16_mask2int(mask) != 0xFFFF) 1009 { 1010 // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055 1011 #if KNOB_USE_FAST_SRGB == TRUE 1012 // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation. 1013 simd16scalar result2 = pow512_4(value); 1014 #else 1015 simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f); 1016 #endif 1017 1018 result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f)); 1019 result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f)); 1020 1021 #if (KNOB_ARCH == KNOB_ARCH_AVX512) 1022 // only native AVX512 can directly use the computed mask for the blend operation 1023 result = _mm512_mask_blend_ps(mask, result2, result); 1024 #else 1025 result = _simd16_blendv_ps( 1026 result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f))); 1027 #endif 1028 } 1029 1030 return result; 1031 } 1032 1033 ////////////////////////////////////////////////////////////////////////// 1034 /// TypeTraits - Format type traits specialization for FLOAT16 1035 ////////////////////////////////////////////////////////////////////////// 1036 template <> 1037 struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> 1038 { 1039 static const SWR_TYPE MyType = SWR_TYPE_FLOAT; 1040 static float toFloat() { return 1.0f; } 1041 static float fromFloat() { return 1.0f; } 1042 static simdscalar convertSrgb(simdscalar& in) 1043 { 1044 SWR_NOT_IMPL; 1045 return _simd_setzero_ps(); 1046 } 1047 1048 static simdscalar pack(const simdscalar& in) 1049 { 1050 #if KNOB_SIMD_WIDTH == 8 1051 #if (KNOB_ARCH == KNOB_ARCH_AVX) 1052 // input is 8 packed float32, output is 8 packed float16 1053 simdscalari src = _simd_castps_si(in); 1054 
1055 static const uint32_t FLOAT_EXP_BITS = 8; 1056 static const uint32_t FLOAT_MANTISSA_BITS = 23; 1057 static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1; 1058 static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS; 1059 1060 static const uint32_t HALF_EXP_BITS = 5; 1061 static const uint32_t HALF_MANTISSA_BITS = 10; 1062 static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS; 1063 1064 // minimum exponent required, exponents below this are flushed to 0. 1065 static const int32_t HALF_EXP_MIN = -14; 1066 static const int32_t FLOAT_EXP_BIAS = 127; 1067 static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; 1068 static const int32_t FLOAT_EXP_MIN_FTZ = 1069 FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand 1070 1071 // maximum exponent required, exponents above this are set to infinity 1072 static const int32_t HALF_EXP_MAX = 15; 1073 static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS; 1074 1075 const simdscalari vSignMask = _simd_set1_epi32(0x80000000); 1076 const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); 1077 const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); 1078 const simdscalari vExpMin = 1079 _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); 1080 const simdscalari vExpMinFtz = 1081 _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); 1082 const simdscalari vExpMax = 1083 _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); 1084 1085 simdscalari vSign = _simd_and_si(src, vSignMask); 1086 simdscalari vExp = _simd_and_si(src, vExpMask); 1087 simdscalari vMan = _simd_and_si(src, vManMask); 1088 1089 simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); 1090 simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); 1091 simdscalari vInfMask = 
_simd_cmpeq_epi32(vExpMask, vExp); 1092 simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); 1093 1094 simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), 1095 _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); 1096 1097 // pack output 16-bits into the lower 16-bits of each 32-bit channel 1098 simdscalari vDst = 1099 _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); 1100 vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); 1101 1102 // Flush To Zero 1103 vDst = _simd_andnot_si(vFTZMask, vDst); 1104 // Apply Infinites / NaN 1105 vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); 1106 1107 // Apply clamps 1108 vDst = _simd_andnot_si(vClampMask, vDst); 1109 vDst = _simd_or_si(vDst, _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); 1110 1111 // Compute Denormals (subnormals) 1112 if (!_mm256_testz_si256(vDenormMask, vDenormMask)) 1113 { 1114 uint32_t* pDenormMask = (uint32_t*)&vDenormMask; 1115 uint32_t* pExp = (uint32_t*)&vExp; 1116 uint32_t* pMan = (uint32_t*)&vMan; 1117 uint32_t* pDst = (uint32_t*)&vDst; 1118 for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) 1119 { 1120 if (pDenormMask[i]) 1121 { 1122 // Need to compute subnormal value 1123 uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS; 1124 uint32_t mantissa = 1125 pMan[i] | (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. 
1126 // Make it explicit 1127 1128 pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + 1129 (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); 1130 } 1131 } 1132 } 1133 1134 // Add in sign bits 1135 vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16)); 1136 1137 // Pack to lower 128-bits 1138 vDst = _mm256_castsi128_si256( 1139 _mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1))); 1140 1141 #if 0 1142 #if !defined(NDEBUG) 1143 simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)); 1144 1145 for (uint32_t i = 0; i < 4; ++i) 1146 { 1147 SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]); 1148 } 1149 #endif 1150 #endif 1151 1152 return _simd_castsi_ps(vDst); 1153 1154 #else 1155 return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC))); 1156 #endif 1157 #else 1158 #error Unsupported vector width 1159 #endif 1160 } 1161 1162 static simdscalar unpack(const simdscalar& in) 1163 { 1164 // input is 8 packed float16, output is 8 packed float32 1165 SWR_NOT_IMPL; // @todo 1166 return _simd_setzero_ps(); 1167 } 1168 1169 static simd16scalar pack(const simd16scalar& in) 1170 { 1171 simd16scalari result = _simd16_setzero_si(); 1172 simdscalari resultlo = _simd_setzero_si(); 1173 1174 #if (KNOB_ARCH == KNOB_ARCH_AVX) 1175 simdscalar simdlo = pack(_simd16_extract_ps(in, 0)); 1176 simdscalar simdhi = pack(_simd16_extract_ps(in, 1)); 1177 1178 __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0); 1179 __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0); 1180 1181 #else 1182 __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC); 1183 __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC); 1184 1185 #endif 1186 resultlo = _simd_insertf128_si(resultlo, templo, 0); 1187 resultlo = _simd_insertf128_si(resultlo, temphi, 1); 1188 1189 result = _simd16_insert_si(result, resultlo, 0); 1190 1191 return _simd16_castsi_ps(result); 
1192 } 1193 1194 static simd16scalar unpack(const simd16scalar& in) 1195 { 1196 // input is 16 packed float16, output is 16 packed float32 1197 SWR_NOT_IMPL; // @todo 1198 return _simd16_setzero_ps(); 1199 } 1200 }; 1201 1202 ////////////////////////////////////////////////////////////////////////// 1203 /// TypeTraits - Format type traits specialization for FLOAT32 1204 ////////////////////////////////////////////////////////////////////////// 1205 template <> 1206 struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32> 1207 { 1208 static const SWR_TYPE MyType = SWR_TYPE_FLOAT; 1209 static float toFloat() { return 1.0f; } 1210 static float fromFloat() { return 1.0f; } 1211 static inline simdscalar convertSrgb(simdscalar& in) 1212 { 1213 #if KNOB_SIMD_WIDTH == 8 1214 __m128 srcLo = _mm256_extractf128_ps(in, 0); 1215 __m128 srcHi = _mm256_extractf128_ps(in, 1); 1216 1217 srcLo = ConvertFloatToSRGB2(srcLo); 1218 srcHi = ConvertFloatToSRGB2(srcHi); 1219 1220 in = _mm256_insertf128_ps(in, srcLo, 0); 1221 in = _mm256_insertf128_ps(in, srcHi, 1); 1222 #else 1223 #error Unsupported vector width 1224 #endif 1225 return in; 1226 } 1227 1228 static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); } 1229 }; 1230 1231 ////////////////////////////////////////////////////////////////////////// 1232 /// FormatIntType - Calculate base integer type for pixel components based 1233 /// on total number of bits. Components can be smaller 1234 /// that this type, but the entire pixel must not be 1235 /// any smaller than this type. 
//////////////////////////////////////////////////////////////////////////
// Primary template: selected when the pixel needs more than 16 bits.
template <uint32_t bits, bool bits8 = (bits <= 8), bool bits16 = (bits <= 16)>
struct FormatIntType
{
    using TYPE = uint32_t;
};

// Pixel fits in 8 bits (and therefore also in 16).
template <uint32_t bits>
struct FormatIntType<bits, true, true>
{
    using TYPE = uint8_t;
};

// Pixel needs more than 8 bits but fits in 16.
template <uint32_t bits>
struct FormatIntType<bits, false, true>
{
    using TYPE = uint16_t;
};

//////////////////////////////////////////////////////////////////////////
/// Format1 - Bitfield for single component formats.
///           All four channel names alias the same x-bit field.
//////////////////////////////////////////////////////////////////////////
template <uint32_t x>
union Format1
{
    using TYPE = typename FormatIntType<x>::TYPE;

    struct
    {
        TYPE r : x;
    };

    ///@ g, b and a exist only to provide the full set of members needed in Formats.
    struct
    {
        TYPE g : x;
    };
    struct
    {
        TYPE b : x;
    };
    struct
    {
        TYPE a : x;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format2 - Bitfield for 2 component formats.
///           b and a alias r and g respectively.
//////////////////////////////////////////////////////////////////////////
template <uint32_t x, uint32_t y>
union Format2
{
    using TYPE = typename FormatIntType<x + y>::TYPE;

    struct
    {
        TYPE r : x;
        TYPE g : y;
    };

    ///@ b and a exist only to provide the full set of members needed in Formats.
    struct
    {
        TYPE b : x;
        TYPE a : y;
    };
};

//////////////////////////////////////////////////////////////////////////
/// Format3 - Bitfield for 3 component formats.
1305 ////////////////////////////////////////////////////////////////////////// 1306 template <uint32_t x, uint32_t y, uint32_t z> 1307 union Format3 1308 { 1309 typedef typename FormatIntType<x + y + z>::TYPE TYPE; 1310 1311 struct 1312 { 1313 TYPE r : x; 1314 TYPE g : y; 1315 TYPE b : z; 1316 }; 1317 TYPE a; ///@note This is here to provide full template needed in Formats. 1318 }; 1319 1320 ////////////////////////////////////////////////////////////////////////// 1321 /// Format4 - Bitfield for 4 component formats. 1322 ////////////////////////////////////////////////////////////////////////// 1323 template <uint32_t x, uint32_t y, uint32_t z, uint32_t w> 1324 struct Format4 1325 { 1326 typedef typename FormatIntType<x + y + z + w>::TYPE TYPE; 1327 1328 TYPE r : x; 1329 TYPE g : y; 1330 TYPE b : z; 1331 TYPE a : w; 1332 }; 1333 1334 ////////////////////////////////////////////////////////////////////////// 1335 /// ComponentTraits - Default components 1336 ////////////////////////////////////////////////////////////////////////// 1337 template <uint32_t x, uint32_t y, uint32_t z, uint32_t w> 1338 struct Defaults 1339 { 1340 INLINE static uint32_t GetDefault(uint32_t comp) 1341 { 1342 static const uint32_t defaults[4]{x, y, z, w}; 1343 return defaults[comp]; 1344 } 1345 }; 1346 1347 ////////////////////////////////////////////////////////////////////////// 1348 /// ComponentTraits - Component type traits. 
1349 ////////////////////////////////////////////////////////////////////////// 1350 template <SWR_TYPE X, 1351 uint32_t NumBitsX, 1352 SWR_TYPE Y = SWR_TYPE_UNKNOWN, 1353 uint32_t NumBitsY = 0, 1354 SWR_TYPE Z = SWR_TYPE_UNKNOWN, 1355 uint32_t NumBitsZ = 0, 1356 SWR_TYPE W = SWR_TYPE_UNKNOWN, 1357 uint32_t NumBitsW = 0> 1358 struct ComponentTraits 1359 { 1360 INLINE static SWR_TYPE GetType(uint32_t comp) 1361 { 1362 static const SWR_TYPE CompType[4]{X, Y, Z, W}; 1363 return CompType[comp]; 1364 } 1365 1366 INLINE static constexpr uint32_t GetConstBPC(uint32_t comp) 1367 { 1368 return (comp == 3) ? NumBitsW 1369 : ((comp == 2) ? NumBitsZ : ((comp == 1) ? NumBitsY : NumBitsX)); 1370 } 1371 1372 INLINE static uint32_t GetBPC(uint32_t comp) 1373 { 1374 static const uint32_t MyBpc[4]{NumBitsX, NumBitsY, NumBitsZ, NumBitsW}; 1375 return MyBpc[comp]; 1376 } 1377 1378 INLINE static bool isNormalized(uint32_t comp) 1379 { 1380 switch (comp) 1381 { 1382 case 0: 1383 return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false; 1384 case 1: 1385 return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false; 1386 case 2: 1387 return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false; 1388 case 3: 1389 return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? 
true : false; 1390 } 1391 SWR_INVALID("Invalid component: %d", comp); 1392 return false; 1393 } 1394 1395 INLINE static float toFloat(uint32_t comp) 1396 { 1397 switch (comp) 1398 { 1399 case 0: 1400 return TypeTraits<X, NumBitsX>::toFloat(); 1401 case 1: 1402 return TypeTraits<Y, NumBitsY>::toFloat(); 1403 case 2: 1404 return TypeTraits<Z, NumBitsZ>::toFloat(); 1405 case 3: 1406 return TypeTraits<W, NumBitsW>::toFloat(); 1407 } 1408 SWR_INVALID("Invalid component: %d", comp); 1409 return TypeTraits<X, NumBitsX>::toFloat(); 1410 } 1411 1412 INLINE static float fromFloat(uint32_t comp) 1413 { 1414 switch (comp) 1415 { 1416 case 0: 1417 return TypeTraits<X, NumBitsX>::fromFloat(); 1418 case 1: 1419 return TypeTraits<Y, NumBitsY>::fromFloat(); 1420 case 2: 1421 return TypeTraits<Z, NumBitsZ>::fromFloat(); 1422 case 3: 1423 return TypeTraits<W, NumBitsW>::fromFloat(); 1424 } 1425 SWR_INVALID("Invalid component: %d", comp); 1426 return TypeTraits<X, NumBitsX>::fromFloat(); 1427 } 1428 1429 INLINE static void loadSOA(uint32_t comp, const uint8_t* pSrc, simdscalar& dst) 1430 { 1431 switch (comp) 1432 { 1433 case 0: 1434 dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc); 1435 return; 1436 case 1: 1437 dst = TypeTraits<Y, NumBitsY>::loadSOA(pSrc); 1438 return; 1439 case 2: 1440 dst = TypeTraits<Z, NumBitsZ>::loadSOA(pSrc); 1441 return; 1442 case 3: 1443 dst = TypeTraits<W, NumBitsW>::loadSOA(pSrc); 1444 return; 1445 } 1446 SWR_INVALID("Invalid component: %d", comp); 1447 dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc); 1448 } 1449 1450 INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src) 1451 { 1452 switch (comp) 1453 { 1454 case 0: 1455 TypeTraits<X, NumBitsX>::storeSOA(pDst, src); 1456 return; 1457 case 1: 1458 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src); 1459 return; 1460 case 2: 1461 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src); 1462 return; 1463 case 3: 1464 TypeTraits<W, NumBitsW>::storeSOA(pDst, src); 1465 return; 1466 } 1467 SWR_INVALID("Invalid 
component: %d", comp); 1468 } 1469 1470 INLINE static simdscalar unpack(uint32_t comp, simdscalar& in) 1471 { 1472 simdscalar out; 1473 switch (comp) 1474 { 1475 case 0: 1476 out = TypeTraits<X, NumBitsX>::unpack(in); 1477 break; 1478 case 1: 1479 out = TypeTraits<Y, NumBitsY>::unpack(in); 1480 break; 1481 case 2: 1482 out = TypeTraits<Z, NumBitsZ>::unpack(in); 1483 break; 1484 case 3: 1485 out = TypeTraits<W, NumBitsW>::unpack(in); 1486 break; 1487 default: 1488 SWR_INVALID("Invalid component: %d", comp); 1489 out = in; 1490 break; 1491 } 1492 return out; 1493 } 1494 1495 INLINE static simdscalar pack(uint32_t comp, simdscalar& in) 1496 { 1497 simdscalar out; 1498 switch (comp) 1499 { 1500 case 0: 1501 out = TypeTraits<X, NumBitsX>::pack(in); 1502 break; 1503 case 1: 1504 out = TypeTraits<Y, NumBitsY>::pack(in); 1505 break; 1506 case 2: 1507 out = TypeTraits<Z, NumBitsZ>::pack(in); 1508 break; 1509 case 3: 1510 out = TypeTraits<W, NumBitsW>::pack(in); 1511 break; 1512 default: 1513 SWR_INVALID("Invalid component: %d", comp); 1514 out = in; 1515 break; 1516 } 1517 return out; 1518 } 1519 1520 INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar& in) 1521 { 1522 switch (comp) 1523 { 1524 case 0: 1525 return TypeTraits<X, NumBitsX>::convertSrgb(in); 1526 case 1: 1527 return TypeTraits<Y, NumBitsY>::convertSrgb(in); 1528 case 2: 1529 return TypeTraits<Z, NumBitsZ>::convertSrgb(in); 1530 case 3: 1531 return TypeTraits<W, NumBitsW>::convertSrgb(in); 1532 } 1533 SWR_INVALID("Invalid component: %d", comp); 1534 return TypeTraits<X, NumBitsX>::convertSrgb(in); 1535 } 1536 1537 INLINE static void SIMDCALL loadSOA(uint32_t comp, const uint8_t* pSrc, simd16scalar& dst) 1538 { 1539 switch (comp) 1540 { 1541 case 0: 1542 dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); 1543 return; 1544 case 1: 1545 dst = TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc); 1546 return; 1547 case 2: 1548 dst = TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc); 1549 return; 1550 case 3: 1551 dst = 
TypeTraits<W, NumBitsW>::loadSOA_16(pSrc); 1552 return; 1553 } 1554 SWR_INVALID("Invalid component: %d", comp); 1555 dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); 1556 } 1557 1558 INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src) 1559 { 1560 switch (comp) 1561 { 1562 case 0: 1563 TypeTraits<X, NumBitsX>::storeSOA(pDst, src); 1564 return; 1565 case 1: 1566 TypeTraits<Y, NumBitsY>::storeSOA(pDst, src); 1567 return; 1568 case 2: 1569 TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src); 1570 return; 1571 case 3: 1572 TypeTraits<W, NumBitsW>::storeSOA(pDst, src); 1573 return; 1574 } 1575 SWR_INVALID("Invalid component: %d", comp); 1576 TypeTraits<X, NumBitsX>::storeSOA(pDst, src); 1577 } 1578 1579 INLINE static simd16scalar unpack(uint32_t comp, simd16scalar& in) 1580 { 1581 switch (comp) 1582 { 1583 case 0: 1584 return TypeTraits<X, NumBitsX>::unpack(in); 1585 case 1: 1586 return TypeTraits<Y, NumBitsY>::unpack(in); 1587 case 2: 1588 return TypeTraits<Z, NumBitsZ>::unpack(in); 1589 case 3: 1590 return TypeTraits<W, NumBitsW>::unpack(in); 1591 } 1592 SWR_INVALID("Invalid component: %d", comp); 1593 return TypeTraits<X, NumBitsX>::unpack(in); 1594 } 1595 1596 INLINE static simd16scalar pack(uint32_t comp, simd16scalar& in) 1597 { 1598 switch (comp) 1599 { 1600 case 0: 1601 return TypeTraits<X, NumBitsX>::pack(in); 1602 case 1: 1603 return TypeTraits<Y, NumBitsY>::pack(in); 1604 case 2: 1605 return TypeTraits<Z, NumBitsZ>::pack(in); 1606 case 3: 1607 return TypeTraits<W, NumBitsW>::pack(in); 1608 } 1609 SWR_INVALID("Invalid component: %d", comp); 1610 return TypeTraits<X, NumBitsX>::pack(in); 1611 } 1612 1613 INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar& in) 1614 { 1615 switch (comp) 1616 { 1617 case 0: 1618 return TypeTraits<X, NumBitsX>::convertSrgb(in); 1619 case 1: 1620 return TypeTraits<Y, NumBitsY>::convertSrgb(in); 1621 case 2: 1622 return TypeTraits<Z, NumBitsZ>::convertSrgb(in); 1623 case 3: 1624 return 
TypeTraits<W, NumBitsW>::convertSrgb(in); 1625 } 1626 SWR_INVALID("Invalid component: %d", comp); 1627 return TypeTraits<X, NumBitsX>::convertSrgb(in); 1628 } 1629 }; 1630