1 /****************************************************************************
2  * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * @file formats.h
24  *
25  * @brief Definitions for SWR_FORMAT functions.
26  *
27  ******************************************************************************/
28 #pragma once
29 
30 #include "utils.h"
31 #include "common/simdintrin.h"
32 
33 //////////////////////////////////////////////////////////////////////////
34 /// PackTraits - Helpers for packing / unpacking same pixel sizes
35 //////////////////////////////////////////////////////////////////////////
36 template <uint32_t NumBits, bool Signed = false>
37 struct PackTraits
38 {
39     static const uint32_t MyNumBits = NumBits;
40 
41     static simdscalar     loadSOA(const uint8_t* pSrc)                   = delete;
42     static void           storeSOA(uint8_t* pDst, simdscalar const& src) = delete;
43     static simdscalar     unpack(simdscalar& in)                         = delete;
44     static simdscalar     pack(simdscalar& in)                           = delete;
45 
46     static simd16scalar  loadSOA_16(const uint8_t* pSrc)                  = delete;
47     static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete;
48     static simd16scalar  unpack(simd16scalar& in)                         = delete;
49     static simd16scalar  pack(simd16scalar& in)                           = delete;
50 };
51 
52 //////////////////////////////////////////////////////////////////////////
53 /// PackTraits - Helpers for packing / unpacking unused channels
54 //////////////////////////////////////////////////////////////////////////
55 template <>
56 struct PackTraits<0, false>
57 {
58     static const uint32_t MyNumBits = 0;
59 
60     static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_setzero_ps(); }
61     static void       storeSOA(uint8_t* pDst, simdscalar const& src) { return; }
62     static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); }
63     static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); }
64 
65     static simd16scalar  loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); }
66     static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; }
67     static simd16scalar  unpack(simd16scalar& in) { return _simd16_setzero_ps(); }
68     static simd16scalar  pack(simd16scalar& in) { return _simd16_setzero_ps(); }
69 };
70 
71 //////////////////////////////////////////////////////////////////////////
72 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
73 //////////////////////////////////////////////////////////////////////////
74 template <>
75 struct PackTraits<8, false>
76 {
77     static const uint32_t MyNumBits = 8;
78 
79     static simdscalar loadSOA(const uint8_t* pSrc)
80     {
81 #if KNOB_SIMD_WIDTH == 8
82         __m256 result = _mm256_setzero_ps();
83         __m128 vLo    = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
84         return _mm256_insertf128_ps(result, vLo, 0);
85 #else
86 #error Unsupported vector width
87 #endif
88     }
89 
90     static void storeSOA(uint8_t* pDst, simdscalar const& src)
91     {
92         // store simd bytes
93 #if KNOB_SIMD_WIDTH == 8
94         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
95 #else
96 #error Unsupported vector width
97 #endif
98     }
99 
100     static simdscalar unpack(simdscalar& in)
101     {
102 #if KNOB_SIMD_WIDTH == 8
103 #if KNOB_ARCH <= KNOB_ARCH_AVX
104         __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
105         __m128i resLo = _mm_cvtepu8_epi32(src);
106         __m128i resHi =
107             _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
108 
109         __m256i result = _mm256_castsi128_si256(resLo);
110         result         = _mm256_insertf128_si256(result, resHi, 1);
111         return simdscalar{_mm256_castsi256_ps(result)};
112 #else
113         return _mm256_castsi256_ps(
114             _mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
115 #endif
116 #else
117 #error Unsupported vector width
118 #endif
119     }
120 
121     static simdscalar pack(simdscalar& in)
122     {
123 #if KNOB_SIMD_WIDTH == 8
124         simdscalari src = _simd_castps_si(in);
125         __m128i     res16 =
126             _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
127         __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
128         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
129 #else
130 #error Unsupported vector width
131 #endif
132     }
133 
134     static simd16scalar loadSOA_16(const uint8_t* pSrc)
135     {
136         simd16scalar result   = _simd16_setzero_ps();
137         simdscalar   resultlo = _simd_setzero_ps();
138 
139         const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
140 
141         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
142         result   = _simd16_insert_ps(result, resultlo, 0);
143 
144         return result;
145     }
146 
147     static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
148     {
149         // store simd16 bytes
150         _mm_store_ps(reinterpret_cast<float*>(pDst),
151                      _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
152     }
153 
154     static simd16scalar unpack(simd16scalar& in)
155     {
156         simd4scalari  tmp    = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
157         simd16scalari result = _simd16_cvtepu8_epi32(tmp);
158 
159         return _simd16_castsi_ps(result);
160     }
161 
162     static simd16scalar pack(simd16scalar& in)
163     {
164         // clang-format off
165 
166         simd16scalari result = _simd16_setzero_si();
167 
168         simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));  // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
169         simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));  // r8 r9 rA rB rC rD rE rF
170 
171         simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);   // r0 r1 r2 r3 r8 r9 rA rB (32b)
172         simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);   // r4 r5 r6 r7 rC rD rE rF (32b)
173 
174         simdscalari pack = _simd_packus_epi32(permlo, permhi);          // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
175 
176         const simdscalari zero = _simd_setzero_si();
177 
178         permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0)     // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
179         permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1)     // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
180 
181         pack = _simd_packus_epi16(permlo, permhi);                      // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
182 
183         result = _simd16_insert_si(result, pack, 0);
184 
185         return _simd16_castsi_ps(result);
186 
187         // clang-format on
188     }
189 };
190 
191 //////////////////////////////////////////////////////////////////////////
192 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
193 //////////////////////////////////////////////////////////////////////////
194 template <>
195 struct PackTraits<8, true>
196 {
197     static const uint32_t MyNumBits = 8;
198 
199     static simdscalar loadSOA(const uint8_t* pSrc)
200     {
201 #if KNOB_SIMD_WIDTH == 8
202         __m256 result = _mm256_setzero_ps();
203         __m128 vLo    = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
204         return _mm256_insertf128_ps(result, vLo, 0);
205 #else
206 #error Unsupported vector width
207 #endif
208     }
209 
210     static void storeSOA(uint8_t* pDst, simdscalar const& src)
211     {
212         // store simd bytes
213 #if KNOB_SIMD_WIDTH == 8
214         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
215 #else
216 #error Unsupported vector width
217 #endif
218     }
219 
220     static simdscalar unpack(simdscalar& in)
221     {
222 #if KNOB_SIMD_WIDTH == 8
223 #if KNOB_ARCH <= KNOB_ARCH_AVX
224         SWR_INVALID("I think this may be incorrect.");
225         __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
226         __m128i resLo = _mm_cvtepi8_epi32(src);
227         __m128i resHi =
228             _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
229 
230         __m256i result = _mm256_castsi128_si256(resLo);
231         result         = _mm256_insertf128_si256(result, resHi, 1);
232         return _mm256_castsi256_ps(result);
233 #else
234         return _mm256_castsi256_ps(
235             _mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
236 #endif
237 #else
238 #error Unsupported vector width
239 #endif
240     }
241 
242     static simdscalar pack(simdscalar& in)
243     {
244 #if KNOB_SIMD_WIDTH == 8
245         simdscalari src = _simd_castps_si(in);
246         __m128i     res16 =
247             _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
248         __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
249         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
250 #else
251 #error Unsupported vector width
252 #endif
253     }
254 
255     static simd16scalar loadSOA_16(const uint8_t* pSrc)
256     {
257         simd16scalar result   = _simd16_setzero_ps();
258         simdscalar   resultlo = _simd_setzero_ps();
259 
260         const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
261 
262         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
263         result   = _simd16_insert_ps(result, resultlo, 0);
264 
265         return result;
266     }
267 
268     static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
269     {
270         // store simd16 bytes
271         _mm_store_ps(reinterpret_cast<float*>(pDst),
272                      _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
273     }
274 
275     static simd16scalar unpack(simd16scalar& in)
276     {
277         simd4scalari  tmp    = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
278         simd16scalari result = _simd16_cvtepu8_epi32(tmp);
279 
280         return _simd16_castsi_ps(result);
281     }
282 
283     static simd16scalar pack(simd16scalar& in)
284     {
285         // clang-format off
286 
287         simd16scalari result = _simd16_setzero_si();
288 
289         simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));  // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
290         simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));  // r8 r9 rA rB rC rD rE rF
291 
292         simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);   // r0 r1 r2 r3 r8 r9 rA rB (32b)
293         simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);   // r4 r5 r6 r7 rC rD rE rF (32b)
294 
295         simdscalari pack = _simd_packs_epi32(permlo, permhi);           // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
296 
297         const simdscalari zero = _simd_setzero_si();
298 
299         permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0)     // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
300         permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1)     // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
301 
302         pack = _simd_packs_epi16(permlo, permhi);                       // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
303 
304         result = _simd16_insert_si(result, pack, 0);
305 
306         return _simd16_castsi_ps(result);
307 
308         // clang-format on
309     }
310 };
311 
312 //////////////////////////////////////////////////////////////////////////
313 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
314 //////////////////////////////////////////////////////////////////////////
315 template <>
316 struct PackTraits<16, false>
317 {
318     static const uint32_t MyNumBits = 16;
319 
320     static simdscalar loadSOA(const uint8_t* pSrc)
321     {
322 #if KNOB_SIMD_WIDTH == 8
323         __m256 result = _mm256_setzero_ps();
324         __m128 vLo    = _mm_load_ps((const float*)pSrc);
325         return _mm256_insertf128_ps(result, vLo, 0);
326 #else
327 #error Unsupported vector width
328 #endif
329     }
330 
331     static void storeSOA(uint8_t* pDst, simdscalar const& src)
332     {
333 #if KNOB_SIMD_WIDTH == 8
334         // store 16B (2B * 8)
335         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
336 #else
337 #error Unsupported vector width
338 #endif
339     }
340 
341     static simdscalar unpack(simdscalar& in)
342     {
343 #if KNOB_SIMD_WIDTH == 8
344 #if KNOB_ARCH <= KNOB_ARCH_AVX
345         __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
346         __m128i resLo = _mm_cvtepu16_epi32(src);
347         __m128i resHi =
348             _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
349 
350         __m256i result = _mm256_castsi128_si256(resLo);
351         result         = _mm256_insertf128_si256(result, resHi, 1);
352         return _mm256_castsi256_ps(result);
353 #else
354         return _mm256_castsi256_ps(
355             _mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
356 #endif
357 #else
358 #error Unsupported vector width
359 #endif
360     }
361 
362     static simdscalar pack(simdscalar& in)
363     {
364 #if KNOB_SIMD_WIDTH == 8
365         simdscalari src = _simd_castps_si(in);
366         __m256i     res = _mm256_castsi128_si256(
367             _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
368         return _mm256_castsi256_ps(res);
369 #else
370 #error Unsupported vector width
371 #endif
372     }
373 
374     static simd16scalar loadSOA_16(const uint8_t* pSrc)
375     {
376         simd16scalar result = _simd16_setzero_ps();
377 
378         simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
379 
380         result = _simd16_insert_ps(result, resultlo, 0);
381 
382         return result;
383     }
384 
385     static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
386     {
387         _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0));
388     }
389 
390     static simd16scalar unpack(simd16scalar& in)
391     {
392         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
393 
394         return _simd16_castsi_ps(result);
395     }
396 
397     static simd16scalar pack(simd16scalar& in)
398     {
399         // clang-format off
400 
401         const simd16scalari zero = _simd16_setzero_si();
402 
403         simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
404         simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
405 
406         simd16scalari result = _simd16_packus_epi32(permlo, permhi);                        // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
407 
408         return _simd16_castsi_ps(result);
409 
410         // clang-format on
411     }
412 };
413 
414 //////////////////////////////////////////////////////////////////////////
415 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
416 //////////////////////////////////////////////////////////////////////////
417 template <>
418 struct PackTraits<16, true>
419 {
420     static const uint32_t MyNumBits = 16;
421 
422     static simdscalar loadSOA(const uint8_t* pSrc)
423     {
424 #if KNOB_SIMD_WIDTH == 8
425         __m256 result = _mm256_setzero_ps();
426         __m128 vLo    = _mm_load_ps((const float*)pSrc);
427         return _mm256_insertf128_ps(result, vLo, 0);
428 #else
429 #error Unsupported vector width
430 #endif
431     }
432 
433     static void storeSOA(uint8_t* pDst, simdscalar const& src)
434     {
435 #if KNOB_SIMD_WIDTH == 8
436         // store 16B (2B * 8)
437         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
438 #else
439 #error Unsupported vector width
440 #endif
441     }
442 
443     static simdscalar unpack(simdscalar& in)
444     {
445 #if KNOB_SIMD_WIDTH == 8
446 #if KNOB_ARCH <= KNOB_ARCH_AVX
447         SWR_INVALID("I think this may be incorrect.");
448         __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
449         __m128i resLo = _mm_cvtepi16_epi32(src);
450         __m128i resHi =
451             _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
452 
453         __m256i result = _mm256_castsi128_si256(resLo);
454         result         = _mm256_insertf128_si256(result, resHi, 1);
455         return _mm256_castsi256_ps(result);
456 #else
457         return _mm256_castsi256_ps(
458             _mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
459 #endif
460 #else
461 #error Unsupported vector width
462 #endif
463     }
464 
465     static simdscalar pack(simdscalar& in)
466     {
467 #if KNOB_SIMD_WIDTH == 8
468         simdscalari src = _simd_castps_si(in);
469         __m256i     res = _mm256_castsi128_si256(
470             _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
471         return _mm256_castsi256_ps(res);
472 #else
473 #error Unsupported vector width
474 #endif
475     }
476 
477     static simd16scalar loadSOA_16(const uint8_t* pSrc)
478     {
479         simd16scalar result = _simd16_setzero_ps();
480 
481         simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
482 
483         result = _simd16_insert_ps(result, resultlo, 0);
484 
485         return result;
486     }
487 
488     static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
489     {
490         _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0));
491     }
492 
493     static simd16scalar unpack(simd16scalar& in)
494     {
495         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
496 
497         return _simd16_castsi_ps(result);
498     }
499 
500     static simd16scalar pack(simd16scalar& in)
501     {
502         // clang-format off
503 
504         const simd16scalari zero = _simd16_setzero_si();
505 
506         simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
507         simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
508 
509         simd16scalari result = _simd16_packs_epi32(permlo, permhi);                         // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
510 
511         return _simd16_castsi_ps(result);
512 
513         // clang-format on
514     }
515 };
516 
517 //////////////////////////////////////////////////////////////////////////
518 /// PackTraits - Helpers for packing / unpacking 32 bit channels
519 //////////////////////////////////////////////////////////////////////////
520 template <>
521 struct PackTraits<32, false>
522 {
523     static const uint32_t MyNumBits = 32;
524 
525     static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_load_ps((const float*)pSrc); }
526     static void       storeSOA(uint8_t* pDst, simdscalar const& src)
527     {
528         _simd_store_ps((float*)pDst, src);
529     }
530     static simdscalar unpack(simdscalar& in) { return in; }
531     static simdscalar pack(simdscalar& in) { return in; }
532 
533     static simd16scalar loadSOA_16(const uint8_t* pSrc)
534     {
535         return _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
536     }
537 
538     static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
539     {
540         _simd16_store_ps(reinterpret_cast<float*>(pDst), src);
541     }
542 
543     static simd16scalar unpack(simd16scalar& in) { return in; }
544 
545     static simd16scalar pack(simd16scalar& in) { return in; }
546 };
547 
548 //////////////////////////////////////////////////////////////////////////
549 /// TypeTraits - Format type traits.
550 //////////////////////////////////////////////////////////////////////////
551 template <SWR_TYPE type, uint32_t NumBits>
552 struct TypeTraits : PackTraits<NumBits>
553 {
554     static const SWR_TYPE MyType = type;
555     static float          toFloat() { return 0.0; }
556     static float          fromFloat()
557     {
558         SWR_NOT_IMPL;
559         return 0.0;
560     }
561     static simdscalar convertSrgb(simdscalar& in)
562     {
563         SWR_NOT_IMPL;
564         return _simd_setzero_ps();
565     }
566 };
567 
568 //////////////////////////////////////////////////////////////////////////
569 /// TypeTraits - Format type traits specialization for UINT8
570 //////////////////////////////////////////////////////////////////////////
571 template <>
572 struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
573 {
574     static const SWR_TYPE MyType = SWR_TYPE_UINT;
575     static float          toFloat() { return 0.0; }
576     static float          fromFloat()
577     {
578         SWR_NOT_IMPL;
579         return 0.0;
580     }
581     static simdscalar convertSrgb(simdscalar& in)
582     {
583         SWR_NOT_IMPL;
584         return _simd_setzero_ps();
585     }
586 };
587 
588 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT8
590 //////////////////////////////////////////////////////////////////////////
591 template <>
592 struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
593 {
594     static const SWR_TYPE MyType = SWR_TYPE_SINT;
595     static float          toFloat() { return 0.0; }
596     static float          fromFloat()
597     {
598         SWR_NOT_IMPL;
599         return 0.0;
600     }
601     static simdscalar convertSrgb(simdscalar& in)
602     {
603         SWR_NOT_IMPL;
604         return _simd_setzero_ps();
605     }
606 };
607 
608 //////////////////////////////////////////////////////////////////////////
609 /// TypeTraits - Format type traits specialization for UINT16
610 //////////////////////////////////////////////////////////////////////////
611 template <>
612 struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
613 {
614     static const SWR_TYPE MyType = SWR_TYPE_UINT;
615     static float          toFloat() { return 0.0; }
616     static float          fromFloat()
617     {
618         SWR_NOT_IMPL;
619         return 0.0;
620     }
621     static simdscalar convertSrgb(simdscalar& in)
622     {
623         SWR_NOT_IMPL;
624         return _simd_setzero_ps();
625     }
626 };
627 
628 //////////////////////////////////////////////////////////////////////////
629 /// TypeTraits - Format type traits specialization for SINT16
630 //////////////////////////////////////////////////////////////////////////
631 template <>
632 struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
633 {
634     static const SWR_TYPE MyType = SWR_TYPE_SINT;
635     static float          toFloat() { return 0.0; }
636     static float          fromFloat()
637     {
638         SWR_NOT_IMPL;
639         return 0.0;
640     }
641     static simdscalar convertSrgb(simdscalar& in)
642     {
643         SWR_NOT_IMPL;
644         return _simd_setzero_ps();
645     }
646 };
647 
648 //////////////////////////////////////////////////////////////////////////
649 /// TypeTraits - Format type traits specialization for UINT32
650 //////////////////////////////////////////////////////////////////////////
651 template <>
652 struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
653 {
654     static const SWR_TYPE MyType = SWR_TYPE_UINT;
655     static float          toFloat() { return 0.0; }
656     static float          fromFloat()
657     {
658         SWR_NOT_IMPL;
659         return 0.0;
660     }
661     static simdscalar convertSrgb(simdscalar& in)
662     {
663         SWR_NOT_IMPL;
664         return _simd_setzero_ps();
665     }
666 };
667 
668 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SINT32
670 //////////////////////////////////////////////////////////////////////////
671 template <>
672 struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
673 {
674     static const SWR_TYPE MyType = SWR_TYPE_SINT;
675     static float          toFloat() { return 0.0; }
676     static float          fromFloat()
677     {
678         SWR_NOT_IMPL;
679         return 0.0;
680     }
681     static simdscalar convertSrgb(simdscalar& in)
682     {
683         SWR_NOT_IMPL;
684         return _simd_setzero_ps();
685     }
686 };
687 
688 //////////////////////////////////////////////////////////////////////////
689 /// TypeTraits - Format type traits specialization for UNORM5
690 //////////////////////////////////////////////////////////////////////////
691 template <>
692 struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
693 {
694     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
695     static float          toFloat() { return 1.0f / 31.0f; }
696     static float          fromFloat() { return 31.0f; }
697     static simdscalar     convertSrgb(simdscalar& in)
698     {
699         SWR_NOT_IMPL;
700         return _simd_setzero_ps();
701     }
702 };
703 
704 //////////////////////////////////////////////////////////////////////////
705 /// TypeTraits - Format type traits specialization for UNORM6
706 //////////////////////////////////////////////////////////////////////////
707 template <>
708 struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
709 {
710     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
711     static float          toFloat() { return 1.0f / 63.0f; }
712     static float          fromFloat() { return 63.0f; }
713     static simdscalar     convertSrgb(simdscalar& in)
714     {
715         SWR_NOT_IMPL;
716         return _simd_setzero_ps();
717     }
718 };
719 
720 //////////////////////////////////////////////////////////////////////////
721 /// TypeTraits - Format type traits specialization for UNORM8
722 //////////////////////////////////////////////////////////////////////////
723 template <>
724 struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
725 {
726     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
727     static float          toFloat() { return 1.0f / 255.0f; }
728     static float          fromFloat() { return 255.0f; }
729     static simdscalar     convertSrgb(simdscalar& in)
730     {
731         SWR_NOT_IMPL;
732         return _simd_setzero_ps();
733     }
734 };
735 
736 //////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for SNORM8
738 //////////////////////////////////////////////////////////////////////////
739 template <>
740 struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
741 {
742     static const SWR_TYPE MyType = SWR_TYPE_SNORM;
743     static float          toFloat() { return 1.0f / 127.0f; }
744     static float          fromFloat() { return 127.0f; }
745     static simdscalar     convertSrgb(simdscalar& in)
746     {
747         SWR_NOT_IMPL;
748         return _simd_setzero_ps();
749     }
750 };
751 
752 //////////////////////////////////////////////////////////////////////////
753 /// TypeTraits - Format type traits specialization for UNORM16
754 //////////////////////////////////////////////////////////////////////////
755 template <>
756 struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
757 {
758     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
759     static float          toFloat() { return 1.0f / 65535.0f; }
760     static float          fromFloat() { return 65535.0f; }
761     static simdscalar     convertSrgb(simdscalar& in)
762     {
763         SWR_NOT_IMPL;
764         return _simd_setzero_ps();
765     }
766 };
767 
768 //////////////////////////////////////////////////////////////////////////
769 /// TypeTraits - Format type traits specialization for SNORM16
770 //////////////////////////////////////////////////////////////////////////
771 template <>
772 struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
773 {
774     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
775     static float          toFloat() { return 1.0f / 32767.0f; }
776     static float          fromFloat() { return 32767.0f; }
777     static simdscalar     convertSrgb(simdscalar& in)
778     {
779         SWR_NOT_IMPL;
780         return _simd_setzero_ps();
781     }
782 };
783 
784 //////////////////////////////////////////////////////////////////////////
785 /// TypeTraits - Format type traits specialization for UNORM24
786 //////////////////////////////////////////////////////////////////////////
787 template <>
788 struct TypeTraits<SWR_TYPE_UNORM, 24> : PackTraits<32>
789 {
790     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
791     static float          toFloat() { return 1.0f / 16777215.0f; }
792     static float          fromFloat() { return 16777215.0f; }
793     static simdscalar     convertSrgb(simdscalar& in)
794     {
795         SWR_NOT_IMPL;
796         return _simd_setzero_ps();
797     }
798 };
799 
800 //////////////////////////////////////////////////////////////////////////
801 // FLOAT Specializations from here on...
802 //////////////////////////////////////////////////////////////////////////
// Bit-preserving reinterpretation between __m128 (float) and __m128i (integer).
#define TO_M128i(a) _mm_castps_si128((a))
#define TO_M128(a) _mm_castsi128_ps((a))
805 
806 #include "math.h"
807 
/// @brief Approximates a power of each lane of arg with exponent
///        expnum/expden, using the float bit pattern as a logarithm.
///
/// The input is first multiplied by a constant built from the exponent and
/// from coeffnum/coeffden; its bits are then read as an integer, scaled by
/// expnum/expden, and the rounded integer result is read back as float bits.
template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
inline static __m128 fastpow(__m128 arg)
{
    // Constant pre-correction factor, initialized once per instantiation.
    static const __m128 factor =
        _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f) *
                    powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));

    const __m128 corrected = _mm_mul_ps(arg, factor);

    // Reinterpret the float bits as integers to obtain a logarithm.
    const __m128 logarithm = _mm_cvtepi32_ps(_mm_castps_si128(corrected));

    // Multiply the logarithm by the requested power.
    const __m128 scaledLog = _mm_mul_ps(logarithm, _mm_set1_ps(1.0f * expnum / expden));

    // Round to integer and reinterpret as float bits to exponentiate.
    return _mm_castsi128_ps(_mm_cvtps_epi32(scaledLog));
}
833 
834 inline static __m128 pow512_4(__m128 arg)
835 {
836     // 5/12 is too small, so compute the 4th root of 20/12 instead.
837     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
838     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
839     __m128 xf    = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg);
840     __m128 xover = _mm_mul_ps(arg, xf);
841 
842     __m128 xfm1   = _mm_rsqrt_ps(xf);
843     __m128 x2     = _mm_mul_ps(arg, arg);
844     __m128 xunder = _mm_mul_ps(x2, xfm1);
845 
846     // sqrt2 * over + 2 * sqrt2 * under
847     __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
848                              _mm_add_ps(xover, xunder));
849 
850     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
851     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
852     return xavg;
853 }
854 
855 inline static __m128 powf_wrapper(__m128 Base, float Exp)
856 {
857     float* f = (float*)(&Base);
858 
859     return _mm_set_ps(powf(f[3], Exp), powf(f[2], Exp), powf(f[1], Exp), powf(f[0], Exp));
860 }
861 
862 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
863 {
864     // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float
865     // value
866     __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
867 
868     // squeeze the mask down to 16 bits (4 bits per DWORD)
869     int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
870 
871     __m128 Result;
872 
873     //
874     if (CompareResult == 0xFFFF)
875     {
876         // all DWORDs are <= the threshold
877         Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
878     }
879     else if (CompareResult == 0x0)
880     {
881         // all DWORDs are > the threshold
882         __m128 fSrc_0RGB = Src;
883 
884         // --> 1.055f * c(1.0f/2.4f) - 0.055f
885 #if KNOB_USE_FAST_SRGB == TRUE
886         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
887         __m128 f = pow512_4(fSrc_0RGB);
888 #else
889         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
890 #endif
891         f      = _mm_mul_ps(f, _mm_set1_ps(1.055f));
892         Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
893     }
894     else
895     {
896         // some DWORDs are <= the threshold and some are > threshold
897         __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
898 
899         __m128 fSrc_0RGB = Src;
900 
901         // --> 1.055f * c(1.0f/2.4f) - 0.055f
902 #if KNOB_USE_FAST_SRGB == TRUE
903         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
904         __m128 f = pow512_4(fSrc_0RGB);
905 #else
906         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
907 #endif
908         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
909         f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
910 
911         // Clear the alpha (is garbage after the sub)
912         __m128i i = _mm_and_si128(TO_M128i(f),
913                                   _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
914 
915         __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
916         __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
917         __m128i CombinedParts    = _mm_or_si128(LessThanPart, GreaterEqualPart);
918 
919         Result = TO_M128(CombinedParts);
920     }
921 
922     return Result;
923 }
924 
925 template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
926 inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value)
927 {
928     static const float factor1 = exp2(127.0f * expden / expnum - 127.0f) *
929                                  powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
930 
931     // Apply a constant pre-correction factor.
932     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
933 
934     // Reinterpret arg as integer to obtain logarithm.
935     // asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
936     result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
937 
938     // Multiply logarithm by power.
939     result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
940 
941     // Convert back to "integer" to exponentiate.
942     // asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
943     result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
944 
945     return result;
946 }
947 
948 inline static simd16scalar SIMDCALL pow512_4(simd16scalar const& arg)
949 {
950     // 5/12 is too small, so compute the 4th root of 20/12 instead.
951     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
952     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
953     simd16scalar xf    = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg);
954     simd16scalar xover = _simd16_mul_ps(arg, xf);
955 
956     simd16scalar xfm1   = _simd16_rsqrt_ps(xf);
957     simd16scalar x2     = _simd16_mul_ps(arg, arg);
958     simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
959 
960     // sqrt2 * over + 2 * sqrt2 * under
961     simd16scalar xavg =
962         _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
963                        _simd16_add_ps(xover, xunder));
964 
965     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
966     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
967 
968     return xavg;
969 }
970 
971 inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar& base, float exp)
972 {
973     const float* f = reinterpret_cast<const float*>(&base);
974 
975     return _simd16_set_ps(powf(f[15], exp),
976                           powf(f[14], exp),
977                           powf(f[13], exp),
978                           powf(f[12], exp),
979                           powf(f[11], exp),
980                           powf(f[10], exp),
981                           powf(f[9], exp),
982                           powf(f[8], exp),
983                           powf(f[7], exp),
984                           powf(f[6], exp),
985                           powf(f[5], exp),
986                           powf(f[4], exp),
987                           powf(f[3], exp),
988                           powf(f[2], exp),
989                           powf(f[1], exp),
990                           powf(f[0], exp));
991 }
992 
993 // float to SRGB conversion formula
994 //
995 // if (value < 0.0031308f)
996 //     value *= 12.92f;
997 // else
998 //     value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
999 //
1000 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value)
1001 {
1002     // create a mask where the source is < the minimal SRGB float value
1003     const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
1004 
1005     // if all elements are < the threshold, result = value * 12.92
1006     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
1007 
1008     if (_simd16_mask2int(mask) != 0xFFFF)
1009     {
1010         // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
1011 #if KNOB_USE_FAST_SRGB == TRUE
1012         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
1013         simd16scalar result2 = pow512_4(value);
1014 #else
1015         simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
1016 #endif
1017 
1018         result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
1019         result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
1020 
1021 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
1022         // only native AVX512 can directly use the computed mask for the blend operation
1023         result = _mm512_mask_blend_ps(mask, result2, result);
1024 #else
1025         result = _simd16_blendv_ps(
1026             result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
1027 #endif
1028     }
1029 
1030     return result;
1031 }
1032 
1033 //////////////////////////////////////////////////////////////////////////
1034 /// TypeTraits - Format type traits specialization for FLOAT16
1035 //////////////////////////////////////////////////////////////////////////
1036 template <>
1037 struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
1038 {
1039     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
1040     static float          toFloat() { return 1.0f; }
1041     static float          fromFloat() { return 1.0f; }
1042     static simdscalar     convertSrgb(simdscalar& in)
1043     {
1044         SWR_NOT_IMPL;
1045         return _simd_setzero_ps();
1046     }
1047 
1048     static simdscalar pack(const simdscalar& in)
1049     {
1050 #if KNOB_SIMD_WIDTH == 8
1051 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1052         // input is 8 packed float32, output is 8 packed float16
1053         simdscalari src = _simd_castps_si(in);
1054 
1055         static const uint32_t FLOAT_EXP_BITS      = 8;
1056         static const uint32_t FLOAT_MANTISSA_BITS = 23;
1057         static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
1058         static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
1059 
1060         static const uint32_t HALF_EXP_BITS      = 5;
1061         static const uint32_t HALF_MANTISSA_BITS = 10;
1062         static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
1063 
1064         // minimum exponent required, exponents below this are flushed to 0.
1065         static const int32_t HALF_EXP_MIN   = -14;
1066         static const int32_t FLOAT_EXP_BIAS = 127;
1067         static const int32_t FLOAT_EXP_MIN  = HALF_EXP_MIN + FLOAT_EXP_BIAS;
1068         static const int32_t FLOAT_EXP_MIN_FTZ =
1069             FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
1070 
1071         // maximum exponent required, exponents above this are set to infinity
1072         static const int32_t HALF_EXP_MAX  = 15;
1073         static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
1074 
1075         const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
1076         const simdscalari vExpMask  = _simd_set1_epi32(FLOAT_EXP_MASK);
1077         const simdscalari vManMask  = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
1078         const simdscalari vExpMin =
1079             _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
1080         const simdscalari vExpMinFtz =
1081             _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
1082         const simdscalari vExpMax =
1083             _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
1084 
1085         simdscalari vSign = _simd_and_si(src, vSignMask);
1086         simdscalari vExp  = _simd_and_si(src, vExpMask);
1087         simdscalari vMan  = _simd_and_si(src, vManMask);
1088 
1089         simdscalari vFTZMask    = _simd_cmplt_epi32(vExp, vExpMinFtz);
1090         simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
1091         simdscalari vInfMask    = _simd_cmpeq_epi32(vExpMask, vExp);
1092         simdscalari vClampMask  = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
1093 
1094         simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin),
1095                                                _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
1096 
1097         // pack output 16-bits into the lower 16-bits of each 32-bit channel
1098         simdscalari vDst =
1099             _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
1100         vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1101 
1102         // Flush To Zero
1103         vDst = _simd_andnot_si(vFTZMask, vDst);
1104         // Apply Infinites / NaN
1105         vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
1106 
1107         // Apply clamps
1108         vDst = _simd_andnot_si(vClampMask, vDst);
1109         vDst = _simd_or_si(vDst, _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
1110 
1111         // Compute Denormals (subnormals)
1112         if (!_mm256_testz_si256(vDenormMask, vDenormMask))
1113         {
1114             uint32_t* pDenormMask = (uint32_t*)&vDenormMask;
1115             uint32_t* pExp        = (uint32_t*)&vExp;
1116             uint32_t* pMan        = (uint32_t*)&vMan;
1117             uint32_t* pDst        = (uint32_t*)&vDst;
1118             for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
1119             {
1120                 if (pDenormMask[i])
1121                 {
1122                     // Need to compute subnormal value
1123                     uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
1124                     uint32_t mantissa =
1125                         pMan[i] | (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s.
1126                                                                // Make it explicit
1127 
1128                     pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) +
1129                                            (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
1130                 }
1131             }
1132         }
1133 
1134         // Add in sign bits
1135         vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
1136 
1137         // Pack to lower 128-bits
1138         vDst = _mm256_castsi128_si256(
1139             _mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
1140 
1141 #if 0
1142 #if !defined(NDEBUG)
1143         simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
1144 
1145         for (uint32_t i = 0; i < 4; ++i)
1146         {
1147             SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
1148         }
1149 #endif
1150 #endif
1151 
1152         return _simd_castsi_ps(vDst);
1153 
1154 #else
1155         return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
1156 #endif
1157 #else
1158 #error Unsupported vector width
1159 #endif
1160     }
1161 
1162     static simdscalar unpack(const simdscalar& in)
1163     {
1164         // input is 8 packed float16, output is 8 packed float32
1165         SWR_NOT_IMPL; // @todo
1166         return _simd_setzero_ps();
1167     }
1168 
1169     static simd16scalar pack(const simd16scalar& in)
1170     {
1171         simd16scalari result   = _simd16_setzero_si();
1172         simdscalari   resultlo = _simd_setzero_si();
1173 
1174 #if (KNOB_ARCH == KNOB_ARCH_AVX)
1175         simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
1176         simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
1177 
1178         __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
1179         __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);
1180 
1181 #else
1182         __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
1183         __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
1184 
1185 #endif
1186         resultlo = _simd_insertf128_si(resultlo, templo, 0);
1187         resultlo = _simd_insertf128_si(resultlo, temphi, 1);
1188 
1189         result = _simd16_insert_si(result, resultlo, 0);
1190 
1191         return _simd16_castsi_ps(result);
1192     }
1193 
1194     static simd16scalar unpack(const simd16scalar& in)
1195     {
1196         // input is 16 packed float16, output is 16 packed float32
1197         SWR_NOT_IMPL; //  @todo
1198         return _simd16_setzero_ps();
1199     }
1200 };
1201 
1202 //////////////////////////////////////////////////////////////////////////
1203 /// TypeTraits - Format type traits specialization for FLOAT32
1204 //////////////////////////////////////////////////////////////////////////
1205 template <>
1206 struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
1207 {
1208     static const SWR_TYPE    MyType = SWR_TYPE_FLOAT;
1209     static float             toFloat() { return 1.0f; }
1210     static float             fromFloat() { return 1.0f; }
1211     static inline simdscalar convertSrgb(simdscalar& in)
1212     {
1213 #if KNOB_SIMD_WIDTH == 8
1214         __m128 srcLo = _mm256_extractf128_ps(in, 0);
1215         __m128 srcHi = _mm256_extractf128_ps(in, 1);
1216 
1217         srcLo = ConvertFloatToSRGB2(srcLo);
1218         srcHi = ConvertFloatToSRGB2(srcHi);
1219 
1220         in = _mm256_insertf128_ps(in, srcLo, 0);
1221         in = _mm256_insertf128_ps(in, srcHi, 1);
1222 #else
1223 #error Unsupported vector width
1224 #endif
1225         return in;
1226     }
1227 
1228     static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); }
1229 };
1230 
1231 //////////////////////////////////////////////////////////////////////////
1232 /// FormatIntType - Calculate base integer type for pixel components based
1233 ///                 on total number of bits.  Components can be smaller
///                 than this type, but the entire pixel must not be
1235 ///                 any smaller than this type.
1236 //////////////////////////////////////////////////////////////////////////
/// Primary template: anything wider than 16 bits is stored in a uint32_t.
/// The two bool parameters are derived from 'bits' and select the
/// partial specializations below.
template <uint32_t bits, bool bits8 = (bits <= 8), bool bits16 = (bits <= 16)>
struct FormatIntType
{
    using TYPE = uint32_t;
};

/// Up to 8 bits: a single byte is enough.
template <uint32_t bits>
struct FormatIntType<bits, true, true>
{
    using TYPE = uint8_t;
};

/// 9 to 16 bits: stored in a uint16_t.
template <uint32_t bits>
struct FormatIntType<bits, false, true>
{
    using TYPE = uint16_t;
};
1254 
1255 //////////////////////////////////////////////////////////////////////////
1256 /// Format1 - Bitfield for single component formats.
1257 //////////////////////////////////////////////////////////////////////////
1258 template <uint32_t x>
1259 union Format1
1260 {
1261     typedef typename FormatIntType<x>::TYPE TYPE;
1262     struct
1263     {
1264         TYPE r : x;
1265     };
1266 
1267     ///@ The following are here to provide full template needed in Formats.
1268     struct
1269     {
1270         TYPE g : x;
1271     };
1272     struct
1273     {
1274         TYPE b : x;
1275     };
1276     struct
1277     {
1278         TYPE a : x;
1279     };
1280 };
1281 
1282 //////////////////////////////////////////////////////////////////////////
1283 /// Format2 - Bitfield for 2 component formats.
1284 //////////////////////////////////////////////////////////////////////////
1285 template <uint32_t x, uint32_t y>
1286 union Format2
1287 {
1288     typedef typename FormatIntType<x + y>::TYPE TYPE;
1289 
1290     struct
1291     {
1292         TYPE r : x;
1293         TYPE g : y;
1294     };
1295     struct
1296     {
1297         ///@ The following are here to provide full template needed in Formats.
1298         TYPE b : x;
1299         TYPE a : y;
1300     };
1301 };
1302 
1303 //////////////////////////////////////////////////////////////////////////
1304 /// Format3 - Bitfield for 3 component formats.
1305 //////////////////////////////////////////////////////////////////////////
1306 template <uint32_t x, uint32_t y, uint32_t z>
1307 union Format3
1308 {
1309     typedef typename FormatIntType<x + y + z>::TYPE TYPE;
1310 
1311     struct
1312     {
1313         TYPE r : x;
1314         TYPE g : y;
1315         TYPE b : z;
1316     };
1317     TYPE a; ///@note This is here to provide full template needed in Formats.
1318 };
1319 
1320 //////////////////////////////////////////////////////////////////////////
1321 /// Format4 - Bitfield for 4 component formats.
1322 //////////////////////////////////////////////////////////////////////////
1323 template <uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1324 struct Format4
1325 {
1326     typedef typename FormatIntType<x + y + z + w>::TYPE TYPE;
1327 
1328     TYPE r : x;
1329     TYPE g : y;
1330     TYPE b : z;
1331     TYPE a : w;
1332 };
1333 
1334 //////////////////////////////////////////////////////////////////////////
/// Defaults - Default component values
1336 //////////////////////////////////////////////////////////////////////////
1337 template <uint32_t x, uint32_t y, uint32_t z, uint32_t w>
1338 struct Defaults
1339 {
1340     INLINE static uint32_t GetDefault(uint32_t comp)
1341     {
1342         static const uint32_t defaults[4]{x, y, z, w};
1343         return defaults[comp];
1344     }
1345 };
1346 
1347 //////////////////////////////////////////////////////////////////////////
1348 /// ComponentTraits - Component type traits.
1349 //////////////////////////////////////////////////////////////////////////
1350 template <SWR_TYPE X,
1351           uint32_t NumBitsX,
1352           SWR_TYPE Y        = SWR_TYPE_UNKNOWN,
1353           uint32_t NumBitsY = 0,
1354           SWR_TYPE Z        = SWR_TYPE_UNKNOWN,
1355           uint32_t NumBitsZ = 0,
1356           SWR_TYPE W        = SWR_TYPE_UNKNOWN,
1357           uint32_t NumBitsW = 0>
1358 struct ComponentTraits
1359 {
1360     INLINE static SWR_TYPE GetType(uint32_t comp)
1361     {
1362         static const SWR_TYPE CompType[4]{X, Y, Z, W};
1363         return CompType[comp];
1364     }
1365 
1366     INLINE static constexpr uint32_t GetConstBPC(uint32_t comp)
1367     {
1368         return (comp == 3) ? NumBitsW
1369                            : ((comp == 2) ? NumBitsZ : ((comp == 1) ? NumBitsY : NumBitsX));
1370     }
1371 
1372     INLINE static uint32_t GetBPC(uint32_t comp)
1373     {
1374         static const uint32_t MyBpc[4]{NumBitsX, NumBitsY, NumBitsZ, NumBitsW};
1375         return MyBpc[comp];
1376     }
1377 
1378     INLINE static bool isNormalized(uint32_t comp)
1379     {
1380         switch (comp)
1381         {
1382         case 0:
1383             return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
1384         case 1:
1385             return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
1386         case 2:
1387             return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
1388         case 3:
1389             return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
1390         }
1391         SWR_INVALID("Invalid component: %d", comp);
1392         return false;
1393     }
1394 
1395     INLINE static float toFloat(uint32_t comp)
1396     {
1397         switch (comp)
1398         {
1399         case 0:
1400             return TypeTraits<X, NumBitsX>::toFloat();
1401         case 1:
1402             return TypeTraits<Y, NumBitsY>::toFloat();
1403         case 2:
1404             return TypeTraits<Z, NumBitsZ>::toFloat();
1405         case 3:
1406             return TypeTraits<W, NumBitsW>::toFloat();
1407         }
1408         SWR_INVALID("Invalid component: %d", comp);
1409         return TypeTraits<X, NumBitsX>::toFloat();
1410     }
1411 
1412     INLINE static float fromFloat(uint32_t comp)
1413     {
1414         switch (comp)
1415         {
1416         case 0:
1417             return TypeTraits<X, NumBitsX>::fromFloat();
1418         case 1:
1419             return TypeTraits<Y, NumBitsY>::fromFloat();
1420         case 2:
1421             return TypeTraits<Z, NumBitsZ>::fromFloat();
1422         case 3:
1423             return TypeTraits<W, NumBitsW>::fromFloat();
1424         }
1425         SWR_INVALID("Invalid component: %d", comp);
1426         return TypeTraits<X, NumBitsX>::fromFloat();
1427     }
1428 
1429     INLINE static void loadSOA(uint32_t comp, const uint8_t* pSrc, simdscalar& dst)
1430     {
1431         switch (comp)
1432         {
1433         case 0:
1434             dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1435             return;
1436         case 1:
1437             dst = TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
1438             return;
1439         case 2:
1440             dst = TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
1441             return;
1442         case 3:
1443             dst = TypeTraits<W, NumBitsW>::loadSOA(pSrc);
1444             return;
1445         }
1446         SWR_INVALID("Invalid component: %d", comp);
1447         dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc);
1448     }
1449 
1450     INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src)
1451     {
1452         switch (comp)
1453         {
1454         case 0:
1455             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1456             return;
1457         case 1:
1458             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1459             return;
1460         case 2:
1461             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1462             return;
1463         case 3:
1464             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1465             return;
1466         }
1467         SWR_INVALID("Invalid component: %d", comp);
1468     }
1469 
1470     INLINE static simdscalar unpack(uint32_t comp, simdscalar& in)
1471     {
1472         simdscalar out;
1473         switch (comp)
1474         {
1475         case 0:
1476             out = TypeTraits<X, NumBitsX>::unpack(in);
1477             break;
1478         case 1:
1479             out = TypeTraits<Y, NumBitsY>::unpack(in);
1480             break;
1481         case 2:
1482             out = TypeTraits<Z, NumBitsZ>::unpack(in);
1483             break;
1484         case 3:
1485             out = TypeTraits<W, NumBitsW>::unpack(in);
1486             break;
1487         default:
1488             SWR_INVALID("Invalid component: %d", comp);
1489             out = in;
1490             break;
1491         }
1492         return out;
1493     }
1494 
1495     INLINE static simdscalar pack(uint32_t comp, simdscalar& in)
1496     {
1497         simdscalar out;
1498         switch (comp)
1499         {
1500         case 0:
1501             out = TypeTraits<X, NumBitsX>::pack(in);
1502             break;
1503         case 1:
1504             out = TypeTraits<Y, NumBitsY>::pack(in);
1505             break;
1506         case 2:
1507             out = TypeTraits<Z, NumBitsZ>::pack(in);
1508             break;
1509         case 3:
1510             out = TypeTraits<W, NumBitsW>::pack(in);
1511             break;
1512         default:
1513             SWR_INVALID("Invalid component: %d", comp);
1514             out = in;
1515             break;
1516         }
1517         return out;
1518     }
1519 
1520     INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar& in)
1521     {
1522         switch (comp)
1523         {
1524         case 0:
1525             return TypeTraits<X, NumBitsX>::convertSrgb(in);
1526         case 1:
1527             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1528         case 2:
1529             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1530         case 3:
1531             return TypeTraits<W, NumBitsW>::convertSrgb(in);
1532         }
1533         SWR_INVALID("Invalid component: %d", comp);
1534         return TypeTraits<X, NumBitsX>::convertSrgb(in);
1535     }
1536 
1537     INLINE static void SIMDCALL loadSOA(uint32_t comp, const uint8_t* pSrc, simd16scalar& dst)
1538     {
1539         switch (comp)
1540         {
1541         case 0:
1542             dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1543             return;
1544         case 1:
1545             dst = TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
1546             return;
1547         case 2:
1548             dst = TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
1549             return;
1550         case 3:
1551             dst = TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
1552             return;
1553         }
1554         SWR_INVALID("Invalid component: %d", comp);
1555         dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
1556     }
1557 
1558     INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src)
1559     {
1560         switch (comp)
1561         {
1562         case 0:
1563             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1564             return;
1565         case 1:
1566             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
1567             return;
1568         case 2:
1569             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
1570             return;
1571         case 3:
1572             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
1573             return;
1574         }
1575         SWR_INVALID("Invalid component: %d", comp);
1576         TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
1577     }
1578 
1579     INLINE static simd16scalar unpack(uint32_t comp, simd16scalar& in)
1580     {
1581         switch (comp)
1582         {
1583         case 0:
1584             return TypeTraits<X, NumBitsX>::unpack(in);
1585         case 1:
1586             return TypeTraits<Y, NumBitsY>::unpack(in);
1587         case 2:
1588             return TypeTraits<Z, NumBitsZ>::unpack(in);
1589         case 3:
1590             return TypeTraits<W, NumBitsW>::unpack(in);
1591         }
1592         SWR_INVALID("Invalid component: %d", comp);
1593         return TypeTraits<X, NumBitsX>::unpack(in);
1594     }
1595 
1596     INLINE static simd16scalar pack(uint32_t comp, simd16scalar& in)
1597     {
1598         switch (comp)
1599         {
1600         case 0:
1601             return TypeTraits<X, NumBitsX>::pack(in);
1602         case 1:
1603             return TypeTraits<Y, NumBitsY>::pack(in);
1604         case 2:
1605             return TypeTraits<Z, NumBitsZ>::pack(in);
1606         case 3:
1607             return TypeTraits<W, NumBitsW>::pack(in);
1608         }
1609         SWR_INVALID("Invalid component: %d", comp);
1610         return TypeTraits<X, NumBitsX>::pack(in);
1611     }
1612 
1613     INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar& in)
1614     {
1615         switch (comp)
1616         {
1617         case 0:
1618             return TypeTraits<X, NumBitsX>::convertSrgb(in);
1619         case 1:
1620             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
1621         case 2:
1622             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
1623         case 3:
1624             return TypeTraits<W, NumBitsW>::convertSrgb(in);
1625         }
1626         SWR_INVALID("Invalid component: %d", comp);
1627         return TypeTraits<X, NumBitsX>::convertSrgb(in);
1628     }
1629 };
1630