1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file utils.h
24 *
25 * @brief Utilities used by SWR core related to pixel formats.
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include "core/utils.h"
31 #include "common/simdintrin.h"
32 
33 INLINE
vTranspose(simd4scalar & row0,simd4scalar & row1,simd4scalar & row2,simd4scalar & row3)34 void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
35 {
36     simd4scalari row0i = SIMD128::castps_si(row0);
37     simd4scalari row1i = SIMD128::castps_si(row1);
38     simd4scalari row2i = SIMD128::castps_si(row2);
39     simd4scalari row3i = SIMD128::castps_si(row3);
40 
41     simd4scalari vTemp = row2i;
42     row2i = SIMD128::unpacklo_epi32(row2i, row3i);
43     vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
44 
45     row3i = row0i;
46     row0i = SIMD128::unpacklo_epi32(row0i, row1i);
47     row3i = SIMD128::unpackhi_epi32(row3i, row1i);
48 
49     row1i = row0i;
50     row0i = SIMD128::unpacklo_epi64(row0i, row2i);
51     row1i = SIMD128::unpackhi_epi64(row1i, row2i);
52 
53     row2i = row3i;
54     row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
55     row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
56 
57     row0 = SIMD128::castsi_ps(row0i);
58     row1 = SIMD128::castsi_ps(row1i);
59     row2 = SIMD128::castsi_ps(row2i);
60     row3 = SIMD128::castsi_ps(row3i);
61 }
62 
63 INLINE
vTranspose(simd4scalari & row0,simd4scalari & row1,simd4scalari & row2,simd4scalari & row3)64 void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
65 {
66     simd4scalari vTemp = row2;
67     row2 = SIMD128::unpacklo_epi32(row2, row3);
68     vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
69 
70     row3 = row0;
71     row0 = SIMD128::unpacklo_epi32(row0, row1);
72     row3 = SIMD128::unpackhi_epi32(row3, row1);
73 
74     row1 = row0;
75     row0 = SIMD128::unpacklo_epi64(row0, row2);
76     row1 = SIMD128::unpackhi_epi64(row1, row2);
77 
78     row2 = row3;
79     row2 = SIMD128::unpacklo_epi64(row2, vTemp);
80     row3 = SIMD128::unpackhi_epi64(row3, vTemp);
81 }
82 
83 #if KNOB_SIMD_WIDTH == 8
84 INLINE
vTranspose3x8(simd4scalar (& vDst)[8],const simdscalar & vSrc0,const simdscalar & vSrc1,const simdscalar & vSrc2)85 void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
86 {
87     simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
88     simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps());     //y0w0y1w1 y4w4y5w5
89     simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
90     simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5
91 
92     r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                             //x2z2x3z3 x6z6x7z7
93     r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps());                //y2w2y3w3 y6w6yw77
94     simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
95     simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7
96 
97     vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
98     vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
99     vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
100     vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
101 
102     vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
103     vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
104     vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
105     vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
106 }
107 
108 INLINE
vTranspose4x8(simd4scalar (& vDst)[8],const simdscalar & vSrc0,const simdscalar & vSrc1,const simdscalar & vSrc2,const simdscalar & vSrc3)109 void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
110 {
111     simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
112     simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
113     simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
114     simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5
115 
116     r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                 //x2z2x3z3 x6z6x7z7
117     r1rx = _simd_unpackhi_ps(vSrc1, vSrc3);                 //y2w2y3w3 y6w6yw77
118     simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
119     simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7
120 
121     vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
122     vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
123     vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
124     vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
125 
126     vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
127     vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
128     vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
129     vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
130 }
131 
132 #if ENABLE_AVX512_SIMD16
133 INLINE
vTranspose4x16(simd16scalar (& dst)[4],const simd16scalar & src0,const simd16scalar & src1,const simd16scalar & src2,const simd16scalar & src3)134 void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
135 {
136     const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
137 
138     simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
139     simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
140     simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
141     simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
142 
143     simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
144     simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
145     simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
146     simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
147 
148     dst[0] = _simd16_unpacklo_ps(rblo, galo);
149     dst[1] = _simd16_unpackhi_ps(rblo, galo);
150     dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
151     dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
152 }
153 
154 #endif
155 INLINE
vTranspose8x8(simdscalar (& vDst)[8],const simdscalar & vMask0,const simdscalar & vMask1,const simdscalar & vMask2,const simdscalar & vMask3,const simdscalar & vMask4,const simdscalar & vMask5,const simdscalar & vMask6,const simdscalar & vMask7)156 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
157 {
158     simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
159     simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
160     simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
161     simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
162     simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
163     simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
164     simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
165     simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
166     simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
167     simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
168     simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
169     simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
170     simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
171     simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
172     simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
173     simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
174     vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
175     vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
176     vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
177     vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
178     vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
179     vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
180     vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
181     vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
182 }
183 
184 INLINE
vTranspose8x8(simdscalar (& vDst)[8],const simdscalari & vMask0,const simdscalari & vMask1,const simdscalari & vMask2,const simdscalari & vMask3,const simdscalari & vMask4,const simdscalari & vMask5,const simdscalari & vMask6,const simdscalari & vMask7)185 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
186 {
187     vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3),
188         _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
189 }
190 #endif
191 
192 //////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
194 //////////////////////////////////////////////////////////////////////////
195 template<uint32_t bpp>
196 struct TransposeSingleComponent
197 {
198     //////////////////////////////////////////////////////////////////////////
199     /// @brief Pass-thru for single component.
200     /// @param pSrc - source data in SOA form
201     /// @param pDst - output data in AOS form
TransposeTransposeSingleComponent202     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
203     {
204         memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
205     }
206 #if ENABLE_AVX512_SIMD16
207 
Transpose_16TransposeSingleComponent208     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
209     {
210         memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
211     }
212 #endif
213 };
214 
215 //////////////////////////////////////////////////////////////////////////
216 /// Transpose8_8_8_8
217 //////////////////////////////////////////////////////////////////////////
218 struct Transpose8_8_8_8
219 {
220     //////////////////////////////////////////////////////////////////////////
221     /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
222     /// @param pSrc - source data in SOA form
223     /// @param pDst - output data in AOS form
TransposeTranspose8_8_8_8224     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
225     {
226         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
227 
228 #if KNOB_SIMD_WIDTH == 8
229 #if KNOB_ARCH <= KNOB_ARCH_AVX
230         simd4scalari c0c1 = src.v4[0];                                                          // rrrrrrrrgggggggg
231         simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1));  // bbbbbbbbaaaaaaaa
232         simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
233         simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
234         simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
235         simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
236         simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
237         simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
238         SIMD128::store_si((simd4scalari*)pDst, c0123lo);
239         SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
240 #else
241         simdscalari dst01 = _simd_shuffle_epi8(src,
242             _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
243         simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
244         dst23 = _simd_shuffle_epi8(dst23,
245             _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
246         simdscalari dst = _simd_or_si(dst01, dst23);
247         _simd_store_si((simdscalari*)pDst, dst);
248 #endif
249 #else
250 #error Unsupported vector width
251 #endif
252     }
253 #if ENABLE_AVX512_SIMD16
254 
Transpose_16Transpose8_8_8_8255     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
256     {
257         simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
258         simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
259         simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
260         simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
261 
262         simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
263         simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
264         simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
265         simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
266 
267         simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
268         simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
269         simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
270 
271         simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
272 
273         _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
274     }
275 #endif
276 };
277 
278 //////////////////////////////////////////////////////////////////////////
279 /// Transpose8_8_8
280 //////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 8_8_8 data. Declared as
    ///        deleted: no implementation is provided, so any call to it is
    ///        rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// 16-wide variant; likewise deleted.
    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
293 
294 //////////////////////////////////////////////////////////////////////////
295 /// Transpose8_8
296 //////////////////////////////////////////////////////////////////////////
297 struct Transpose8_8
298 {
299     //////////////////////////////////////////////////////////////////////////
300     /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
301     /// @param pSrc - source data in SOA form
302     /// @param pDst - output data in AOS form
TransposeTranspose8_8303     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
304     {
305 #if KNOB_SIMD_WIDTH == 8
306         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
307 
308         simd4scalari rg = src.v4[0];           // rrrrrrrr gggggggg
309         simd4scalari g = SIMD128::unpackhi_epi64(rg, rg);             // gggggggg gggggggg
310         rg = SIMD128::unpacklo_epi8(rg, g);
311         SIMD128::store_si((simd4scalari*)pDst, rg);
312 #else
313 #error Unsupported vector width
314 #endif
315     }
316 #if ENABLE_AVX512_SIMD16
317 
Transpose_16Transpose8_8318     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
319     {
320         simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
321         simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
322 
323         simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
324         simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
325 
326         simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
327 
328         simdscalari dst = _simd_or_si(cvt0, shl1);
329 
330         _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
331     }
332 #endif
333 };
334 
335 //////////////////////////////////////////////////////////////////////////
336 /// Transpose32_32_32_32
337 //////////////////////////////////////////////////////////////////////////
338 struct Transpose32_32_32_32
339 {
340     //////////////////////////////////////////////////////////////////////////
341     /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
342     /// @param pSrc - source data in SOA form
343     /// @param pDst - output data in AOS form
TransposeTranspose32_32_32_32344     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
345     {
346 #if KNOB_SIMD_WIDTH == 8
347         simdscalar src0 = _simd_load_ps((const float*)pSrc);
348         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
349         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
350         simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
351 
352         simd4scalar vDst[8];
353         vTranspose4x8(vDst, src0, src1, src2, src3);
354         SIMD128::store_ps((float*)pDst, vDst[0]);
355         SIMD128::store_ps((float*)pDst+4, vDst[1]);
356         SIMD128::store_ps((float*)pDst+8, vDst[2]);
357         SIMD128::store_ps((float*)pDst+12, vDst[3]);
358         SIMD128::store_ps((float*)pDst+16, vDst[4]);
359         SIMD128::store_ps((float*)pDst+20, vDst[5]);
360         SIMD128::store_ps((float*)pDst+24, vDst[6]);
361         SIMD128::store_ps((float*)pDst+28, vDst[7]);
362 #else
363 #error Unsupported vector width
364 #endif
365     }
366 #if ENABLE_AVX512_SIMD16
367 
Transpose_16Transpose32_32_32_32368     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
369     {
370         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
371         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
372         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
373         simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
374 
375         simd16scalar dst[4];
376 
377         vTranspose4x16(dst, src0, src1, src2, src3);
378 
379         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
380         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
381         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
382         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
383     }
384 #endif
385 };
386 
387 //////////////////////////////////////////////////////////////////////////
388 /// Transpose32_32_32
389 //////////////////////////////////////////////////////////////////////////
390 struct Transpose32_32_32
391 {
392     //////////////////////////////////////////////////////////////////////////
393     /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
394     /// @param pSrc - source data in SOA form
395     /// @param pDst - output data in AOS form
TransposeTranspose32_32_32396     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
397     {
398 #if KNOB_SIMD_WIDTH == 8
399         simdscalar src0 = _simd_load_ps((const float*)pSrc);
400         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
401         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
402 
403         simd4scalar vDst[8];
404         vTranspose3x8(vDst, src0, src1, src2);
405         SIMD128::store_ps((float*)pDst, vDst[0]);
406         SIMD128::store_ps((float*)pDst + 4, vDst[1]);
407         SIMD128::store_ps((float*)pDst + 8, vDst[2]);
408         SIMD128::store_ps((float*)pDst + 12, vDst[3]);
409         SIMD128::store_ps((float*)pDst + 16, vDst[4]);
410         SIMD128::store_ps((float*)pDst + 20, vDst[5]);
411         SIMD128::store_ps((float*)pDst + 24, vDst[6]);
412         SIMD128::store_ps((float*)pDst + 28, vDst[7]);
413 #else
414 #error Unsupported vector width
415 #endif
416     }
417 #if ENABLE_AVX512_SIMD16
418 
Transpose_16Transpose32_32_32419     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
420     {
421         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
422         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
423         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
424         simd16scalar src3 = _simd16_setzero_ps();
425 
426         simd16scalar dst[4];
427 
428         vTranspose4x16(dst, src0, src1, src2, src3);
429 
430         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
431         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
432         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
433         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
434     }
435 #endif
436 };
437 
438 //////////////////////////////////////////////////////////////////////////
439 /// Transpose32_32
440 //////////////////////////////////////////////////////////////////////////
441 struct Transpose32_32
442 {
443     //////////////////////////////////////////////////////////////////////////
444     /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
445     /// @param pSrc - source data in SOA form
446     /// @param pDst - output data in AOS form
TransposeTranspose32_32447     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
448     {
449 #if KNOB_SIMD_WIDTH == 8
450         const float* pfSrc = (const float*)pSrc;
451         simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
452         simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
453         simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
454         simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
455 
456         simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
457         simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
458         simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
459         simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
460 
461         float* pfDst = (float*)pDst;
462         SIMD128::store_ps(pfDst + 0, dst0);
463         SIMD128::store_ps(pfDst + 4, dst1);
464         SIMD128::store_ps(pfDst + 8, dst2);
465         SIMD128::store_ps(pfDst + 12, dst3);
466 #else
467 #error Unsupported vector width
468 #endif
469     }
470 #if ENABLE_AVX512_SIMD16
471 
Transpose_16Transpose32_32472     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
473     {
474         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
475         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg
476 
477         simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
478         simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
479 
480         simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
481         simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
482 
483         simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
484         simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
485 
486         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
487         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
488     }
489 #endif
490 };
491 
492 //////////////////////////////////////////////////////////////////////////
493 /// Transpose16_16_16_16
494 //////////////////////////////////////////////////////////////////////////
495 struct Transpose16_16_16_16
496 {
497     //////////////////////////////////////////////////////////////////////////
498     /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
499     /// @param pSrc - source data in SOA form
500     /// @param pDst - output data in AOS form
TransposeTranspose16_16_16_16501     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
502     {
503 #if KNOB_SIMD_WIDTH == 8
504         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
505         simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
506 
507         simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
508         simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
509         simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
510         simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
511 
512         simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
513         simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
514         simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
515         simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
516 
517         simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
518         simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
519         simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
520         simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
521 
522         SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
523         SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
524         SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
525         SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
526 #else
527 #error Unsupported vector width
528 #endif
529     }
530 #if ENABLE_AVX512_SIMD16
531 
Transpose_16Transpose16_16_16_16532     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
533     {
534         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
535         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
536         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
537         simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa
538 
539         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
540         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
541         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
542         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
543 
544         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
545         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
546         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
547         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
548 
549         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
550         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
551         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
552         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
553 
554         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
555         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
556         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
557         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
558     }
559 #endif
560 };
561 
562 //////////////////////////////////////////////////////////////////////////
563 /// Transpose16_16_16
564 //////////////////////////////////////////////////////////////////////////
565 struct Transpose16_16_16
566 {
567     //////////////////////////////////////////////////////////////////////////
568     /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
569     /// @param pSrc - source data in SOA form
570     /// @param pDst - output data in AOS form
TransposeTranspose16_16_16571     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
572     {
573 #if KNOB_SIMD_WIDTH == 8
574         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
575 
576         simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
577         simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
578         simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
579         simd4scalari src_a = SIMD128::setzero_si();
580 
581         simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
582         simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
583         simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
584         simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
585 
586         simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
587         simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
588         simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
589         simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
590 
591         SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
592         SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
593         SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
594         SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
595 #else
596 #error Unsupported vector width
597 #endif
598     }
599 #if ENABLE_AVX512_SIMD16
600 
Transpose_16Transpose16_16_16601     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
602     {
603         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
604         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
605         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
606         simdscalari src3 = _simd_setzero_si();                                                      // aaaaaaaaaaaaaaaa
607 
608         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
609         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
610         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
611         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
612 
613         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
614         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
615         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
616         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
617 
618         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
619         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
620         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
621         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
622 
623         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
624         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
625         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
626         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
627     }
628 #endif
629 };
630 
631 //////////////////////////////////////////////////////////////////////////
632 /// Transpose16_16
633 //////////////////////////////////////////////////////////////////////////
634 struct Transpose16_16
635 {
636     //////////////////////////////////////////////////////////////////////////
637     /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
638     /// @param pSrc - source data in SOA form
639     /// @param pDst - output data in AOS form
TransposeTranspose16_16640     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
641     {
642 #if KNOB_SIMD_WIDTH == 8
643         simdscalar src = _simd_load_ps((const float*)pSrc);
644 
645         simd4scalar comp0 = _simd_extractf128_ps(src, 0);
646         simd4scalar comp1 = _simd_extractf128_ps(src, 1);
647 
648         simd4scalari comp0i = SIMD128::castps_si(comp0);
649         simd4scalari comp1i = SIMD128::castps_si(comp1);
650 
651         simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
652         simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
653 
654         SIMD128::store_si((simd4scalari*)pDst, resLo);
655         SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
656 #else
657 #error Unsupported vector width
658 #endif
659     }
660 #if ENABLE_AVX512_SIMD16
661 
Transpose_16Transpose16_16662     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
663     {
664         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
665         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
666 
667         simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
668         simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
669 
670         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
671         simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
672 
673         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
674         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
675     }
676 #endif
677 };
678 
679 //////////////////////////////////////////////////////////////////////////
680 /// Transpose24_8
681 //////////////////////////////////////////////////////////////////////////
struct Transpose24_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 24_8 data. Declared as deleted:
    ///        no implementation is provided, so any call is rejected at
    ///        compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
694 
695 //////////////////////////////////////////////////////////////////////////
696 /// Transpose32_8_24
697 //////////////////////////////////////////////////////////////////////////
struct Transpose32_8_24
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 32_8_24 data. Declared as
    ///        deleted: no implementation is provided, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
710 
711 //////////////////////////////////////////////////////////////////////////
712 /// Transpose4_4_4_4
713 //////////////////////////////////////////////////////////////////////////
struct Transpose4_4_4_4
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 4_4_4_4 data. Declared as
    ///        deleted: no implementation is provided, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
726 
727 //////////////////////////////////////////////////////////////////////////
728 /// Transpose5_6_5
729 //////////////////////////////////////////////////////////////////////////
struct Transpose5_6_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_6_5 data. Declared as deleted:
    ///        no implementation is provided, so any call is rejected at
    ///        compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
742 
743 //////////////////////////////////////////////////////////////////////////
744 /// Transpose9_9_9_5
745 //////////////////////////////////////////////////////////////////////////
struct Transpose9_9_9_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 9_9_9_5 data. Declared as
    ///        deleted: no implementation is provided, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
758 
759 //////////////////////////////////////////////////////////////////////////
760 /// Transpose5_5_5_1
761 //////////////////////////////////////////////////////////////////////////
struct Transpose5_5_5_1
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 5_5_5_1 data. Declared as
    ///        deleted: no implementation is provided, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
774 
775 //////////////////////////////////////////////////////////////////////////
776 /// Transpose1_5_5_5
777 //////////////////////////////////////////////////////////////////////////
struct Transpose1_5_5_5
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 1_5_5_5 data. Declared as
    ///        deleted: no implementation is provided, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted. Present so this
    ///        struct exposes the same pair of entry points as the other
    ///        packed-format transpose structs.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
786 
787 //////////////////////////////////////////////////////////////////////////
788 /// Transpose10_10_10_2
789 //////////////////////////////////////////////////////////////////////////
struct Transpose10_10_10_2
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 10_10_10_2 data. Declared as
    ///        deleted: no implementation is provided, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
802 
803 //////////////////////////////////////////////////////////////////////////
804 /// Transpose11_11_10
805 //////////////////////////////////////////////////////////////////////////
struct Transpose11_11_10
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for packed 11_11_10 data. Declared as
    ///        deleted: no implementation is provided, so any call is rejected
    ///        at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
818 
819 //////////////////////////////////////////////////////////////////////////
820 /// Transpose64
821 //////////////////////////////////////////////////////////////////////////
struct Transpose64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for the format this struct is named
    ///        after. Declared as deleted: no implementation is provided, so
    ///        any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
834 
835 //////////////////////////////////////////////////////////////////////////
836 /// Transpose64_64
837 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for the format this struct is named
    ///        after. Declared as deleted: no implementation is provided, so
    ///        any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
850 
851 //////////////////////////////////////////////////////////////////////////
852 /// Transpose64_64_64
853 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for the format this struct is named
    ///        after. Declared as deleted: no implementation is provided, so
    ///        any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
866 
867 //////////////////////////////////////////////////////////////////////////
868 /// Transpose64_64_64_64
869 //////////////////////////////////////////////////////////////////////////
struct Transpose64_64_64_64
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief SOA to AOS conversion for the format this struct is named
    ///        after. Declared as deleted: no implementation is provided, so
    ///        any call is rejected at compile time.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;

#if ENABLE_AVX512_SIMD16
    /// @brief Transpose_16 counterpart, declared only when
    ///        ENABLE_AVX512_SIMD16 is set; likewise deleted.
    static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
882 
883