1 /****************************************************************************
2 * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file StoreTile.h
24 *
25 * @brief Functionality for Store.
26 *
27 ******************************************************************************/
28 #pragma once
29 
30 #include "common/os.h"
31 #include "common/formats.h"
32 #include "core/context.h"
33 #include "core/rdtsc_core.h"
34 #include "core/format_conversion.h"
35 
36 #include "memory/TilingFunctions.h"
37 #include "memory/Convert.h"
38 #include "memory/SurfaceState.h"
39 #include "core/multisample.h"
40 
41 #include <array>
42 #include <sstream>
43 
44 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
45 
46 // Function pointer to different storing functions for color, depth, and stencil based on incoming formats.
47 typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t);
48 
49 //////////////////////////////////////////////////////////////////////////
50 /// Store Raster Tile Function Tables.
51 //////////////////////////////////////////////////////////////////////////
52 extern PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
53 extern PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
54 extern PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS];
55 
56 void InitStoreTilesTable_Linear_1();
57 void InitStoreTilesTable_Linear_2();
58 void InitStoreTilesTable_TileX_1();
59 void InitStoreTilesTable_TileX_2();
60 void InitStoreTilesTable_TileY_1();
61 void InitStoreTilesTable_TileY_2();
62 void InitStoreTilesTable_TileW();
63 void InitStoreTilesTable();
64 
//////////////////////////////////////////////////////////////////////////
/// StorePixels
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
///        This is the primary template: its Store() is deleted, so only
///        the explicit (PixelSize, NumDests) specializations can be used.
/// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
/// @param ppDsts   - Array of destination pointers.  Each pointer is
///                   to a single row of at most 16B.
/// @tparam PixelSize - Size of one pixel in bits (callers in this file
///                     pass FormatTraits<Format>::bpp).
/// @tparam NumDests - Number of destination pointers.  Each pair of
///                    pointers is for a 16-byte column of two rows.
//////////////////////////////////////////////////////////////////////////
template <size_t PixelSize, size_t NumDests>
struct StorePixels
{
    // Deleted on purpose: an unsupported combination fails at compile time.
    static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete;
};
79 
80 //////////////////////////////////////////////////////////////////////////
81 /// StorePixels (32-bit pixel specialization)
82 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
83 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
84 /// @param ppDsts   - Array of destination pointers.  Each pointer is
85 ///                   to a single row of at most 16B.
86 /// @tparam NumDests - Number of destination pointers.  Each pair of
87 ///                    pointers is for a 16-byte column of two rows.
88 //////////////////////////////////////////////////////////////////////////
89 template <>
90 struct StorePixels<8, 2>
91 {
92     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
93     {
94         // Each 4-pixel row is 4 bytes.
95         const uint16_t* pPixSrc = (const uint16_t*)pSrc;
96 
97         // Unswizzle from SWR-Z order
98         uint16_t* pRow = (uint16_t*)ppDsts[0];
99         pRow[0] = pPixSrc[0];
100         pRow[1] = pPixSrc[2];
101 
102         pRow = (uint16_t*)ppDsts[1];
103         pRow[0] = pPixSrc[1];
104         pRow[1] = pPixSrc[3];
105     }
106 };
107 
108 template <>
109 struct StorePixels<8, 4>
110 {
111     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
112     {
113         // 8 x 2 bytes = 16 bytes, 16 pixels
114         const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
115 
116         uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
117 
118         // Unswizzle from SWR-Z order
119         ppDsts16[0][0] = pSrc16[0];     // 0 1
120         ppDsts16[0][1] = pSrc16[2];     // 4 5
121 
122         ppDsts16[1][0] = pSrc16[1];     // 2 3
123         ppDsts16[1][1] = pSrc16[3];     // 6 7
124 
125         ppDsts16[2][0] = pSrc16[4];     // 8 9
126         ppDsts16[2][1] = pSrc16[6];     // C D
127 
128         ppDsts16[3][0] = pSrc16[5];     // A B
129         ppDsts16[3][1] = pSrc16[7];     // E F
130     }
131 };
132 
133 //////////////////////////////////////////////////////////////////////////
134 /// StorePixels (32-bit pixel specialization)
135 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
136 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
137 /// @param ppDsts   - Array of destination pointers.  Each pointer is
138 ///                   to a single row of at most 16B.
139 /// @tparam NumDests - Number of destination pointers.  Each pair of
140 ///                    pointers is for a 16-byte column of two rows.
141 //////////////////////////////////////////////////////////////////////////
142 template <>
143 struct StorePixels<16, 2>
144 {
145     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
146     {
147         // Each 4-pixel row is 8 bytes.
148         const uint32_t* pPixSrc = (const uint32_t*)pSrc;
149 
150         // Unswizzle from SWR-Z order
151         uint32_t* pRow = (uint32_t*)ppDsts[0];
152         pRow[0] = pPixSrc[0];
153         pRow[1] = pPixSrc[2];
154 
155         pRow = (uint32_t*)ppDsts[1];
156         pRow[0] = pPixSrc[1];
157         pRow[1] = pPixSrc[3];
158     }
159 };
160 
161 template <>
162 struct StorePixels<16, 4>
163 {
164     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
165     {
166         // 8 x 4 bytes = 32 bytes, 16 pixels
167         const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
168 
169         uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
170 
171         // Unswizzle from SWR-Z order
172         ppDsts32[0][0] = pSrc32[0];     // 0 1
173         ppDsts32[0][1] = pSrc32[2];     // 4 5
174 
175         ppDsts32[1][0] = pSrc32[1];     // 2 3
176         ppDsts32[1][1] = pSrc32[3];     // 6 7
177 
178         ppDsts32[2][0] = pSrc32[4];     // 8 9
179         ppDsts32[2][1] = pSrc32[6];     // C D
180 
181         ppDsts32[3][0] = pSrc32[5];     // A B
182         ppDsts32[3][1] = pSrc32[7];     // E F
183     }
184 };
185 
186 //////////////////////////////////////////////////////////////////////////
187 /// StorePixels (32-bit pixel specialization)
188 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
189 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
190 /// @param ppDsts   - Array of destination pointers.  Each pointer is
191 ///                   to a single row of at most 16B.
192 /// @tparam NumDests - Number of destination pointers.  Each pair of
193 ///                    pointers is for a 16-byte column of two rows.
194 //////////////////////////////////////////////////////////////////////////
195 template <>
196 struct StorePixels<32, 2>
197 {
198     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
199     {
200         // Each 4-pixel row is 16-bytes
201         simd4scalari *pZRow01 = (simd4scalari*)pSrc;
202         simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
203         simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
204 
205         simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
206         simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
207 
208         SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
209         SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
210     }
211 };
212 
213 template <>
214 struct StorePixels<32, 4>
215 {
216     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
217     {
218         // 4 x 16 bytes = 64 bytes, 16 pixels
219         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
220 
221         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
222 
223         // Unswizzle from SWR-Z order
224         simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]);                        // 0 1 2 3
225         simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]);                        // 4 5 6 7
226         simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]);                        // 8 9 A B
227         simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]);                        // C D E F
228 
229         SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1));   // 0 1 4 5
230         SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1));   // 2 3 6 7
231         SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3));   // 8 9 C D
232         SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3));   // A B E F
233     }
234 };
235 
236 //////////////////////////////////////////////////////////////////////////
237 /// StorePixels (32-bit pixel specialization)
238 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
239 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
240 /// @param ppDsts   - Array of destination pointers.  Each pointer is
241 ///                   to a single row of at most 16B.
242 /// @tparam NumDests - Number of destination pointers.  Each pair of
243 ///                    pointers is for a 16-byte column of two rows.
244 //////////////////////////////////////////////////////////////////////////
245 template <>
246 struct StorePixels<64, 4>
247 {
248     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
249     {
250         // Each 4-pixel row is 32 bytes.
251         const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
252 
253         // order of pointers match SWR-Z layout
254         simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
255         *pvDsts[0] = pPixSrc[0];
256         *pvDsts[1] = pPixSrc[1];
257         *pvDsts[2] = pPixSrc[2];
258         *pvDsts[3] = pPixSrc[3];
259     }
260 };
261 
262 template <>
263 struct StorePixels<64, 8>
264 {
265     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
266     {
267         // 8 x 16 bytes = 128 bytes, 16 pixels
268         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
269 
270         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
271 
272         // order of pointers match SWR-Z layout
273         *ppDsts128[0] = pSrc128[0];     // 0 1
274         *ppDsts128[1] = pSrc128[1];     // 2 3
275         *ppDsts128[2] = pSrc128[2];     // 4 5
276         *ppDsts128[3] = pSrc128[3];     // 6 7
277         *ppDsts128[4] = pSrc128[4];     // 8 9
278         *ppDsts128[5] = pSrc128[5];     // A B
279         *ppDsts128[6] = pSrc128[6];     // C D
280         *ppDsts128[7] = pSrc128[7];     // E F
281     }
282 };
283 
284 //////////////////////////////////////////////////////////////////////////
285 /// StorePixels (32-bit pixel specialization)
286 /// @brief Stores a 4x2 (AVX) raster-tile to two rows.
287 /// @param pSrc     - Pointer to source raster tile in SWRZ pixel order
288 /// @param ppDsts   - Array of destination pointers.  Each pointer is
289 ///                   to a single row of at most 16B.
290 /// @tparam NumDests - Number of destination pointers.  Each pair of
291 ///                    pointers is for a 16-byte column of two rows.
292 //////////////////////////////////////////////////////////////////////////
293 template <>
294 struct StorePixels<128, 8>
295 {
296     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
297     {
298         // Each 4-pixel row is 64 bytes.
299         const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
300 
301         // Unswizzle from SWR-Z order
302         simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
303         *pvDsts[0] = pPixSrc[0];
304         *pvDsts[1] = pPixSrc[2];
305         *pvDsts[2] = pPixSrc[1];
306         *pvDsts[3] = pPixSrc[3];
307         *pvDsts[4] = pPixSrc[4];
308         *pvDsts[5] = pPixSrc[6];
309         *pvDsts[6] = pPixSrc[5];
310         *pvDsts[7] = pPixSrc[7];
311     }
312 };
313 
314 template <>
315 struct StorePixels<128, 16>
316 {
317     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
318     {
319         // 16 x 16 bytes = 256 bytes, 16 pixels
320         const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
321 
322         simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
323 
324         for (uint32_t i = 0; i < 16; i += 4)
325         {
326             *ppDsts128[i + 0] = pSrc128[i + 0];
327             *ppDsts128[i + 1] = pSrc128[i + 2];
328             *ppDsts128[i + 2] = pSrc128[i + 1];
329             *ppDsts128[i + 3] = pSrc128[i + 3];
330         }
331     }
332 };
333 
334 //////////////////////////////////////////////////////////////////////////
335 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
336 //////////////////////////////////////////////////////////////////////////
337 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
338 struct ConvertPixelsSOAtoAOS
339 {
340     //////////////////////////////////////////////////////////////////////////
341     /// @brief Converts a SIMD from the Hot Tile to the destination format
342     ///        and converts from SOA to AOS.
343     /// @param pSrc - Pointer to raster tile.
344     /// @param pDst - Pointer to destination surface or deswizzling buffer.
345     template <size_t NumDests>
346     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
347     {
348         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
349 
350         OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES] = {0};
351         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES] = {0};
352 
353         // Convert from SrcFormat --> DstFormat
354         simd16vector src;
355         LoadSOA<SrcFormat>(pSrc, src);
356         StoreSOA<DstFormat>(src, soaTile);
357 
358         // Convert from SOA --> AOS
359         FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile);
360 
361         // Store data into destination
362         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
363     }
364 };
365 
366 //////////////////////////////////////////////////////////////////////////
367 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
368 /// Specialization for no format conversion
369 //////////////////////////////////////////////////////////////////////////
370 template<SWR_FORMAT Format>
371 struct ConvertPixelsSOAtoAOS<Format, Format>
372 {
373     //////////////////////////////////////////////////////////////////////////
374     /// @brief Converts a SIMD from the Hot Tile to the destination format
375     ///        and converts from SOA to AOS.
376     /// @param pSrc - Pointer to raster tile.
377     /// @param pDst - Pointer to destination surface or deswizzling buffer.
378     template <size_t NumDests>
379     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
380     {
381         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
382 
383         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
384 
385         // Convert from SOA --> AOS
386         FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile);
387 
388         // Store data into destination
389         StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
390     }
391 };
392 
393 //////////////////////////////////////////////////////////////////////////
394 /// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R6_UNORM
395 //////////////////////////////////////////////////////////////////////////
396 template<>
397 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
398 {
399     //////////////////////////////////////////////////////////////////////////
400     /// @brief Converts a SIMD from the Hot Tile to the destination format
401     ///        and converts from SOA to AOS.
402     /// @param pSrc - Pointer to raster tile.
403     /// @param pDst - Pointer to destination surface or deswizzling buffer.
404     template <size_t NumDests>
405     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
406     {
407         static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
408         static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
409 
410         static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
411 
412         OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
413 
414         // Load hot-tile
415         simd16vector src, dst;
416         LoadSOA<SrcFormat>(pSrc, src);
417 
418         // deswizzle
419         dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
420         dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
421         dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
422 
423         // clamp
424         dst.x = Clamp<DstFormat>(dst.x, 0);
425         dst.y = Clamp<DstFormat>(dst.y, 1);
426         dst.z = Clamp<DstFormat>(dst.z, 2);
427 
428         // normalize
429         dst.x = Normalize<DstFormat>(dst.x, 0);
430         dst.y = Normalize<DstFormat>(dst.y, 1);
431         dst.z = Normalize<DstFormat>(dst.z, 2);
432 
433         // pack
434         simd16scalari packed = _simd16_castps_si(dst.x);
435 
436         SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
437         SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
438 
439         packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
440         packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
441 
442         // pack low 16 bits of each 32 bit lane to low 128 bits of dst
443         uint32_t *pPacked = (uint32_t*)&packed;
444         uint16_t *pAosTile = (uint16_t*)&aosTile[0];
445         for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
446         {
447             *pAosTile++ = *pPacked++;
448         }
449 
450         // Store data into destination
451         StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
452     }
453 };
454 
455 //////////////////////////////////////////////////////////////////////////
456 /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
457 //////////////////////////////////////////////////////////////////////////
458 template<>
459 struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
460 {
461     static const SWR_FORMAT SrcFormat = R32_FLOAT;
462     static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
463 
464     //////////////////////////////////////////////////////////////////////////
465     /// @brief Converts a SIMD from the Hot Tile to the destination format
466     ///        and converts from SOA to AOS.
467     /// @param pSrc - Pointer to raster tile.
468     /// @param pDst - Pointer to destination surface or deswizzling buffer.
469     template <size_t NumDests>
470     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
471     {
472         simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
473 
474         // clamp
475         const simd16scalar zero = _simd16_setzero_ps();
476         const simd16scalar ones = _simd16_set1_ps(1.0f);
477 
478         comp = _simd16_max_ps(comp, zero);
479         comp = _simd16_min_ps(comp, ones);
480 
481         // normalize
482         comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
483 
484         simd16scalari temp = _simd16_cvtps_epi32(comp);
485 
486         // swizzle
487         temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
488 
489         // merge/store data into destination but don't overwrite the X8 bits
490         simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
491         simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
492 
493         simd16scalari dest = _simd16_setzero_si();
494 
495         dest = _simd16_insert_si(dest, destlo, 0);
496         dest = _simd16_insert_si(dest, desthi, 1);
497 
498         simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF);
499 
500         dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
501 
502         _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
503         _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
504     }
505 };
506 
507 template<SWR_FORMAT DstFormat>
508 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
509 {
510     // swizzle rgba -> bgra while we load
511     simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
512     simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
513     simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
514     simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
515 
516     // clamp
517     const simd16scalar zero = _simd16_setzero_ps();
518     const simd16scalar ones = _simd16_set1_ps(1.0f);
519 
520     comp0 = _simd16_max_ps(comp0, zero);
521     comp0 = _simd16_min_ps(comp0, ones);
522 
523     comp1 = _simd16_max_ps(comp1, zero);
524     comp1 = _simd16_min_ps(comp1, ones);
525 
526     comp2 = _simd16_max_ps(comp2, zero);
527     comp2 = _simd16_min_ps(comp2, ones);
528 
529     comp3 = _simd16_max_ps(comp3, zero);
530     comp3 = _simd16_min_ps(comp3, ones);
531 
532     // gamma-correct only rgb
533     if (FormatTraits<DstFormat>::isSRGB)
534     {
535         comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
536         comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
537         comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
538     }
539 
540     // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
541     comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
542     comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
543     comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
544     comp3 = _simd16_mul_ps(comp3, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
545 
546     // moving to 16 wide integer vector types
547     simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
548     simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
549     simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
550     simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
551 
552     // SOA to AOS conversion
553     src1 = _simd16_slli_epi32(src1,  8);
554     src2 = _simd16_slli_epi32(src2, 16);
555     src3 = _simd16_slli_epi32(src3, 24);
556 
557     simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));  // 0 1 2 3 4 5 6 7 8 9 A B C D E F
558 
559     // de-swizzle conversion
560 #if 1
561     simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
562     simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
563 
564     final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
565 
566 #else
567     final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
568 
569 #endif
570     // store 8x2 memory order:
571     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
572     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
573     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
574     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
575 }
576 
577 template<SWR_FORMAT DstFormat>
578 INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
579 {
580     static const uint32_t offset = sizeof(simdscalar);
581 
582     // swizzle rgba -> bgra while we load
583     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
584     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
585     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
586     simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
587 
588     // clamp
589     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
590     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
591 
592     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
593     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
594 
595     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
596     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
597 
598     vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
599     vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
600 
601     if (FormatTraits<DstFormat>::isSRGB)
602     {
603         // Gamma-correct only rgb
604         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
605         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
606         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
607     }
608 
609     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
610     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
611     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
612     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
613     vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
614 
615     // moving to 8 wide integer vector types
616     simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
617     simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
618     simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
619     simdscalari src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
620 
621 #if KNOB_ARCH <= KNOB_ARCH_AVX
622 
623     // splitting into two sets of 4 wide integer vector types
624     // because AVX doesn't have instructions to support this operation at 8 wide
625     simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
626     simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
627     simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
628     simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
629 
630     simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
631     simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
632     simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
633     simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
634 
635     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
636     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
637     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
638     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
639     srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
640     srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
641 
642     srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
643     srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
644 
645     srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
646     srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
647 
648     srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
649     srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
650 
651     // unpack into rows that get the tiling order correct
652     simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
653     simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
654 
655     simdscalari final = _mm256_castsi128_si256(vRow00);
656     final = _mm256_insertf128_si256(final, vRow10, 1);
657 
658 #else
659 
660     // logic is as above, only wider
661     src1 = _mm256_slli_si256(src1, 1);
662     src2 = _mm256_slli_si256(src2, 2);
663     src3 = _mm256_slli_si256(src3, 3);
664 
665     src0 = _mm256_or_si256(src0, src1);
666     src2 = _mm256_or_si256(src2, src3);
667 
668     simdscalari final = _mm256_or_si256(src0, src2);
669 
670     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
671     final = _mm256_permute4x64_epi64(final, 0xD8);
672 #endif
673 
674     _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
675 }
676 
677 template<SWR_FORMAT DstFormat>
678 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)
679 {
680     // swizzle rgba -> bgra while we load
681     simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
682     simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
683     simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
684 
685     // clamp
686     const simd16scalar zero = _simd16_setzero_ps();
687     const simd16scalar ones = _simd16_set1_ps(1.0f);
688 
689     comp0 = _simd16_max_ps(comp0, zero);
690     comp0 = _simd16_min_ps(comp0, ones);
691 
692     comp1 = _simd16_max_ps(comp1, zero);
693     comp1 = _simd16_min_ps(comp1, ones);
694 
695     comp2 = _simd16_max_ps(comp2, zero);
696     comp2 = _simd16_min_ps(comp2, ones);
697 
698     // gamma-correct only rgb
699     if (FormatTraits<DstFormat>::isSRGB)
700     {
701         comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
702         comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
703         comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
704     }
705 
706     // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
707     comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
708     comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
709     comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
710 
711     // moving to 16 wide integer vector types
712     simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
713     simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
714     simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
715 
716     // SOA to AOS conversion
717     src1 = _simd16_slli_epi32(src1,  8);
718     src2 = _simd16_slli_epi32(src2, 16);
719 
720     simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2);                       // 0 1 2 3 4 5 6 7 8 9 A B C D E F
721 
722     // de-swizzle conversion
723 #if 1
724     simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0)         // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
725     simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1)         // 4 5 6 7 4 5 6 7 C D E F C D E F
726 
727     final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0)                   // 0 1 4 5 2 3 6 7 8 9 C D A B E F
728 
729 #else
730     final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
731 
732 #endif
733     // store 8x2 memory order:
734     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
735     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
736     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
737     _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
738 }
739 
740 template<SWR_FORMAT DstFormat>
741 INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
742 {
743     static const uint32_t offset = sizeof(simdscalar);
744 
745     // swizzle rgba -> bgra while we load
746     simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
747     simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
748     simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
749                                                                                                             // clamp
750     vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
751     vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
752 
753     vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
754     vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
755 
756     vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
757     vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
758 
759     if (FormatTraits<DstFormat>::isSRGB)
760     {
761         // Gamma-correct only rgb
762         vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
763         vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
764         vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
765     }
766 
767     // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
768     vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
769     vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
770     vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
771 
772     // moving to 8 wide integer vector types
773     simdscalari src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
774     simdscalari src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
775     simdscalari src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
776 
777 #if KNOB_ARCH <= KNOB_ARCH_AVX
778 
779     // splitting into two sets of 4 wide integer vector types
780     // because AVX doesn't have instructions to support this operation at 8 wide
781     simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
782     simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
783     simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
784 
785     simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
786     simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
787     simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
788 
789     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
790     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
791     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
792     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
793 
794     srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
795 
796     srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
797 
798     srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
799     srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
800 
801     // unpack into rows that get the tiling order correct
802     simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
803     simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
804 
805     simdscalari final = _mm256_castsi128_si256(vRow00);
806     final = _mm256_insertf128_si256(final, vRow10, 1);
807 
808 #else
809 
810                                               // logic is as above, only wider
811     src1 = _mm256_slli_si256(src1, 1);
812     src2 = _mm256_slli_si256(src2, 2);
813 
814     src0 = _mm256_or_si256(src0, src1);
815 
816     simdscalari final = _mm256_or_si256(src0, src2);
817 
818     // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
819     final = _mm256_permute4x64_epi64(final, 0xD8);
820 
821 #endif
822 
823     _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
824 }
825 
826 template<>
827 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
828 {
829     template <size_t NumDests>
830     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
831     {
832         FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
833     }
834 };
835 
836 template<>
837 struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
838 {
839     template <size_t NumDests>
840     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
841     {
842         FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
843     }
844 };
845 
846 template<>
847 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
848 {
849     template <size_t NumDests>
850     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
851     {
852         FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
853     }
854 };
855 
856 template<>
857 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
858 {
859     template <size_t NumDests>
860     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
861     {
862         FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
863     }
864 };
865 
866 template<>
867 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM >
868 {
869     template <size_t NumDests>
870     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
871     {
872         FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
873     }
874 };
875 
876 template<>
877 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
878 {
879     template <size_t NumDests>
880     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
881     {
882         FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
883     }
884 };
885 
886 template<>
887 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB >
888 {
889     template <size_t NumDests>
890     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
891     {
892         FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
893     }
894 };
895 
896 template<>
897 struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
898 {
899     template <size_t NumDests>
900     INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
901     {
902         FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
903     }
904 };
905 
906 //////////////////////////////////////////////////////////////////////////
907 /// StoreRasterTile
908 //////////////////////////////////////////////////////////////////////////
909 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
910 struct StoreRasterTile
911 {
912     //////////////////////////////////////////////////////////////////////////
913     /// @brief Retrieve color from hot tile source which is always float.
914     /// @param pSrc - Pointer to raster tile.
915     /// @param x, y - Coordinates to raster tile.
916     /// @param output - output color
917     INLINE static void GetSwizzledSrcColor(
918         uint8_t* pSrc,
919         uint32_t x, uint32_t y,
920         float outputColor[4])
921     {
922         typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
923 
924         SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
925 
926         // Compute which simd tile we're accessing within 8x8 tile.
927         //   i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
928         uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
929 
930         SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
931 
932         uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
933 
934         pSimdTile->GetSwizzledColor(simdOffset, outputColor);
935     }
936 
937     //////////////////////////////////////////////////////////////////////////
938     /// @brief Stores an 8x8 raster tile to the destination surface.
939     /// @param pSrc - Pointer to raster tile.
940     /// @param pDstSurface - Destination surface state
941     /// @param x, y - Coordinates to raster tile.
942     INLINE static void Store(
943         uint8_t *pSrc,
944         SWR_SURFACE_STATE* pDstSurface,
945         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
946     {
947         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
948         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
949 
950         // For each raster tile pixel (rx, ry)
951         for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
952         {
953             for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
954             {
955                 // Perform bounds checking.
956                 if (((x + rx) < lodWidth) &&
957                     ((y + ry) < lodHeight))
958                 {
959                     float srcColor[4];
960                     GetSwizzledSrcColor(pSrc, rx, ry, srcColor);
961 
962                     uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
963                         pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex,
964                         sampleNum, pDstSurface->lod, pDstSurface);
965                     {
966                         ConvertPixelFromFloat<DstFormat>(pDst, srcColor);
967                     }
968                 }
969             }
970         }
971     }
972 
973     //////////////////////////////////////////////////////////////////////////
974     /// @brief Resolves an 8x8 raster tile to the resolve destination surface.
975     /// @param pSrc - Pointer to raster tile.
976     /// @param pDstSurface - Destination surface state
977     /// @param x, y - Coordinates to raster tile.
978     /// @param sampleOffset - Offset between adjacent multisamples
979     INLINE static void Resolve(
980         uint8_t *pSrc,
981         SWR_SURFACE_STATE* pDstSurface,
982         uint32_t x, uint32_t y, uint32_t sampleOffset, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile.
983     {
984         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
985         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
986 
987         float oneOverNumSamples = 1.0f / pDstSurface->numSamples;
988 
989         // For each raster tile pixel (rx, ry)
990         for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry)
991         {
992             for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx)
993             {
994                 // Perform bounds checking.
995                 if (((x + rx) < lodWidth) &&
996                         ((y + ry) < lodHeight))
997                 {
998                     // Sum across samples
999                     float resolveColor[4] = {0};
1000                     for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1001                     {
1002                         float sampleColor[4] = {0};
1003                         uint8_t *pSampleSrc = pSrc + sampleOffset * sampleNum;
1004                         GetSwizzledSrcColor(pSampleSrc, rx, ry, sampleColor);
1005                         resolveColor[0] += sampleColor[0];
1006                         resolveColor[1] += sampleColor[1];
1007                         resolveColor[2] += sampleColor[2];
1008                         resolveColor[3] += sampleColor[3];
1009                     }
1010 
1011                     // Divide by numSamples to average
1012                     resolveColor[0] *= oneOverNumSamples;
1013                     resolveColor[1] *= oneOverNumSamples;
1014                     resolveColor[2] *= oneOverNumSamples;
1015                     resolveColor[3] *= oneOverNumSamples;
1016 
1017                     // Use the resolve surface state
1018                     SWR_SURFACE_STATE* pResolveSurface = (SWR_SURFACE_STATE*)pDstSurface->xpAuxBaseAddress;
1019                     uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>((x + rx), (y + ry),
1020                         pResolveSurface->arrayIndex + renderTargetArrayIndex, pResolveSurface->arrayIndex + renderTargetArrayIndex,
1021                         0, pResolveSurface->lod, pResolveSurface);
1022                     {
1023                         ConvertPixelFromFloat<DstFormat>(pDst, resolveColor);
1024                     }
1025                 }
1026             }
1027         }
1028     }
1029 
1030 };
1031 
1032 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1033 struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat>
1034 {};
1035 
1036 //////////////////////////////////////////////////////////////////////////
1037 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp
1038 //////////////////////////////////////////////////////////////////////////
1039 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1040 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
1041 {
1042     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile;
1043     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1044     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1045 
1046     //////////////////////////////////////////////////////////////////////////
1047     /// @brief Stores an 8x8 raster tile to the destination surface.
1048     /// @param pSrc - Pointer to raster tile.
1049     /// @param pDstSurface - Destination surface state
1050     /// @param x, y - Coordinates to raster tile.
1051     INLINE static void Store(
1052         uint8_t *pSrc,
1053         SWR_SURFACE_STATE* pDstSurface,
1054         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1055     {
1056         // Punt non-full tiles to generic store
1057         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1058         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1059 
1060         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1061         {
1062             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1063         }
1064 
1065         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1066             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1067 
1068         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1069         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1070 
1071         uint8_t* ppDsts[] =
1072         {
1073             pDst,                                           // row 0, col 0
1074             pDst + pDstSurface->pitch,                      // row 1, col 0
1075             pDst + dx / 2,                                  // row 0, col 1
1076             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1077         };
1078 
1079         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1080         {
1081             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1082             {
1083                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1084 
1085                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1086 
1087                 ppDsts[0] += dx;
1088                 ppDsts[1] += dx;
1089                 ppDsts[2] += dx;
1090                 ppDsts[3] += dx;
1091             }
1092 
1093             ppDsts[0] += dy;
1094             ppDsts[1] += dy;
1095             ppDsts[2] += dy;
1096             ppDsts[3] += dy;
1097         }
1098     }
1099 };
1100 
1101 //////////////////////////////////////////////////////////////////////////
1102 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
1103 //////////////////////////////////////////////////////////////////////////
1104 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1105 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat>
1106 {
1107     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
1108     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1109     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1110 
1111     //////////////////////////////////////////////////////////////////////////
1112     /// @brief Stores an 8x8 raster tile to the destination surface.
1113     /// @param pSrc - Pointer to raster tile.
1114     /// @param pDstSurface - Destination surface state
1115     /// @param x, y - Coordinates to raster tile.
1116     INLINE static void Store(
1117         uint8_t *pSrc,
1118         SWR_SURFACE_STATE* pDstSurface,
1119         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1120     {
1121         // Punt non-full tiles to generic store
1122         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1123         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1124 
1125         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1126         {
1127             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1128         }
1129 
1130         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1131             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1132 
1133         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1134         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1135 
1136         uint8_t* ppDsts[] =
1137         {
1138             pDst,                                           // row 0, col 0
1139             pDst + pDstSurface->pitch,                      // row 1, col 0
1140             pDst + dx / 2,                                  // row 0, col 1
1141             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1142         };
1143 
1144         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1145         {
1146             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1147             {
1148                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1149 
1150                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1151 
1152                 ppDsts[0] += dx;
1153                 ppDsts[1] += dx;
1154                 ppDsts[2] += dx;
1155                 ppDsts[3] += dx;
1156             }
1157 
1158             ppDsts[0] += dy;
1159             ppDsts[1] += dy;
1160             ppDsts[2] += dy;
1161             ppDsts[3] += dy;
1162         }
1163     }
1164 };
1165 
1166 //////////////////////////////////////////////////////////////////////////
1167 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
1168 //////////////////////////////////////////////////////////////////////////
1169 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1170 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat>
1171 {
1172     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
1173     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1174     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1175 
1176     //////////////////////////////////////////////////////////////////////////
1177     /// @brief Stores an 8x8 raster tile to the destination surface.
1178     /// @param pSrc - Pointer to raster tile.
1179     /// @param pDstSurface - Destination surface state
1180     /// @param x, y - Coordinates to raster tile.
1181     INLINE static void Store(
1182         uint8_t *pSrc,
1183         SWR_SURFACE_STATE* pDstSurface,
1184         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1185     {
1186         // Punt non-full tiles to generic store
1187         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1188         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1189 
1190         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1191         {
1192             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1193         }
1194 
1195         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1196             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1197 
1198         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1199         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1200 
1201         uint8_t* ppDsts[] =
1202         {
1203             pDst,                                           // row 0, col 0
1204             pDst + pDstSurface->pitch,                      // row 1, col 0
1205             pDst + dx / 2,                                  // row 0, col 1
1206             pDst + pDstSurface->pitch + dx / 2              // row 1, col 1
1207         };
1208 
1209         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1210         {
1211             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1212             {
1213                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1214 
1215                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1216 
1217                 ppDsts[0] += dx;
1218                 ppDsts[1] += dx;
1219                 ppDsts[2] += dx;
1220                 ppDsts[3] += dx;
1221             }
1222 
1223             ppDsts[0] += dy;
1224             ppDsts[1] += dy;
1225             ppDsts[2] += dy;
1226             ppDsts[3] += dy;
1227         }
1228     }
1229 };
1230 
1231 //////////////////////////////////////////////////////////////////////////
1232 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
1233 //////////////////////////////////////////////////////////////////////////
1234 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1235 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
1236 {
1237     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
1238     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1239     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1240     static const size_t MAX_DST_COLUMN_BYTES = 16;
1241 
1242     //////////////////////////////////////////////////////////////////////////
1243     /// @brief Stores an 8x8 raster tile to the destination surface.
1244     /// @param pSrc - Pointer to raster tile.
1245     /// @param pDstSurface - Destination surface state
1246     /// @param x, y - Coordinates to raster tile.
1247     INLINE static void Store(
1248         uint8_t *pSrc,
1249         SWR_SURFACE_STATE* pDstSurface,
1250         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1251     {
1252         // Punt non-full tiles to generic store
1253         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1254         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1255 
1256         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1257         {
1258             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1259         }
1260 
1261         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1262             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1263 
1264         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1265         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1266 
1267         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1268         static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
1269 
1270         uint8_t *ppDsts[] =
1271         {
1272             pDst,                                                               // row 0, col 0
1273             pDst + pDstSurface->pitch,                                          // row 1, col 0
1274             pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
1275             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
1276             pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
1277             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
1278             pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
1279             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3                // row 1, col 3
1280         };
1281 
1282         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1283         {
1284             // Raster tile width is same as simd16 tile width
1285             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1286 
1287             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1288 
1289             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1290 
1291             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1292             {
1293                 ppDsts[i] += dy;
1294             }
1295         }
1296     }
1297 };
1298 
1299 //////////////////////////////////////////////////////////////////////////
1300 /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp
1301 //////////////////////////////////////////////////////////////////////////
1302 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1303 struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
1304 {
1305     typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
1306     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1307     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1308     static const size_t MAX_DST_COLUMN_BYTES = 16;
1309 
1310     //////////////////////////////////////////////////////////////////////////
1311     /// @brief Stores an 8x8 raster tile to the destination surface.
1312     /// @param pSrc - Pointer to raster tile.
1313     /// @param pDstSurface - Destination surface state
1314     /// @param x, y - Coordinates to raster tile.
1315     INLINE static void Store(
1316         uint8_t *pSrc,
1317         SWR_SURFACE_STATE* pDstSurface,
1318         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1319     {
1320         // Punt non-full tiles to generic store
1321         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1322         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1323 
1324         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1325         {
1326             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1327         }
1328 
1329         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1330             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1331 
1332         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1333         const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
1334 
1335         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1336         static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
1337 
1338         uint8_t* ppDsts[] =
1339         {
1340             pDst,                                                               // row 0, col 0
1341             pDst + pDstSurface->pitch,                                          // row 1, col 0
1342             pDst + MAX_DST_COLUMN_BYTES,                                        // row 0, col 1
1343             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES,                   // row 1, col 1
1344             pDst + MAX_DST_COLUMN_BYTES * 2,                                    // row 0, col 2
1345             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2,               // row 1, col 2
1346             pDst + MAX_DST_COLUMN_BYTES * 3,                                    // row 0, col 3
1347             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3,               // row 1, col 3
1348             pDst + MAX_DST_COLUMN_BYTES * 4,                                    // row 0, col 4
1349             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 4,               // row 1, col 4
1350             pDst + MAX_DST_COLUMN_BYTES * 5,                                    // row 0, col 5
1351             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 5,               // row 1, col 5
1352             pDst + MAX_DST_COLUMN_BYTES * 6,                                    // row 0, col 6
1353             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 6,               // row 1, col 6
1354             pDst + MAX_DST_COLUMN_BYTES * 7,                                    // row 0, col 7
1355             pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 7,               // row 1, col 7
1356         };
1357 
1358         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1359         {
1360             // Raster tile width is same as simd16 tile width
1361             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1362 
1363             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1364 
1365             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1366 
1367             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1368             {
1369                 ppDsts[i] += dy;
1370             }
1371         }
1372     }
1373 };
1374 
1375 //////////////////////////////////////////////////////////////////////////
1376 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp
1377 //////////////////////////////////////////////////////////////////////////
1378 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1379 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
1380 {
1381     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
1382     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1383 
1384     //////////////////////////////////////////////////////////////////////////
1385     /// @brief Stores an 8x8 raster tile to the destination surface.
1386     /// @param pSrc - Pointer to raster tile.
1387     /// @param pDstSurface - Destination surface state
1388     /// @param x, y - Coordinates to raster tile.
1389     INLINE static void Store(
1390         uint8_t *pSrc,
1391         SWR_SURFACE_STATE* pDstSurface,
1392         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1393     {
1394         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1395 
1396         // Punt non-full tiles to generic store
1397         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1398         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1399 
1400         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1401         {
1402             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1403         }
1404 
1405         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1406         // We can compute the offsets to each column within the raster tile once and increment from these.
1407         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1408         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1409             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1410 
1411         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1412 
1413         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1414         uint8_t *ppDsts[] =
1415         {
1416             pDst,
1417             pDst + DestRowWidthBytes,
1418             pDst + DestRowWidthBytes / 4,
1419             pDst + DestRowWidthBytes + DestRowWidthBytes / 4
1420         };
1421 
1422         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1423         {
1424             // Raster tile width is same as simd16 tile width
1425             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1426 
1427             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1428 
1429             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1430 
1431             ppDsts[0] += dy;
1432             ppDsts[1] += dy;
1433             ppDsts[2] += dy;
1434             ppDsts[3] += dy;
1435         }
1436     }
1437 };
1438 
1439 //////////////////////////////////////////////////////////////////////////
1440 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp
1441 //////////////////////////////////////////////////////////////////////////
1442 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1443 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
1444 {
1445     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
1446     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1447 
1448     //////////////////////////////////////////////////////////////////////////
1449     /// @brief Stores an 8x8 raster tile to the destination surface.
1450     /// @param pSrc - Pointer to raster tile.
1451     /// @param pDstSurface - Destination surface state
1452     /// @param x, y - Coordinates to raster tile.
1453     INLINE static void Store(
1454         uint8_t *pSrc,
1455         SWR_SURFACE_STATE* pDstSurface,
1456         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1457     {
1458         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1459 
1460         // Punt non-full tiles to generic store
1461         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1462         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1463 
1464         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1465         {
1466             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1467         }
1468 
1469         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1470         // We can compute the offsets to each column within the raster tile once and increment from these.
1471         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1472         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1473             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1474 
1475         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1476 
1477         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1478         uint8_t *ppDsts[] =
1479         {
1480             pDst,
1481             pDst + DestRowWidthBytes,
1482             pDst + DestRowWidthBytes / 2,
1483             pDst + DestRowWidthBytes + DestRowWidthBytes / 2
1484         };
1485 
1486         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1487         {
1488             // Raster tile width is same as simd16 tile width
1489             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1490 
1491             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1492 
1493             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1494 
1495             ppDsts[0] += dy;
1496             ppDsts[1] += dy;
1497             ppDsts[2] += dy;
1498             ppDsts[3] += dy;
1499         }
1500     }
1501 };
1502 
1503 //////////////////////////////////////////////////////////////////////////
1504 /// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp
1505 //////////////////////////////////////////////////////////////////////////
1506 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1507 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
1508 {
1509     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1510     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1511     static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
1512 
1513     //////////////////////////////////////////////////////////////////////////
1514     /// @brief Stores an 8x8 raster tile to the destination surface.
1515     /// @param pSrc - Pointer to raster tile.
1516     /// @param pDstSurface - Destination surface state
1517     /// @param x, y - Coordinates to raster tile.
1518     INLINE static void Store(
1519         uint8_t *pSrc,
1520         SWR_SURFACE_STATE* pDstSurface,
1521         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1522     {
1523         static const uint32_t DestRowWidthBytes = 512;                   // 512B rows
1524 
1525         // Punt non-full tiles to generic store
1526         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1527         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1528 
1529         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1530         {
1531             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1532         }
1533 
1534         // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
1535         // We can compute the offsets to each column within the raster tile once and increment from these.
1536         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1537             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1538 
1539         const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1540         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
1541 
1542         uint8_t* ppDsts[] =
1543         {
1544             pDst,                                           // row 0, col 0
1545             pDst + DestRowWidthBytes,                       // row 1, col 0
1546             pDst + dx / 2,                                  // row 0, col 1
1547             pDst + DestRowWidthBytes + dx / 2               // row 1, col 1
1548         };
1549 
1550         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1551         {
1552             for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
1553             {
1554                 ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1555 
1556                 pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1557 
1558                 ppDsts[0] += dx;
1559                 ppDsts[1] += dx;
1560                 ppDsts[2] += dx;
1561                 ppDsts[3] += dx;
1562             }
1563 
1564             ppDsts[0] += dy;
1565             ppDsts[1] += dy;
1566             ppDsts[2] += dy;
1567             ppDsts[3] += dy;
1568         }
1569     }
1570 };
1571 
1572 //////////////////////////////////////////////////////////////////////////
1573 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp
1574 //////////////////////////////////////////////////////////////////////////
1575 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1576 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat>
1577 {
1578     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
1579     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1580 
1581     //////////////////////////////////////////////////////////////////////////
1582     /// @brief Stores an 8x8 raster tile to the destination surface.
1583     /// @param pSrc - Pointer to raster tile.
1584     /// @param pDstSurface - Destination surface state
1585     /// @param x, y - Coordinates to raster tile.
1586     INLINE static void Store(
1587         uint8_t *pSrc,
1588         SWR_SURFACE_STATE* pDstSurface,
1589         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1590     {
1591         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1592         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
1593 
1594         // Punt non-full tiles to generic store
1595         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1596         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1597 
1598         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1599         {
1600             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1601         }
1602 
1603         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1604         // We can compute the offsets to each column within the raster tile once and increment from these.
1605         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1606         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1607             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1608 
1609         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1610         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1611 
1612         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1613         uint8_t *ppDsts[] =
1614         {
1615             pDst,                                           // row 0, col 0
1616             pDst + DestRowWidthBytes,                       // row 1, col 0
1617             pDst + DestColumnBytes,                         // row 0, col 1
1618             pDst + DestRowWidthBytes + DestColumnBytes      // row 1, col 1
1619         };
1620 
1621         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1622         {
1623             // Raster tile width is same as simd16 tile width
1624             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1625 
1626             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1627 
1628             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1629 
1630             ppDsts[0] += dy;
1631             ppDsts[1] += dy;
1632             ppDsts[2] += dy;
1633             ppDsts[3] += dy;
1634         }
1635     }
1636 };
1637 
1638 //////////////////////////////////////////////////////////////////////////
1639 /// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp
1640 //////////////////////////////////////////////////////////////////////////
1641 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1642 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
1643 {
1644     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
1645     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1646 
1647     //////////////////////////////////////////////////////////////////////////
1648     /// @brief Stores an 8x8 raster tile to the destination surface.
1649     /// @param pSrc - Pointer to raster tile.
1650     /// @param pDstSurface - Destination surface state
1651     /// @param x, y - Coordinates to raster tile.
1652     INLINE static void Store(
1653         uint8_t *pSrc,
1654         SWR_SURFACE_STATE* pDstSurface,
1655         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1656     {
1657         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1658         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
1659 
1660         // Punt non-full tiles to generic store
1661         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1662         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1663 
1664         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1665         {
1666             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1667         }
1668 
1669         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1670         // We can compute the offsets to each column within the raster tile once and increment from these.
1671         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1672         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1673             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1674 
1675         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1676         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1677 
1678         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1679         uint8_t *ppDsts[] =
1680         {
1681             pDst,                                           // row 0, col 0
1682             pDst + DestRowWidthBytes,                       // row 1, col 0
1683             pDst + DestColumnBytes,                         // row 0, col 1
1684             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
1685             pDst + DestColumnBytes * 2,                     // row 0, col 2
1686             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
1687             pDst + DestColumnBytes * 3,                     // row 0, col 3
1688             pDst + DestRowWidthBytes + DestColumnBytes * 3  // row 1, col 3
1689         };
1690 
1691         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1692         {
1693             // Raster tile width is same as simd16 tile width
1694             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1695 
1696             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1697 
1698             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1699 
1700             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1701             {
1702                 ppDsts[i] += dy;
1703             }
1704         }
1705     }
1706 };
1707 
1708 //////////////////////////////////////////////////////////////////////////
1709 /// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp
1710 //////////////////////////////////////////////////////////////////////////
1711 template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1712 struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
1713 {
1714     typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> GenericStoreTile;
1715     static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
1716 
1717     //////////////////////////////////////////////////////////////////////////
1718     /// @brief Stores an 8x8 raster tile to the destination surface.
1719     /// @param pSrc - Pointer to raster tile.
1720     /// @param pDstSurface - Destination surface state
1721     /// @param x, y - Coordinates to raster tile.
1722     INLINE static void Store(
1723         uint8_t *pSrc,
1724         SWR_SURFACE_STATE* pDstSurface,
1725         uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
1726     {
1727         static const uint32_t DestRowWidthBytes = 16;                    // 16B rows
1728         static const uint32_t DestColumnBytes = DestRowWidthBytes * 32;  // 16B x 32 rows.
1729 
1730         // Punt non-full tiles to generic store
1731         uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
1732         uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
1733 
1734         if (x + KNOB_TILE_X_DIM > lodWidth || y + KNOB_TILE_Y_DIM > lodHeight)
1735         {
1736             return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
1737         }
1738 
1739         // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
1740         // We can compute the offsets to each column within the raster tile once and increment from these.
1741         // There will be 4 8x2 simd tiles in an 8x8 raster tile.
1742         uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
1743             pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
1744 
1745         // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
1746         const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
1747 
1748         // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
1749         uint8_t *ppDsts[] =
1750         {
1751             pDst,                                           // row 0, col 0
1752             pDst + DestRowWidthBytes,                       // row 1, col 0
1753             pDst + DestColumnBytes,                         // row 0, col 1
1754             pDst + DestRowWidthBytes + DestColumnBytes,     // row 1, col 1
1755             pDst + DestColumnBytes * 2,                     // row 0, col 2
1756             pDst + DestRowWidthBytes + DestColumnBytes * 2, // row 1, col 2
1757             pDst + DestColumnBytes * 3,                     // row 0, col 3
1758             pDst + DestRowWidthBytes + DestColumnBytes * 3, // row 1, col 3
1759             pDst + DestColumnBytes * 4,                     // row 0, col 4
1760             pDst + DestRowWidthBytes + DestColumnBytes * 4, // row 1, col 4
1761             pDst + DestColumnBytes * 5,                     // row 0, col 5
1762             pDst + DestRowWidthBytes + DestColumnBytes * 5, // row 1, col 5
1763             pDst + DestColumnBytes * 6,                     // row 0, col 6
1764             pDst + DestRowWidthBytes + DestColumnBytes * 6, // row 1, col 6
1765             pDst + DestColumnBytes * 7,                     // row 0, col 7
1766             pDst + DestRowWidthBytes + DestColumnBytes * 7  // row 1, col 7
1767         };
1768 
1769         for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
1770         {
1771             // Raster tile width is same as simd16 tile width
1772             static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
1773 
1774             ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
1775 
1776             pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
1777 
1778             for (uint32_t i = 0; i < ARRAY_SIZE(ppDsts); i += 1)
1779             {
1780                 ppDsts[i] += dy;
1781             }
1782         }
1783     }
1784 };
1785 
1786 //////////////////////////////////////////////////////////////////////////
1787 /// StoreMacroTile - Stores a macro tile which consists of raster tiles.
1788 //////////////////////////////////////////////////////////////////////////
1789 template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
1790 struct StoreMacroTile
1791 {
1792     //////////////////////////////////////////////////////////////////////////
1793     /// @brief Stores a macrotile to the destination surface using safe implementation.
1794     /// @param pSrc - Pointer to macro tile.
1795     /// @param pDstSurface - Destination surface state
1796     /// @param x, y - Coordinates to macro tile
1797     static void StoreGeneric(
1798         uint8_t *pSrcHotTile,
1799         SWR_SURFACE_STATE* pDstSurface,
1800         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1801     {
1802         PFN_STORE_TILES_INTERNAL pfnStore;
1803         pfnStore = StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1804 
1805         // Store each raster tile from the hot tile to the destination surface.
1806         for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1807         {
1808             for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1809             {
1810                 for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1811                 {
1812                     pfnStore(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1813                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1814                 }
1815             }
1816         }
1817 
1818     }
1819 
1820     typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
1821     //////////////////////////////////////////////////////////////////////////
1822     /// @brief Stores a macrotile to the destination surface.
1823     /// @param pSrc - Pointer to macro tile.
1824     /// @param pDstSurface - Destination surface state
1825     /// @param x, y - Coordinates to macro tile
1826     static void Store(
1827         uint8_t *pSrcHotTile,
1828         SWR_SURFACE_STATE* pDstSurface,
1829         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
1830     {
1831         PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
1832 
1833         for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1834         {
1835             size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(
1836                 0,
1837                 0,
1838                 pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
1839                 pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
1840                 sampleNum,
1841                 pDstSurface->lod,
1842                 pDstSurface);
1843 
1844             // Only support generic store-tile if lod surface doesn't start on a page boundary and is non-linear
1845             bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) ||
1846                 (pDstSurface->bInterleavedSamples);
1847 
1848             pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
1849         }
1850 
1851         // Save original for pSrcHotTile resolve.
1852         uint8_t *pResolveSrcHotTile = pSrcHotTile;
1853 
1854         // Store each raster tile from the hot tile to the destination surface.
1855         for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1856         {
1857             for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1858             {
1859                 for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
1860                 {
1861                     pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
1862                     pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1863                 }
1864             }
1865         }
1866 
1867         if (pDstSurface->xpAuxBaseAddress)
1868         {
1869             uint32_t sampleOffset = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
1870             // Store each raster tile from the hot tile to the destination surface.
1871             for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
1872             {
1873                 for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
1874                 {
1875                     StoreRasterTile<TTraits, SrcFormat, DstFormat>::Resolve(pResolveSrcHotTile, pDstSurface, (x + col), (y + row), sampleOffset, renderTargetArrayIndex);
1876                     pResolveSrcHotTile += sampleOffset * pDstSurface->numSamples;
1877                 }
1878             }
1879         }
1880     }
1881 };
1882 
1883 //////////////////////////////////////////////////////////////////////////
1884 /// InitStoreTilesTable - Helper for setting up the tables.
1885 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1886 void InitStoreTilesTableColor_Half1(
1887     PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT])
1888 {
1889     table[TTileMode][R32G32B32A32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store;
1890     table[TTileMode][R32G32B32A32_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store;
1891     table[TTileMode][R32G32B32A32_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store;
1892     table[TTileMode][R32G32B32X32_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store;
1893     table[TTileMode][R32G32B32A32_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SSCALED>::Store;
1894     table[TTileMode][R32G32B32A32_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_USCALED>::Store;
1895     table[TTileMode][R32G32B32_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store;
1896     table[TTileMode][R32G32B32_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store;
1897     table[TTileMode][R32G32B32_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store;
1898     table[TTileMode][R32G32B32_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_SSCALED>::Store;
1899     table[TTileMode][R32G32B32_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 96>, R32G32B32A32_FLOAT, R32G32B32_USCALED>::Store;
1900     table[TTileMode][R16G16B16A16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store;
1901     table[TTileMode][R16G16B16A16_SNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store;
1902     table[TTileMode][R16G16B16A16_SINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store;
1903     table[TTileMode][R16G16B16A16_UINT]             = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store;
1904     table[TTileMode][R16G16B16A16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store;
1905     table[TTileMode][R32G32_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store;
1906     table[TTileMode][R32G32_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store;
1907     table[TTileMode][R32G32_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store;
1908     table[TTileMode][R32_FLOAT_X8X24_TYPELESS]      = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
1909     table[TTileMode][X32_TYPELESS_G8X24_UINT]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, X32_TYPELESS_G8X24_UINT>::Store;
1910     table[TTileMode][R16G16B16X16_UNORM]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store;
1911     table[TTileMode][R16G16B16X16_FLOAT]            = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store;
1912     table[TTileMode][R16G16B16A16_SSCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SSCALED>::Store;
1913     table[TTileMode][R16G16B16A16_USCALED]          = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R16G16B16A16_USCALED>::Store;
1914     table[TTileMode][R32G32_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_SSCALED>::Store;
1915     table[TTileMode][R32G32_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 64>, R32G32B32A32_FLOAT, R32G32_USCALED>::Store;
1916     table[TTileMode][B8G8R8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store;
1917     table[TTileMode][B8G8R8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store;
1918     table[TTileMode][R10G10B10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric;
1919     table[TTileMode][R10G10B10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric;
1920     table[TTileMode][R10G10B10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric;
1921     table[TTileMode][R8G8B8A8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store;
1922     table[TTileMode][R8G8B8A8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store;
1923     table[TTileMode][R8G8B8A8_SNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store;
1924     table[TTileMode][R8G8B8A8_SINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store;
1925     table[TTileMode][R8G8B8A8_UINT]                 = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store;
1926     table[TTileMode][R16G16_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store;
1927     table[TTileMode][R16G16_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store;
1928     table[TTileMode][R16G16_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store;
1929     table[TTileMode][R16G16_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store;
1930     table[TTileMode][R16G16_FLOAT]                  = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store;
1931     table[TTileMode][B10G10R10A2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric;
1932     table[TTileMode][B10G10R10A2_UNORM_SRGB]        = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric;
1933     table[TTileMode][R11G11B10_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric;
1934     table[TTileMode][R10G10B10_FLOAT_A2_UNORM]      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10_FLOAT_A2_UNORM>::StoreGeneric;
1935     table[TTileMode][R32_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store;
1936     table[TTileMode][R32_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store;
1937     table[TTileMode][R32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store;
1938     table[TTileMode][R24_UNORM_X8_TYPELESS]         = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreGeneric;
1939     table[TTileMode][X24_TYPELESS_G8_UINT]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, X24_TYPELESS_G8_UINT>::StoreGeneric;
1940     table[TTileMode][A32_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store;
1941     table[TTileMode][B8G8R8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store;
1942     table[TTileMode][B8G8R8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store;
1943     table[TTileMode][R8G8B8X8_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store;
1944     table[TTileMode][R8G8B8X8_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store;
1945 }
1946 
1947 template <SWR_TILE_MODE TTileMode, size_t NumTileModesT, size_t ArraySizeT>
1948 void InitStoreTilesTableColor_Half2(
1949     PFN_STORE_TILES(&table)[NumTileModesT][ArraySizeT])
1950 {
1951     table[TTileMode][R9G9B9E5_SHAREDEXP]            = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R9G9B9E5_SHAREDEXP>::StoreGeneric;
1952     table[TTileMode][B10G10R10X2_UNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric;
1953     table[TTileMode][R10G10B10X2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10X2_USCALED>::StoreGeneric;
1954     table[TTileMode][R8G8B8A8_SSCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SSCALED>::Store;
1955     table[TTileMode][R8G8B8A8_USCALED]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R8G8B8A8_USCALED>::Store;
1956     table[TTileMode][R16G16_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_SSCALED>::Store;
1957     table[TTileMode][R16G16_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R16G16_USCALED>::Store;
1958     table[TTileMode][R32_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_SSCALED>::Store;
1959     table[TTileMode][R32_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R32_USCALED>::Store;
1960     table[TTileMode][B5G6R5_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store;
1961     table[TTileMode][B5G6R5_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric;
1962     table[TTileMode][B5G5R5A1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric;
1963     table[TTileMode][B5G5R5A1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric;
1964     table[TTileMode][B4G4R4A4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric;
1965     table[TTileMode][B4G4R4A4_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric;
1966     table[TTileMode][R8G8_UNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store;
1967     table[TTileMode][R8G8_SNORM]                    = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store;
1968     table[TTileMode][R8G8_SINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store;
1969     table[TTileMode][R8G8_UINT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store;
1970     table[TTileMode][R16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store;
1971     table[TTileMode][R16_SNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store;
1972     table[TTileMode][R16_SINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store;
1973     table[TTileMode][R16_UINT]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store;
1974     table[TTileMode][R16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store;
1975     table[TTileMode][A16_UNORM]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store;
1976     table[TTileMode][A16_FLOAT]                     = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store;
1977     table[TTileMode][B5G5R5X1_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric;
1978     table[TTileMode][B5G5R5X1_UNORM_SRGB]           = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric;
1979     table[TTileMode][R8G8_SSCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_SSCALED>::Store;
1980     table[TTileMode][R8G8_USCALED]                  = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R8G8_USCALED>::Store;
1981     table[TTileMode][R16_SSCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_SSCALED>::Store;
1982     table[TTileMode][R16_USCALED]                   = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, R16_USCALED>::Store;
1983     table[TTileMode][A1B5G5R5_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A1B5G5R5_UNORM>::StoreGeneric;
1984     table[TTileMode][A4B4G4R4_UNORM]                = StoreMacroTile<TilingTraits<TTileMode, 16>, R32G32B32A32_FLOAT, A4B4G4R4_UNORM>::StoreGeneric;
1985     table[TTileMode][R8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store;
1986     table[TTileMode][R8_SNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store;
1987     table[TTileMode][R8_SINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store;
1988     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store;
1989     table[TTileMode][A8_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store;
1990     table[TTileMode][R8_SSCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_SSCALED>::Store;
1991     table[TTileMode][R8_USCALED]                    = StoreMacroTile<TilingTraits<TTileMode, 8>, R32G32B32A32_FLOAT, R8_USCALED>::Store;
1992     table[TTileMode][R8G8B8_UNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store;
1993     table[TTileMode][R8G8B8_SNORM]                  = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store;
1994     table[TTileMode][R8G8B8_SSCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SSCALED>::Store;
1995     table[TTileMode][R8G8B8_USCALED]                = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_USCALED>::Store;
1996     table[TTileMode][R16G16B16_FLOAT]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store;
1997     table[TTileMode][R16G16B16_UNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
1998     table[TTileMode][R16G16B16_SNORM]               = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
1999     table[TTileMode][R16G16B16_SSCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SSCALED>::Store;
2000     table[TTileMode][R16G16B16_USCALED]             = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_USCALED>::Store;
2001     table[TTileMode][R8G8B8_UNORM_SRGB]             = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
2002     table[TTileMode][R16G16B16_UINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
2003     table[TTileMode][R16G16B16_SINT]                = StoreMacroTile<TilingTraits<TTileMode, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
2004     table[TTileMode][R10G10B10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
2005     table[TTileMode][R10G10B10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_USCALED>::StoreGeneric;
2006     table[TTileMode][R10G10B10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SSCALED>::StoreGeneric;
2007     table[TTileMode][R10G10B10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
2008     table[TTileMode][B10G10R10A2_SNORM]             = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
2009     table[TTileMode][B10G10R10A2_USCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_USCALED>::StoreGeneric;
2010     table[TTileMode][B10G10R10A2_SSCALED]           = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SSCALED>::StoreGeneric;
2011     table[TTileMode][B10G10R10A2_UINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
2012     table[TTileMode][B10G10R10A2_SINT]              = StoreMacroTile<TilingTraits<TTileMode, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
2013     table[TTileMode][R8G8B8_UINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
2014     table[TTileMode][R8G8B8_SINT]                   = StoreMacroTile<TilingTraits<TTileMode, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
2015 }
2016 
2017 //////////////////////////////////////////////////////////////////////////
2018 /// INIT_STORE_TILES_TABLE - Helper macro for setting up the tables.
2019 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2020 void InitStoreTilesTableDepth(
2021     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2022 {
2023    table[TTileMode][R32_FLOAT]                      = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R32_FLOAT>::Store;
2024    table[TTileMode][R32_FLOAT_X8X24_TYPELESS]       = StoreMacroTile<TilingTraits<TTileMode, 64>, R32_FLOAT, R32_FLOAT_X8X24_TYPELESS>::Store;
2025    table[TTileMode][R24_UNORM_X8_TYPELESS]          = StoreMacroTile<TilingTraits<TTileMode, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
2026    table[TTileMode][R16_UNORM]                      = StoreMacroTile<TilingTraits<TTileMode, 16>, R32_FLOAT, R16_UNORM>::Store;
2027 }
2028 
2029 template <SWR_TILE_MODE TTileMode, size_t NumTileModes, size_t ArraySizeT>
2030 void InitStoreTilesTableStencil(
2031     PFN_STORE_TILES(&table)[NumTileModes][ArraySizeT])
2032 {
2033     table[TTileMode][R8_UINT]                       = StoreMacroTile<TilingTraits<TTileMode, 8>, R8_UINT, R8_UINT>::Store;
2034 }
2035 
2036 
2037 //////////////////////////////////////////////////////////////////////////
2038 /// @brief Deswizzles and stores a full hottile to a render surface
2039 /// @param hPrivateContext - Handle to private DC
2040 /// @param srcFormat - Format for hot tile.
2041 /// @param renderTargetIndex - Index to destination render target
2042 /// @param x, y - Coordinates to raster tile.
2043 /// @param pSrcHotTile - Pointer to Hot Tile
2044 void SwrStoreHotTileToSurface(
2045         HANDLE hWorkerPrivateData,
2046         SWR_SURFACE_STATE *pDstSurface,
2047 	 BucketManager* pBucketMgr,
2048         SWR_FORMAT srcFormat,
2049         SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
2050         uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
2051         uint8_t *pSrcHotTile);
2052