1 /*
2  * Mesa 3-D graphics library
3  *
4  * Copyright 2012 Intel Corporation
5  * Copyright 2013 Google
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the
9  * "Software"), to deal in the Software without restriction, including
10  * without limitation the rights to use, copy, modify, merge, publish,
11  * distribute, sublicense, and/or sell copies of the Software, and to
12  * permit persons to whom the Software is furnished to do so, subject to
13  * the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the
16  * next paragraph) shall be included in all copies or substantial portions
17  * of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26  *
27  * Authors:
28  *    Chad Versace <chad.versace@linux.intel.com>
29  *    Frank Henigman <fjhenigman@google.com>
30  */
31 
32 #include <string.h>
33 
34 #include "util/macros.h"
35 
36 #include "brw_context.h"
37 #include "intel_tiled_memcpy.h"
38 
39 #if defined(__SSSE3__)
40 #include <tmmintrin.h>
41 #elif defined(__SSE2__)
42 #include <emmintrin.h>
43 #endif
44 
/* Debug category used by this file's debug output. */
#define FILE_DEBUG_FLAG DEBUG_TEXTURE

/* Local spellings for rounding 'value' down/up to a multiple of 'alignment';
 * both simply forward to the shared helper macros.
 */
#define ALIGN_DOWN(value, alignment) ROUND_DOWN_TO(value, alignment)
#define ALIGN_UP(value, alignment)   ALIGN(value, alignment)
49 
/* Tile geometry.
 *
 * Widths and spans are byte counts; heights are row counts.  A "span" is
 * the largest run of bytes that can be copied between linear and tiled
 * memory without recomputing the tiled-side address.
 */

/* X tiling: 512 bytes x 8 rows, 64-byte spans. */
static const uint32_t xtile_width  = 512u;
static const uint32_t xtile_height = 8u;
static const uint32_t xtile_span   = 64u;

/* Y tiling: 128 bytes x 32 rows, 16-byte spans. */
static const uint32_t ytile_width  = 128u;
static const uint32_t ytile_height = 32u;
static const uint32_t ytile_span   = 16u;
60 
/**
 * Rotate the 32-bit value 'n' right by 'd' bit positions.
 *
 * The rotate count is taken modulo 32.  Both shift counts are masked so that
 * neither shift is ever performed by 32 or more bits, which would be
 * undefined behaviour in C (this previously happened for d == 0).
 */
static inline uint32_t
ror(uint32_t n, uint32_t d)
{
   const uint32_t right = d & 31u;
   const uint32_t left = (32u - d) & 31u;

   return (n >> right) | (n << left);
}
66 
/**
 * Reverse the byte order of a 32-bit value.
 *
 * Uses the compiler builtin when the build system reports it is available,
 * otherwise falls back to explicit shifts and masks.
 */
static inline uint32_t
bswap32(uint32_t n)
{
#if defined(HAVE___BUILTIN_BSWAP32)
   return __builtin_bswap32(n);
#else
   const uint32_t lowest  = n & 0xffu;
   const uint32_t low     = (n >> 8) & 0xffu;
   const uint32_t high    = (n >> 16) & 0xffu;
   const uint32_t highest = n >> 24;

   return (lowest << 24) | (low << 16) | (high << 8) | highest;
#endif
}
79 
/**
 * Copy RGBA to BGRA - swap R and B.
 *
 * Copies 'bytes' bytes (which must be a multiple of 4) from 'src' to 'dst',
 * one 32-bit pixel at a time, exchanging the R and B channels of each pixel.
 *
 * Neither pointer needs any particular alignment: every pixel is moved
 * through a local uint32_t with memcpy(), so no unaligned or type-punned
 * uint32_t access is ever performed on the caller's buffers.
 *
 * \return 'dst', like memcpy().
 */
static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
{
   unsigned char *d = dst;
   const unsigned char *s = src;

   assert(bytes % 4 == 0);

   while (bytes >= 4) {
      uint32_t pixel;

      memcpy(&pixel, s, sizeof(pixel));
      pixel = ror(bswap32(pixel), 8);
      memcpy(d, &pixel, sizeof(pixel));

      d += 4;
      s += 4;
      bytes -= 4;
   }
   return dst;
}
99 
100 #ifdef __SSSE3__
/* Byte shuffle control for four packed 4-byte pixels: within every pixel,
 * bytes 0 and 2 trade places while bytes 1 and 3 stay where they are.
 */
static const uint8_t rgba8_permutation[16] = {
    2,  1,  0,  3,
    6,  5,  4,  7,
   10,  9,  8, 11,
   14, 13, 12, 15,
};
103 
104 static inline void
rgba8_copy_16_aligned_dst(void * dst,const void * src)105 rgba8_copy_16_aligned_dst(void *dst, const void *src)
106 {
107    _mm_store_si128(dst,
108                    _mm_shuffle_epi8(_mm_loadu_si128(src),
109                                     *(__m128i *)rgba8_permutation));
110 }
111 
112 static inline void
rgba8_copy_16_aligned_src(void * dst,const void * src)113 rgba8_copy_16_aligned_src(void *dst, const void *src)
114 {
115    _mm_storeu_si128(dst,
116                     _mm_shuffle_epi8(_mm_load_si128(src),
117                                      *(__m128i *)rgba8_permutation));
118 }
119 
120 #elif defined(__SSE2__)
/**
 * SSE2 version: copy 16 bytes (four pixels) from 'src' to 'dst', swapping
 * bytes 0 and 2 of every 4-byte pixel.
 *
 * 'dst' must be 16-byte aligned (aligned store); 'src' may have any
 * alignment (unaligned load).
 */
static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
{
   /* Selects bytes 0 and 2 of each pixel; its complement selects 1 and 3. */
   const __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   const __m128i pixels = _mm_loadu_si128((const __m128i *)src);

   const __m128i kept = _mm_andnot_si128(rb_mask, pixels);
   __m128i swapped = _mm_and_si128(rb_mask, pixels);

   /* Exchange the two 16-bit halves of every pixel, low quadword first and
    * then the high quadword, which trades bytes 0 and 2.
    */
   swapped = _mm_shufflelo_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));
   swapped = _mm_shufflehi_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));

   _mm_store_si128((__m128i *)dst, _mm_or_si128(kept, swapped));
}
137 
/**
 * SSE2 version: copy 16 bytes (four pixels) from 'src' to 'dst', swapping
 * bytes 0 and 2 of every 4-byte pixel.
 *
 * 'src' must be 16-byte aligned (aligned load); 'dst' may have any
 * alignment (unaligned store).
 */
static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
{
   /* Selects bytes 0 and 2 of each pixel; its complement selects 1 and 3. */
   const __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
   const __m128i pixels = _mm_load_si128((const __m128i *)src);

   const __m128i kept = _mm_andnot_si128(rb_mask, pixels);
   __m128i swapped = _mm_and_si128(rb_mask, pixels);

   /* Exchange the two 16-bit halves of every pixel, low quadword first and
    * then the high quadword, which trades bytes 0 and 2.
    */
   swapped = _mm_shufflelo_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));
   swapped = _mm_shufflehi_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1));

   _mm_storeu_si128((__m128i *)dst, _mm_or_si128(kept, swapped));
}
154 #endif
155 
/**
 * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
 *
 * When SSE2/SSSE3 is available the bulk of the copy is done 16 bytes at a
 * time (with a fully unrolled path for exactly 64 bytes); whatever remains
 * is handed to rgba8_copy().
 *
 * The copy walks private byte cursors rather than doing arithmetic on the
 * 'void *' arguments (which ISO C does not permit), and the value returned
 * is always the original 'dst' regardless of which path was taken.
 *
 * \param dst    destination; must be 16-byte aligned unless 'bytes' is 0
 * \param src    source; any alignment
 * \param bytes  number of bytes to copy
 * \return 'dst', like memcpy().
 */
static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
{
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_dst(d +  0, s +  0);
      rgba8_copy_16_aligned_dst(d + 16, s + 16);
      rgba8_copy_16_aligned_dst(d + 32, s + 32);
      rgba8_copy_16_aligned_dst(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_dst(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Tail (or everything, without SIMD support). */
   rgba8_copy(d, s, bytes);

   return dst;
}
185 
/**
 * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
 *
 * When SSE2/SSSE3 is available the bulk of the copy is done 16 bytes at a
 * time (with a fully unrolled path for exactly 64 bytes); whatever remains
 * is handed to rgba8_copy().
 *
 * The copy walks private byte cursors rather than doing arithmetic on the
 * 'void *' arguments (which ISO C does not permit), and the value returned
 * is always the original 'dst' regardless of which path was taken.
 *
 * \param dst    destination; any alignment
 * \param src    source; must be 16-byte aligned unless 'bytes' is 0
 * \param bytes  number of bytes to copy
 * \return 'dst', like memcpy().
 */
static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
{
   char *d = dst;
   const char *s = src;

   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));

#if defined(__SSSE3__) || defined(__SSE2__)
   if (bytes == 64) {
      rgba8_copy_16_aligned_src(d +  0, s +  0);
      rgba8_copy_16_aligned_src(d + 16, s + 16);
      rgba8_copy_16_aligned_src(d + 32, s + 32);
      rgba8_copy_16_aligned_src(d + 48, s + 48);
      return dst;
   }

   while (bytes >= 16) {
      rgba8_copy_16_aligned_src(d, s);
      s += 16;
      d += 16;
      bytes -= 16;
   }
#endif

   /* Tail (or everything, without SIMD support). */
   rgba8_copy(d, s, bytes);

   return dst;
}
215 
216 /**
217  * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
218  * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
219  * The first and last ranges must be shorter than a "span" (the longest linear
220  * stretch within a tile) and the middle must equal a whole number of spans.
221  * Ranges may be empty.  The region copied must land entirely within one tile.
222  * 'dst' is the start of the tile and 'src' is the corresponding
223  * address to copy from, though copying begins at (x0, y0).
224  * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
225  * Swizzling flips bit 6 in the copy destination offset, when certain other
226  * bits are set in it.
227  */
228 typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
229                              uint32_t y0, uint32_t y1,
230                              char *dst, const char *src,
231                              int32_t linear_pitch,
232                              uint32_t swizzle_bit,
233                              mem_copy_fn mem_copy);
234 
235 /**
236  * Copy texture data from linear to X tile layout.
237  *
238  * \copydoc tile_copy_fn
239  *
240  * The mem_copy parameters allow the user to specify an alternative mem_copy
241  * function that, for instance, may do RGBA -> BGRA swizzling.  The first
242  * function must handle any memory alignment while the second function must
243  * only handle 16-byte alignment in whichever side (source or destination) is
244  * tiled.
245  */
246 static inline void
linear_to_xtiled(uint32_t x0,uint32_t x1,uint32_t x2,uint32_t x3,uint32_t y0,uint32_t y1,char * dst,const char * src,int32_t src_pitch,uint32_t swizzle_bit,mem_copy_fn mem_copy,mem_copy_fn mem_copy_align16)247 linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
248                  uint32_t y0, uint32_t y1,
249                  char *dst, const char *src,
250                  int32_t src_pitch,
251                  uint32_t swizzle_bit,
252                  mem_copy_fn mem_copy,
253                  mem_copy_fn mem_copy_align16)
254 {
255    /* The copy destination offset for each range copied is the sum of
256     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
257     */
258    uint32_t xo, yo;
259 
260    src += (ptrdiff_t)y0 * src_pitch;
261 
262    for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
263       /* Bits 9 and 10 of the copy destination offset control swizzling.
264        * Only 'yo' contributes to those bits in the total offset,
265        * so calculate 'swizzle' just once per row.
266        * Move bits 9 and 10 three and four places respectively down
267        * to bit 6 and xor them.
268        */
269       uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
270 
271       mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
272 
273       for (xo = x1; xo < x2; xo += xtile_span) {
274          mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
275       }
276 
277       mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
278 
279       src += src_pitch;
280    }
281 }
282 
283 /**
284  * Copy texture data from linear to Y tile layout.
285  *
286  * \copydoc tile_copy_fn
287  */
288 static inline void
linear_to_ytiled(uint32_t x0,uint32_t x1,uint32_t x2,uint32_t x3,uint32_t y0,uint32_t y1,char * dst,const char * src,int32_t src_pitch,uint32_t swizzle_bit,mem_copy_fn mem_copy,mem_copy_fn mem_copy_align16)289 linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
290                  uint32_t y0, uint32_t y1,
291                  char *dst, const char *src,
292                  int32_t src_pitch,
293                  uint32_t swizzle_bit,
294                  mem_copy_fn mem_copy,
295                  mem_copy_fn mem_copy_align16)
296 {
297    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
298     * as the tile).  Thus the destination offset for (x,y) is the sum of:
299     *   (x % column_width)                    // position within column
300     *   (x / column_width) * bytes_per_column // column number * bytes per column
301     *   y * column_width
302     *
303     * The copy destination offset for each range copied is the sum of
304     * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
305     */
306    const uint32_t column_width = ytile_span;
307    const uint32_t bytes_per_column = column_width * ytile_height;
308 
309    uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
310    uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
311 
312    /* Bit 9 of the destination offset control swizzling.
313     * Only the X offset contributes to bit 9 of the total offset,
314     * so swizzle can be calculated in advance for these X positions.
315     * Move bit 9 three places down to bit 6.
316     */
317    uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
318    uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
319 
320    uint32_t x, yo;
321 
322    src += (ptrdiff_t)y0 * src_pitch;
323 
324    for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
325       uint32_t xo = xo1;
326       uint32_t swizzle = swizzle1;
327 
328       mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
329 
330       /* Step by spans/columns.  As it happens, the swizzle bit flips
331        * at each step so we don't need to calculate it explicitly.
332        */
333       for (x = x1; x < x2; x += ytile_span) {
334          mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
335          xo += bytes_per_column;
336          swizzle ^= swizzle_bit;
337       }
338 
339       mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
340 
341       src += src_pitch;
342    }
343 }
344 
345 /**
346  * Copy texture data from X tile layout to linear.
347  *
348  * \copydoc tile_copy_fn
349  */
350 static inline void
xtiled_to_linear(uint32_t x0,uint32_t x1,uint32_t x2,uint32_t x3,uint32_t y0,uint32_t y1,char * dst,const char * src,int32_t dst_pitch,uint32_t swizzle_bit,mem_copy_fn mem_copy,mem_copy_fn mem_copy_align16)351 xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
352                  uint32_t y0, uint32_t y1,
353                  char *dst, const char *src,
354                  int32_t dst_pitch,
355                  uint32_t swizzle_bit,
356                  mem_copy_fn mem_copy,
357                  mem_copy_fn mem_copy_align16)
358 {
359    /* The copy destination offset for each range copied is the sum of
360     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
361     */
362    uint32_t xo, yo;
363 
364    dst += (ptrdiff_t)y0 * dst_pitch;
365 
366    for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
367       /* Bits 9 and 10 of the copy destination offset control swizzling.
368        * Only 'yo' contributes to those bits in the total offset,
369        * so calculate 'swizzle' just once per row.
370        * Move bits 9 and 10 three and four places respectively down
371        * to bit 6 and xor them.
372        */
373       uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
374 
375       mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
376 
377       for (xo = x1; xo < x2; xo += xtile_span) {
378          mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
379       }
380 
381       mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
382 
383       dst += dst_pitch;
384    }
385 }
386 
387  /**
388  * Copy texture data from Y tile layout to linear.
389  *
390  * \copydoc tile_copy_fn
391  */
392 static inline void
ytiled_to_linear(uint32_t x0,uint32_t x1,uint32_t x2,uint32_t x3,uint32_t y0,uint32_t y1,char * dst,const char * src,int32_t dst_pitch,uint32_t swizzle_bit,mem_copy_fn mem_copy,mem_copy_fn mem_copy_align16)393 ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
394                  uint32_t y0, uint32_t y1,
395                  char *dst, const char *src,
396                  int32_t dst_pitch,
397                  uint32_t swizzle_bit,
398                  mem_copy_fn mem_copy,
399                  mem_copy_fn mem_copy_align16)
400 {
401    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
402     * as the tile).  Thus the destination offset for (x,y) is the sum of:
403     *   (x % column_width)                    // position within column
404     *   (x / column_width) * bytes_per_column // column number * bytes per column
405     *   y * column_width
406     *
407     * The copy destination offset for each range copied is the sum of
408     * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
409     */
410    const uint32_t column_width = ytile_span;
411    const uint32_t bytes_per_column = column_width * ytile_height;
412 
413    uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
414    uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
415 
416    /* Bit 9 of the destination offset control swizzling.
417     * Only the X offset contributes to bit 9 of the total offset,
418     * so swizzle can be calculated in advance for these X positions.
419     * Move bit 9 three places down to bit 6.
420     */
421    uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
422    uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
423 
424    uint32_t x, yo;
425 
426    dst += (ptrdiff_t)y0 * dst_pitch;
427 
428    for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
429       uint32_t xo = xo1;
430       uint32_t swizzle = swizzle1;
431 
432       mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
433 
434       /* Step by spans/columns.  As it happens, the swizzle bit flips
435        * at each step so we don't need to calculate it explicitly.
436        */
437       for (x = x1; x < x2; x += ytile_span) {
438          mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
439          xo += bytes_per_column;
440          swizzle ^= swizzle_bit;
441       }
442 
443       mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
444 
445       dst += dst_pitch;
446    }
447 }
448 
449 
450 /**
451  * Copy texture data from linear to X tile layout, faster.
452  *
453  * Same as \ref linear_to_xtiled but faster, because it passes constant
454  * parameters for common cases, allowing the compiler to inline code
455  * optimized for those cases.
456  *
457  * \copydoc tile_copy_fn
458  */
459 static FLATTEN void
linear_to_xtiled_faster(uint32_t x0,uint32_t x1,uint32_t x2,uint32_t x3,uint32_t y0,uint32_t y1,char * dst,const char * src,int32_t src_pitch,uint32_t swizzle_bit,mem_copy_fn mem_copy)460 linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
461                         uint32_t y0, uint32_t y1,
462                         char *dst, const char *src,
463                         int32_t src_pitch,
464                         uint32_t swizzle_bit,
465                         mem_copy_fn mem_copy)
466 {
467    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
468       if (mem_copy == memcpy)
469          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
470                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
471       else if (mem_copy == rgba8_copy)
472          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
473                                  dst, src, src_pitch, swizzle_bit,
474                                  rgba8_copy, rgba8_copy_aligned_dst);
475       else
476          unreachable("not reached");
477    } else {
478       if (mem_copy == memcpy)
479          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
480                                  dst, src, src_pitch, swizzle_bit,
481                                  memcpy, memcpy);
482       else if (mem_copy == rgba8_copy)
483          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
484                                  dst, src, src_pitch, swizzle_bit,
485                                  rgba8_copy, rgba8_copy_aligned_dst);
486       else
487          unreachable("not reached");
488    }
489    linear_to_xtiled(x0, x1, x2, x3, y0, y1,
490                     dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
491 }
492 
493 /**
494  * Copy texture data from linear to Y tile layout, faster.
495  *
496  * Same as \ref linear_to_ytiled but faster, because it passes constant
497  * parameters for common cases, allowing the compiler to inline code
498  * optimized for those cases.
499  *
500  * \copydoc tile_copy_fn
501  */
502 static FLATTEN void
linear_to_ytiled_faster(uint32_t x0,uint32_t x1,uint32_t x2,uint32_t x3,uint32_t y0,uint32_t y1,char * dst,const char * src,int32_t src_pitch,uint32_t swizzle_bit,mem_copy_fn mem_copy)503 linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
504                         uint32_t y0, uint32_t y1,
505                         char *dst, const char *src,
506                         int32_t src_pitch,
507                         uint32_t swizzle_bit,
508                         mem_copy_fn mem_copy)
509 {
510    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
511       if (mem_copy == memcpy)
512          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
513                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
514       else if (mem_copy == rgba8_copy)
515          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
516                                  dst, src, src_pitch, swizzle_bit,
517                                  rgba8_copy, rgba8_copy_aligned_dst);
518       else
519          unreachable("not reached");
520    } else {
521       if (mem_copy == memcpy)
522          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
523                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
524       else if (mem_copy == rgba8_copy)
525          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
526                                  dst, src, src_pitch, swizzle_bit,
527                                  rgba8_copy, rgba8_copy_aligned_dst);
528       else
529          unreachable("not reached");
530    }
531    linear_to_ytiled(x0, x1, x2, x3, y0, y1,
532                     dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
533 }
534 
535 /**
536  * Copy texture data from X tile layout to linear, faster.
537  *
538  * Same as \ref xtile_to_linear but faster, because it passes constant
539  * parameters for common cases, allowing the compiler to inline code
540  * optimized for those cases.
541  *
542  * \copydoc tile_copy_fn
543  */
544 static FLATTEN void
xtiled_to_linear_faster(uint32_t x0,uint32_t x1,uint32_t x2,uint32_t x3,uint32_t y0,uint32_t y1,char * dst,const char * src,int32_t dst_pitch,uint32_t swizzle_bit,mem_copy_fn mem_copy)545 xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
546                         uint32_t y0, uint32_t y1,
547                         char *dst, const char *src,
548                         int32_t dst_pitch,
549                         uint32_t swizzle_bit,
550                         mem_copy_fn mem_copy)
551 {
552    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
553       if (mem_copy == memcpy)
554          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
555                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
556       else if (mem_copy == rgba8_copy)
557          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
558                                  dst, src, dst_pitch, swizzle_bit,
559                                  rgba8_copy, rgba8_copy_aligned_src);
560       else
561          unreachable("not reached");
562    } else {
563       if (mem_copy == memcpy)
564          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
565                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
566       else if (mem_copy == rgba8_copy)
567          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
568                                  dst, src, dst_pitch, swizzle_bit,
569                                  rgba8_copy, rgba8_copy_aligned_src);
570       else
571          unreachable("not reached");
572    }
573    xtiled_to_linear(x0, x1, x2, x3, y0, y1,
574                     dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
575 }
576 
577 /**
578  * Copy texture data from Y tile layout to linear, faster.
579  *
580  * Same as \ref ytile_to_linear but faster, because it passes constant
581  * parameters for common cases, allowing the compiler to inline code
582  * optimized for those cases.
583  *
584  * \copydoc tile_copy_fn
585  */
586 static FLATTEN void
ytiled_to_linear_faster(uint32_t x0,uint32_t x1,uint32_t x2,uint32_t x3,uint32_t y0,uint32_t y1,char * dst,const char * src,int32_t dst_pitch,uint32_t swizzle_bit,mem_copy_fn mem_copy)587 ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
588                         uint32_t y0, uint32_t y1,
589                         char *dst, const char *src,
590                         int32_t dst_pitch,
591                         uint32_t swizzle_bit,
592                         mem_copy_fn mem_copy)
593 {
594    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
595       if (mem_copy == memcpy)
596          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
597                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
598       else if (mem_copy == rgba8_copy)
599          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
600                                  dst, src, dst_pitch, swizzle_bit,
601                                  rgba8_copy, rgba8_copy_aligned_src);
602       else
603          unreachable("not reached");
604    } else {
605       if (mem_copy == memcpy)
606          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
607                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
608       else if (mem_copy == rgba8_copy)
609          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
610                                  dst, src, dst_pitch, swizzle_bit,
611                                  rgba8_copy, rgba8_copy_aligned_src);
612       else
613          unreachable("not reached");
614    }
615    ytiled_to_linear(x0, x1, x2, x3, y0, y1,
616                     dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
617 }
618 
619 /**
620  * Copy from linear to tiled texture.
621  *
622  * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
623  * pieces that do not cross tile boundaries and copy each piece with a tile
624  * copy function (\ref tile_copy_fn).
625  * The X range is in bytes, i.e. pixels * bytes-per-pixel.
626  * The Y range is in pixels (i.e. unitless).
627  * 'dst' is the start of the texture and 'src' is the corresponding
628  * address to copy from, though copying begins at (xt1, yt1).
629  */
630 void
linear_to_tiled(uint32_t xt1,uint32_t xt2,uint32_t yt1,uint32_t yt2,char * dst,const char * src,uint32_t dst_pitch,int32_t src_pitch,bool has_swizzling,enum isl_tiling tiling,mem_copy_fn mem_copy)631 linear_to_tiled(uint32_t xt1, uint32_t xt2,
632                 uint32_t yt1, uint32_t yt2,
633                 char *dst, const char *src,
634                 uint32_t dst_pitch, int32_t src_pitch,
635                 bool has_swizzling,
636                 enum isl_tiling tiling,
637                 mem_copy_fn mem_copy)
638 {
639    tile_copy_fn tile_copy;
640    uint32_t xt0, xt3;
641    uint32_t yt0, yt3;
642    uint32_t xt, yt;
643    uint32_t tw, th, span;
644    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
645 
646    if (tiling == ISL_TILING_X) {
647       tw = xtile_width;
648       th = xtile_height;
649       span = xtile_span;
650       tile_copy = linear_to_xtiled_faster;
651    } else if (tiling == ISL_TILING_Y0) {
652       tw = ytile_width;
653       th = ytile_height;
654       span = ytile_span;
655       tile_copy = linear_to_ytiled_faster;
656    } else {
657       unreachable("unsupported tiling");
658    }
659 
660    /* Round out to tile boundaries. */
661    xt0 = ALIGN_DOWN(xt1, tw);
662    xt3 = ALIGN_UP  (xt2, tw);
663    yt0 = ALIGN_DOWN(yt1, th);
664    yt3 = ALIGN_UP  (yt2, th);
665 
666    /* Loop over all tiles to which we have something to copy.
667     * 'xt' and 'yt' are the origin of the destination tile, whether copying
668     * copying a full or partial tile.
669     * tile_copy() copies one tile or partial tile.
670     * Looping x inside y is the faster memory access pattern.
671     */
672    for (yt = yt0; yt < yt3; yt += th) {
673       for (xt = xt0; xt < xt3; xt += tw) {
674          /* The area to update is [x0,x3) x [y0,y1).
675           * May not want the whole tile, hence the min and max.
676           */
677          uint32_t x0 = MAX2(xt1, xt);
678          uint32_t y0 = MAX2(yt1, yt);
679          uint32_t x3 = MIN2(xt2, xt + tw);
680          uint32_t y1 = MIN2(yt2, yt + th);
681 
682          /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
683           * the middle interval is the longest span-aligned part.
684           * The sub-ranges could be empty.
685           */
686          uint32_t x1, x2;
687          x1 = ALIGN_UP(x0, span);
688          if (x1 > x3)
689             x1 = x2 = x3;
690          else
691             x2 = ALIGN_DOWN(x3, span);
692 
693          assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
694          assert(x1 - x0 < span && x3 - x2 < span);
695          assert(x3 - x0 <= tw);
696          assert((x2 - x1) % span == 0);
697 
698          /* Translate by (xt,yt) for single-tile copier. */
699          tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
700                    y0-yt, y1-yt,
701                    dst + (ptrdiff_t) xt * th + (ptrdiff_t) yt * dst_pitch,
702                    src + (ptrdiff_t) xt      + (ptrdiff_t) yt * src_pitch,
703                    src_pitch,
704                    swizzle_bit,
705                    mem_copy);
706       }
707    }
708 }
709 
710 /**
711  * Copy from tiled to linear texture.
712  *
713  * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
714  * pieces that do not cross tile boundaries and copy each piece with a tile
715  * copy function (\ref tile_copy_fn).
716  * The X range is in bytes, i.e. pixels * bytes-per-pixel.
717  * The Y range is in pixels (i.e. unitless).
718  * 'dst' is the start of the texture and 'src' is the corresponding
719  * address to copy from, though copying begins at (xt1, yt1).
720  */
721 void
tiled_to_linear(uint32_t xt1,uint32_t xt2,uint32_t yt1,uint32_t yt2,char * dst,const char * src,int32_t dst_pitch,uint32_t src_pitch,bool has_swizzling,enum isl_tiling tiling,mem_copy_fn mem_copy)722 tiled_to_linear(uint32_t xt1, uint32_t xt2,
723                 uint32_t yt1, uint32_t yt2,
724                 char *dst, const char *src,
725                 int32_t dst_pitch, uint32_t src_pitch,
726                 bool has_swizzling,
727                 enum isl_tiling tiling,
728                 mem_copy_fn mem_copy)
729 {
730    tile_copy_fn tile_copy;
731    uint32_t xt0, xt3;
732    uint32_t yt0, yt3;
733    uint32_t xt, yt;
734    uint32_t tw, th, span;
735    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
736 
737    if (tiling == ISL_TILING_X) {
738       tw = xtile_width;
739       th = xtile_height;
740       span = xtile_span;
741       tile_copy = xtiled_to_linear_faster;
742    } else if (tiling == ISL_TILING_Y0) {
743       tw = ytile_width;
744       th = ytile_height;
745       span = ytile_span;
746       tile_copy = ytiled_to_linear_faster;
747    } else {
748       unreachable("unsupported tiling");
749    }
750 
751    /* Round out to tile boundaries. */
752    xt0 = ALIGN_DOWN(xt1, tw);
753    xt3 = ALIGN_UP  (xt2, tw);
754    yt0 = ALIGN_DOWN(yt1, th);
755    yt3 = ALIGN_UP  (yt2, th);
756 
757    /* Loop over all tiles to which we have something to copy.
758     * 'xt' and 'yt' are the origin of the destination tile, whether copying
759     * copying a full or partial tile.
760     * tile_copy() copies one tile or partial tile.
761     * Looping x inside y is the faster memory access pattern.
762     */
763    for (yt = yt0; yt < yt3; yt += th) {
764       for (xt = xt0; xt < xt3; xt += tw) {
765          /* The area to update is [x0,x3) x [y0,y1).
766           * May not want the whole tile, hence the min and max.
767           */
768          uint32_t x0 = MAX2(xt1, xt);
769          uint32_t y0 = MAX2(yt1, yt);
770          uint32_t x3 = MIN2(xt2, xt + tw);
771          uint32_t y1 = MIN2(yt2, yt + th);
772 
773          /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
774           * the middle interval is the longest span-aligned part.
775           * The sub-ranges could be empty.
776           */
777          uint32_t x1, x2;
778          x1 = ALIGN_UP(x0, span);
779          if (x1 > x3)
780             x1 = x2 = x3;
781          else
782             x2 = ALIGN_DOWN(x3, span);
783 
784          assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
785          assert(x1 - x0 < span && x3 - x2 < span);
786          assert(x3 - x0 <= tw);
787          assert((x2 - x1) % span == 0);
788 
789          /* Translate by (xt,yt) for single-tile copier. */
790          tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
791                    y0-yt, y1-yt,
792                    dst + (ptrdiff_t) xt      + (ptrdiff_t) yt * dst_pitch,
793                    src + (ptrdiff_t) xt * th + (ptrdiff_t) yt * src_pitch,
794                    dst_pitch,
795                    swizzle_bit,
796                    mem_copy);
797       }
798    }
799 }
800 
801 
802 /**
803  * Determine which copy function to use for the given format combination
804  *
805  * The only two possible copy functions which are ever returned are a
806  * direct memcpy and a RGBA <-> BGRA copy function.  Since RGBA -> BGRA and
807  * BGRA -> RGBA are exactly the same operation (and memcpy is obviously
808  * symmetric), it doesn't matter whether the copy is from the tiled image
809  * to the untiled or vice versa.  The copy function required is the same in
810  * either case so this function can be used.
811  *
812  * \param[in]  tiledFormat The format of the tiled image
813  * \param[in]  format      The GL format of the client data
814  * \param[in]  type        The GL type of the client data
815  * \param[out] mem_copy    Will be set to one of either the standard
816  *                         library's memcpy or a different copy function
817  *                         that performs an RGBA to BGRA conversion
818  * \param[out] cpp         Number of bytes per channel
819  *
820  * \return true if the format and type combination are valid
821  */
intel_get_memcpy(mesa_format tiledFormat,GLenum format,GLenum type,mem_copy_fn * mem_copy,uint32_t * cpp)822 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
823                       GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp)
824 {
825    if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
826        !(format == GL_RGBA || format == GL_BGRA))
827       return false; /* Invalid type/format combination */
828 
829    if ((tiledFormat == MESA_FORMAT_L_UNORM8 && format == GL_LUMINANCE) ||
830        (tiledFormat == MESA_FORMAT_A_UNORM8 && format == GL_ALPHA)) {
831       *cpp = 1;
832       *mem_copy = memcpy;
833    } else if ((tiledFormat == MESA_FORMAT_B8G8R8A8_UNORM) ||
834               (tiledFormat == MESA_FORMAT_B8G8R8X8_UNORM) ||
835               (tiledFormat == MESA_FORMAT_B8G8R8A8_SRGB) ||
836               (tiledFormat == MESA_FORMAT_B8G8R8X8_SRGB)) {
837       *cpp = 4;
838       if (format == GL_BGRA) {
839          *mem_copy = memcpy;
840       } else if (format == GL_RGBA) {
841          *mem_copy = rgba8_copy;
842       }
843    } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
844               (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
845               (tiledFormat == MESA_FORMAT_R8G8B8A8_SRGB) ||
846               (tiledFormat == MESA_FORMAT_R8G8B8X8_SRGB)) {
847       *cpp = 4;
848       if (format == GL_BGRA) {
849          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
850           * use the same function.
851           */
852          *mem_copy = rgba8_copy;
853       } else if (format == GL_RGBA) {
854          *mem_copy = memcpy;
855       }
856    }
857 
858    if (!(*mem_copy))
859       return false;
860 
861    return true;
862 }
863