1 /*
2  * Copyright 2014 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkTextureCompressor_LATC.h"
9 #include "SkTextureCompressor_Blitter.h"
10 #include "SkTextureCompressor_Utils.h"
11 
12 #include "SkBlitter.h"
13 #include "SkEndian.h"
14 
15 // Compression options. In general, the slow version is much more accurate, but
16 // much slower. The fast option is much faster, but much less accurate. YMMV.
17 #define COMPRESS_LATC_SLOW 0
18 #define COMPRESS_LATC_FAST 1
19 
20 ////////////////////////////////////////////////////////////////////////////////
21 
22 // Generates an LATC palette. LATC constructs
23 // a palette of eight colors from LUM0 and LUM1 using the algorithm:
24 //
25 // LUM0,              if lum0 > lum1 and code(x,y) == 0
26 // LUM1,              if lum0 > lum1 and code(x,y) == 1
27 // (6*LUM0+  LUM1)/7, if lum0 > lum1 and code(x,y) == 2
28 // (5*LUM0+2*LUM1)/7, if lum0 > lum1 and code(x,y) == 3
29 // (4*LUM0+3*LUM1)/7, if lum0 > lum1 and code(x,y) == 4
30 // (3*LUM0+4*LUM1)/7, if lum0 > lum1 and code(x,y) == 5
31 // (2*LUM0+5*LUM1)/7, if lum0 > lum1 and code(x,y) == 6
32 // (  LUM0+6*LUM1)/7, if lum0 > lum1 and code(x,y) == 7
33 //
34 // LUM0,              if lum0 <= lum1 and code(x,y) == 0
35 // LUM1,              if lum0 <= lum1 and code(x,y) == 1
36 // (4*LUM0+  LUM1)/5, if lum0 <= lum1 and code(x,y) == 2
37 // (3*LUM0+2*LUM1)/5, if lum0 <= lum1 and code(x,y) == 3
38 // (2*LUM0+3*LUM1)/5, if lum0 <= lum1 and code(x,y) == 4
39 // (  LUM0+4*LUM1)/5, if lum0 <= lum1 and code(x,y) == 5
40 // 0,                 if lum0 <= lum1 and code(x,y) == 6
41 // 255,               if lum0 <= lum1 and code(x,y) == 7
42 
43 static const int kLATCPaletteSize = 8;
generate_latc_palette(uint8_t palette[],uint8_t lum0,uint8_t lum1)44 static void generate_latc_palette(uint8_t palette[], uint8_t lum0, uint8_t lum1) {
45     palette[0] = lum0;
46     palette[1] = lum1;
47     if (lum0 > lum1) {
48         for (int i = 1; i < 7; i++) {
49             palette[i+1] = ((7-i)*lum0 + i*lum1) / 7;
50         }
51     } else {
52         for (int i = 1; i < 5; i++) {
53             palette[i+1] = ((5-i)*lum0 + i*lum1) / 5;
54         }
55         palette[6] = 0;
56         palette[7] = 255;
57     }
58 }
59 
60 ////////////////////////////////////////////////////////////////////////////////
61 
62 #if COMPRESS_LATC_SLOW
63 
64 ////////////////////////////////////////////////////////////////////////////////
65 //
66 // Utility Functions
67 //
68 ////////////////////////////////////////////////////////////////////////////////
69 
70 // Absolute difference between two values. More correct than SkTAbs(a - b)
71 // because it works on unsigned values.
abs_diff(const T & a,const T & b)72 template <typename T> inline T abs_diff(const T &a, const T &b) {
73     return (a > b) ? (a - b) : (b - a);
74 }
75 
is_extremal(uint8_t pixel)76 static bool is_extremal(uint8_t pixel) {
77     return 0 == pixel || 255 == pixel;
78 }
79 
80 typedef uint64_t (*A84x4To64BitProc)(const uint8_t block[]);
81 
82 // This function is used by both R11 EAC and LATC to compress 4x4 blocks
83 // of 8-bit alpha into 64-bit values that comprise the compressed data.
84 // For both formats, we need to make sure that the dimensions of the
85 // src pixels are divisible by 4, and copy 4x4 blocks one at a time
86 // for compression.
compress_4x4_a8_to_64bit(uint8_t * dst,const uint8_t * src,int width,int height,size_t rowBytes,A84x4To64BitProc proc)87 static bool compress_4x4_a8_to_64bit(uint8_t* dst, const uint8_t* src,
88                                      int width, int height, size_t rowBytes,
89                                      A84x4To64BitProc proc) {
90     // Make sure that our data is well-formed enough to be considered for compression
91     if (0 == width || 0 == height || (width % 4) != 0 || (height % 4) != 0) {
92         return false;
93     }
94 
95     int blocksX = width >> 2;
96     int blocksY = height >> 2;
97 
98     uint8_t block[16];
99     uint64_t* encPtr = reinterpret_cast<uint64_t*>(dst);
100     for (int y = 0; y < blocksY; ++y) {
101         for (int x = 0; x < blocksX; ++x) {
102             // Load block
103             for (int k = 0; k < 4; ++k) {
104                 memcpy(block + k*4, src + k*rowBytes + 4*x, 4);
105             }
106 
107             // Compress it
108             *encPtr = proc(block);
109             ++encPtr;
110         }
111         src += 4 * rowBytes;
112     }
113 
114     return true;
115 }
116 
117 ////////////////////////////////////////////////////////////////////////////////
118 //
119 // LATC compressor
120 //
121 ////////////////////////////////////////////////////////////////////////////////
122 
123 // LATC compressed texels down into square 4x4 blocks
124 static const int kLATCBlockSize = 4;
125 static const int kLATCPixelsPerBlock = kLATCBlockSize * kLATCBlockSize;
126 
127 // Compress a block by using the bounding box of the pixels. It is assumed that
128 // there are no extremal pixels in this block otherwise we would have used
129 // compressBlockBBIgnoreExtremal.
compress_latc_block_bb(const uint8_t pixels[])130 static uint64_t compress_latc_block_bb(const uint8_t pixels[]) {
131     uint8_t minVal = 255;
132     uint8_t maxVal = 0;
133     for (int i = 0; i < kLATCPixelsPerBlock; ++i) {
134         minVal = SkTMin(pixels[i], minVal);
135         maxVal = SkTMax(pixels[i], maxVal);
136     }
137 
138     SkASSERT(!is_extremal(minVal));
139     SkASSERT(!is_extremal(maxVal));
140 
141     uint8_t palette[kLATCPaletteSize];
142     generate_latc_palette(palette, maxVal, minVal);
143 
144     uint64_t indices = 0;
145     for (int i = kLATCPixelsPerBlock - 1; i >= 0; --i) {
146 
147         // Find the best palette index
148         uint8_t bestError = abs_diff(pixels[i], palette[0]);
149         uint8_t idx = 0;
150         for (int j = 1; j < kLATCPaletteSize; ++j) {
151             uint8_t error = abs_diff(pixels[i], palette[j]);
152             if (error < bestError) {
153                 bestError = error;
154                 idx = j;
155             }
156         }
157 
158         indices <<= 3;
159         indices |= idx;
160     }
161 
162     return
163         SkEndian_SwapLE64(
164             static_cast<uint64_t>(maxVal) |
165             (static_cast<uint64_t>(minVal) << 8) |
166             (indices << 16));
167 }
168 
169 // Compress a block by using the bounding box of the pixels without taking into
170 // account the extremal values. The generated palette will contain extremal values
171 // and fewer points along the line segment to interpolate.
compress_latc_block_bb_ignore_extremal(const uint8_t pixels[])172 static uint64_t compress_latc_block_bb_ignore_extremal(const uint8_t pixels[]) {
173     uint8_t minVal = 255;
174     uint8_t maxVal = 0;
175     for (int i = 0; i < kLATCPixelsPerBlock; ++i) {
176         if (is_extremal(pixels[i])) {
177             continue;
178         }
179 
180         minVal = SkTMin(pixels[i], minVal);
181         maxVal = SkTMax(pixels[i], maxVal);
182     }
183 
184     SkASSERT(!is_extremal(minVal));
185     SkASSERT(!is_extremal(maxVal));
186 
187     uint8_t palette[kLATCPaletteSize];
188     generate_latc_palette(palette, minVal, maxVal);
189 
190     uint64_t indices = 0;
191     for (int i = kLATCPixelsPerBlock - 1; i >= 0; --i) {
192 
193         // Find the best palette index
194         uint8_t idx = 0;
195         if (is_extremal(pixels[i])) {
196             if (0xFF == pixels[i]) {
197                 idx = 7;
198             } else if (0 == pixels[i]) {
199                 idx = 6;
200             } else {
201                 SkFAIL("Pixel is extremal but not really?!");
202             }
203         } else {
204             uint8_t bestError = abs_diff(pixels[i], palette[0]);
205             for (int j = 1; j < kLATCPaletteSize - 2; ++j) {
206                 uint8_t error = abs_diff(pixels[i], palette[j]);
207                 if (error < bestError) {
208                     bestError = error;
209                     idx = j;
210                 }
211             }
212         }
213 
214         indices <<= 3;
215         indices |= idx;
216     }
217 
218     return
219         SkEndian_SwapLE64(
220             static_cast<uint64_t>(minVal) |
221             (static_cast<uint64_t>(maxVal) << 8) |
222             (indices << 16));
223 }
224 
225 
226 // Compress LATC block. Each 4x4 block of pixels is decompressed by LATC from two
227 // values LUM0 and LUM1, and an index into the generated palette. Details of how
228 // the palette is generated can be found in the comments of generatePalette above.
229 //
230 // We choose which palette type to use based on whether or not 'pixels' contains
231 // any extremal values (0 or 255). If there are extremal values, then we use the
232 // palette that has the extremal values built in. Otherwise, we use the full bounding
233 // box.
234 
compress_latc_block(const uint8_t pixels[])235 static uint64_t compress_latc_block(const uint8_t pixels[]) {
236     // Collect unique pixels
237     int nUniquePixels = 0;
238     uint8_t uniquePixels[kLATCPixelsPerBlock];
239     for (int i = 0; i < kLATCPixelsPerBlock; ++i) {
240         bool foundPixel = false;
241         for (int j = 0; j < nUniquePixels; ++j) {
242             foundPixel = foundPixel || uniquePixels[j] == pixels[i];
243         }
244 
245         if (!foundPixel) {
246             uniquePixels[nUniquePixels] = pixels[i];
247             ++nUniquePixels;
248         }
249     }
250 
251     // If there's only one unique pixel, then our compression is easy.
252     if (1 == nUniquePixels) {
253         return SkEndian_SwapLE64(pixels[0] | (pixels[0] << 8));
254 
255     // Similarly, if there are only two unique pixels, then our compression is
256     // easy again: place the pixels in the block header, and assign the indices
257     // with one or zero depending on which pixel they belong to.
258     } else if (2 == nUniquePixels) {
259         uint64_t outBlock = 0;
260         for (int i = kLATCPixelsPerBlock - 1; i >= 0; --i) {
261             int idx = 0;
262             if (pixels[i] == uniquePixels[1]) {
263                 idx = 1;
264             }
265 
266             outBlock <<= 3;
267             outBlock |= idx;
268         }
269         outBlock <<= 16;
270         outBlock |= (uniquePixels[0] | (uniquePixels[1] << 8));
271         return SkEndian_SwapLE64(outBlock);
272     }
273 
274     // Count non-maximal pixel values
275     int nonExtremalPixels = 0;
276     for (int i = 0; i < nUniquePixels; ++i) {
277         if (!is_extremal(uniquePixels[i])) {
278             ++nonExtremalPixels;
279         }
280     }
281 
282     // If all the pixels are nonmaximal then compute the palette using
283     // the bounding box of all the pixels.
284     if (nonExtremalPixels == nUniquePixels) {
285         // This is really just for correctness, in all of my tests we
286         // never take this step. We don't lose too much perf here because
287         // most of the processing in this function is worth it for the
288         // 1 == nUniquePixels optimization.
289         return compress_latc_block_bb(pixels);
290     } else {
291         return compress_latc_block_bb_ignore_extremal(pixels);
292     }
293 }
294 
295 #endif  // COMPRESS_LATC_SLOW
296 
297 ////////////////////////////////////////////////////////////////////////////////
298 
299 #if COMPRESS_LATC_FAST
300 
301 // Take the top three bits of each index and pack them into the low 12
302 // bits of the integer.
pack_index(uint32_t x)303 static inline uint32_t pack_index(uint32_t x) {
304     // Pack it in...
305 #if defined (SK_CPU_BENDIAN)
306     return
307         (x >> 24) |
308         ((x >> 13) & 0x38) |
309         ((x >> 2) & 0x1C0) |
310         ((x << 9) & 0xE00);
311 #else
312     return
313         (x & 0x7) |
314         ((x >> 5) & 0x38) |
315         ((x >> 10) & 0x1C0) |
316         ((x >> 15) & 0xE00);
317 #endif
318 }
319 
320 // Converts each 8-bit byte in the integer into an LATC index, and then packs
321 // the indices into the low 12 bits of the integer.
convert_index(uint32_t x)322 static inline uint32_t convert_index(uint32_t x) {
323     // Since the palette is
324     // 255, 0, 219, 182, 146, 109, 73, 36
325     // we need to map the high three bits of each byte in the integer
326     // from
327     // 0 1 2 3 4 5 6 7
328     // to
329     // 1 7 6 5 4 3 2 0
330     //
331     // This first operation takes the mapping from
332     // 0 1 2 3 4 5 6 7  -->  7 6 5 4 3 2 1 0
333     x = 0x07070707 - SkTextureCompressor::ConvertToThreeBitIndex(x);
334 
335     // mask is 1 if index is non-zero
336     const uint32_t mask = (x | (x >> 1) | (x >> 2)) & 0x01010101;
337 
338     // add mask:
339     // 7 6 5 4 3 2 1 0 --> 8 7 6 5 4 3 2 0
340     x = (x + mask);
341 
342     // Handle overflow:
343     // 8 7 6 5 4 3 2 0 --> 9 7 6 5 4 3 2 0
344     x |= (x >> 3) & 0x01010101;
345 
346     // Mask out high bits:
347     // 9 7 6 5 4 3 2 0 --> 1 7 6 5 4 3 2 0
348     x &= 0x07070707;
349 
350     return pack_index(x);
351 }
352 
353 typedef uint64_t (*PackIndicesProc)(const uint8_t* alpha, size_t rowBytes);
354 template<PackIndicesProc packIndicesProc>
compress_a8_latc_block(uint8_t ** dstPtr,const uint8_t * src,size_t rowBytes)355 static void compress_a8_latc_block(uint8_t** dstPtr, const uint8_t* src, size_t rowBytes) {
356     *(reinterpret_cast<uint64_t*>(*dstPtr)) =
357         SkEndian_SwapLE64(0xFF | (packIndicesProc(src, rowBytes) << 16));
358     *dstPtr += 8;
359 }
360 
PackRowMajor(const uint8_t * indices,size_t rowBytes)361 inline uint64_t PackRowMajor(const uint8_t *indices, size_t rowBytes) {
362     uint64_t result = 0;
363     for (int i = 0; i < 4; ++i) {
364         const uint32_t idx = *(reinterpret_cast<const uint32_t*>(indices + i*rowBytes));
365         result |= static_cast<uint64_t>(convert_index(idx)) << 12*i;
366     }
367     return result;
368 }
369 
PackColumnMajor(const uint8_t * indices,size_t rowBytes)370 inline uint64_t PackColumnMajor(const uint8_t *indices, size_t rowBytes) {
371     // !SPEED! Blarg, this is kind of annoying. SSE4 can make this
372     // a LOT faster.
373     uint8_t transposed[16];
374     for (int i = 0; i < 4; ++i) {
375         for (int j = 0; j < 4; ++j) {
376             transposed[j*4+i] = indices[i*rowBytes + j];
377         }
378     }
379 
380     return PackRowMajor(transposed, 4);
381 }
382 
compress_4x4_a8_latc(uint8_t * dst,const uint8_t * src,int width,int height,size_t rowBytes)383 static bool compress_4x4_a8_latc(uint8_t* dst, const uint8_t* src,
384                                  int width, int height, size_t rowBytes) {
385 
386     if (width < 0 || ((width % 4) != 0) || height < 0 || ((height % 4) != 0)) {
387         return false;
388     }
389 
390     uint8_t** dstPtr = &dst;
391     for (int y = 0; y < height; y += 4) {
392         for (int x = 0; x < width; x += 4) {
393             compress_a8_latc_block<PackRowMajor>(dstPtr, src + y*rowBytes + x, rowBytes);
394         }
395     }
396 
397     return true;
398 }
399 
CompressA8LATCBlockVertical(uint8_t * dst,const uint8_t block[])400 void CompressA8LATCBlockVertical(uint8_t* dst, const uint8_t block[]) {
401     compress_a8_latc_block<PackColumnMajor>(&dst, block, 4);
402 }
403 
404 #endif  // COMPRESS_LATC_FAST
405 
decompress_latc_block(uint8_t * dst,int dstRowBytes,const uint8_t * src)406 void decompress_latc_block(uint8_t* dst, int dstRowBytes, const uint8_t* src) {
407     uint64_t block = SkEndian_SwapLE64(*(reinterpret_cast<const uint64_t *>(src)));
408     uint8_t lum0 = block & 0xFF;
409     uint8_t lum1 = (block >> 8) & 0xFF;
410 
411     uint8_t palette[kLATCPaletteSize];
412     generate_latc_palette(palette, lum0, lum1);
413 
414     block >>= 16;
415     for (int j = 0; j < 4; ++j) {
416         for (int i = 0; i < 4; ++i) {
417             dst[i] = palette[block & 0x7];
418             block >>= 3;
419         }
420         dst += dstRowBytes;
421     }
422 }
423 
424 // This is the type passed as the CompressorType argument of the compressed
425 // blitter for the LATC format. The static functions required to be in this
426 // struct are documented in SkTextureCompressor_Blitter.h
427 struct CompressorLATC {
CompressA8VerticalCompressorLATC428     static inline void CompressA8Vertical(uint8_t* dst, const uint8_t block[]) {
429         compress_a8_latc_block<PackColumnMajor>(&dst, block, 4);
430     }
431 
CompressA8HorizontalCompressorLATC432     static inline void CompressA8Horizontal(uint8_t* dst, const uint8_t* src,
433                                             int srcRowBytes) {
434         compress_a8_latc_block<PackRowMajor>(&dst, src, srcRowBytes);
435     }
436 
437 #if PEDANTIC_BLIT_RECT
UpdateBlockCompressorLATC438     static inline void UpdateBlock(uint8_t* dst, const uint8_t* src, int srcRowBytes,
439                                    const uint8_t* mask) {
440         // Pack the mask
441         uint64_t cmpMask = 0;
442         for (int i = 0; i < 4; ++i) {
443             const uint32_t idx = *(reinterpret_cast<const uint32_t*>(src + i*srcRowBytes));
444             cmpMask |= static_cast<uint64_t>(pack_index(idx)) << 12*i;
445         }
446         cmpMask = SkEndian_SwapLE64(cmpMask << 16); // avoid header
447 
448         uint64_t cmpSrc;
449         uint8_t *cmpSrcPtr = reinterpret_cast<uint8_t*>(&cmpSrc);
450         compress_a8_latc_block<PackRowMajor>(&cmpSrcPtr, src, srcRowBytes);
451 
452         // Mask out header
453         cmpSrc = cmpSrc & cmpMask;
454 
455         // Read destination encoding
456         uint64_t *cmpDst = reinterpret_cast<uint64_t*>(dst);
457 
458         // If the destination is the encoding for a blank block, then we need
459         // to properly set the header
460         if (0 == cmpDst) {
461             *cmpDst = SkTEndian_SwapLE64(0x24924924924900FFULL);
462         }
463 
464         // Set the new indices
465         *cmpDst &= ~cmpMask;
466         *cmpDst |= cmpSrc;
467     }
468 #endif  // PEDANTIC_BLIT_RECT
469 };
470 
471 ////////////////////////////////////////////////////////////////////////////////
472 
473 namespace SkTextureCompressor {
474 
CompressA8ToLATC(uint8_t * dst,const uint8_t * src,int width,int height,size_t rowBytes)475 bool CompressA8ToLATC(uint8_t* dst, const uint8_t* src, int width, int height, size_t rowBytes) {
476 #if COMPRESS_LATC_FAST
477     return compress_4x4_a8_latc(dst, src, width, height, rowBytes);
478 #elif COMPRESS_LATC_SLOW
479     return compress_4x4_a8_to_64bit(dst, src, width, height, rowBytes, compress_latc_block);
480 #else
481 #error "Must choose either fast or slow LATC compression"
482 #endif
483 }
484 
CreateLATCBlitter(int width,int height,void * outputBuffer,SkTBlitterAllocator * allocator)485 SkBlitter* CreateLATCBlitter(int width, int height, void* outputBuffer,
486                              SkTBlitterAllocator* allocator) {
487     if ((width % 4) != 0 || (height % 4) != 0) {
488         return nullptr;
489     }
490 
491 #if COMPRESS_LATC_FAST
492     // Memset the output buffer to an encoding that decodes to zero. We must do this
493     // in order to avoid having uninitialized values in the buffer if the blitter
494     // decides not to write certain scanlines (and skip entire rows of blocks).
495     // In the case of LATC, if everything is zero, then LUM0 and LUM1 are also zero,
496     // and they will only be non-zero (0xFF) if the index is 7. So bzero will do just fine.
497     // (8 bytes per block) * (w * h / 16 blocks) = w * h / 2
498     sk_bzero(outputBuffer, width * height / 2);
499 
500     return allocator->createT<
501         SkTCompressedAlphaBlitter<4, 8, CompressorLATC>, int, int, void* >
502         (width, height, outputBuffer);
503 #elif COMPRESS_LATC_SLOW
504     // TODO (krajcevski)
505     return nullptr;
506 #endif
507 }
508 
DecompressLATC(uint8_t * dst,int dstRowBytes,const uint8_t * src,int width,int height)509 void DecompressLATC(uint8_t* dst, int dstRowBytes, const uint8_t* src, int width, int height) {
510     for (int j = 0; j < height; j += 4) {
511         for (int i = 0; i < width; i += 4) {
512             decompress_latc_block(dst + i, dstRowBytes, src);
513             src += 8;
514         }
515         dst += 4 * dstRowBytes;
516     }
517 }
518 
519 }  // SkTextureCompressor
520