1 /**************************************************************************
2  *
3  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
4  * Copyright (c) 2008 VMware, Inc.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included
14  * in all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  **************************************************************************/
25 
#include <string.h>

#include "util/format/u_format.h"
#include "util/format/u_format_fxt1.h"
#include "util/format_srgb.h"
#include "util/u_math.h"

#include "u_format_pack.h"
32 
/* Indices of the colour components inside a texel array. */
#define RCOMP   0
#define GCOMP   1
#define BCOMP   2
#define ACOMP   3

/* NOTE(review): presumably the size in bytes of one encoded block (the
 * quantizers below fill four 32-bit words) -- confirm against the users. */
#define FXT1_BLOCK_SIZE   16
39 
/*
 * Forward declarations of the encoder and single-texel decoder entry
 * points.  Their definitions are not part of this section of the file.
 */
static void
fxt1_encode (uint32_t width,
             uint32_t height,
             int32_t comps,
             const void *source,
             int32_t srcRowStride,
             void *dest,
             int32_t destRowStride);

static void
fxt1_decode_1 (const void *texture,
               int32_t stride,
               int32_t i,
               int32_t j,
               uint8_t *rgba);
48 
49 /***************************************************************************\
50  * FXT1 encoder
51  *
52  * The encoder was built by reversing the decoder,
53  * and is vaguely based on Texus2 by 3dfx. Note that this code
54  * is merely a proof of concept, since it is highly UNoptimized;
55  * moreover, it is sub-optimal due to initial conditions passed
56  * to Lloyd's algorithm (the interpolation modes are even worse).
57 \***************************************************************************/
58 
59 
/* Array bounds used throughout the encoder. */
#define MAX_COMP 4   /* most components a texel can have */
#define MAX_VECT 4   /* most base vectors any mode searches for */
#define N_TEXELS 32  /* texels per block (always 32) */

/* Tunables for Lloyd's vector quantizer. */
#define LL_N_REP 50  /* iteration cap */
#define LL_RMS_D 10  /* fault tolerance (maximum delta) */
#define LL_RMS_E 255 /* fault tolerance (maximum error) */

#define ALPHA_TS 2   /* alpha threshold: (255 - ALPHA_TS) deemed opaque */

/* Reference pattern for ISTBLACK: four zero bytes. */
static const uint32_t zero = 0;

/* Non-zero when the first sizeof(zero) bytes of texel v are all zero. */
#define ISTBLACK(v) (!memcmp(&(v), &zero, sizeof zero))
69 
/*
 * Define a 64-bit unsigned integer type and the three operations the
 * quantizers need to assemble the high quadword of a block:
 *
 *   FX64_MOV32(a, b)  load the 32-bit value b into a (upper half zero)
 *   FX64_OR32(a, b)   OR the 32-bit value b into the low half of a
 *   FX64_SHL(a, c)    shift a left by c bits, 0 <= c < 64
 *
 * All macro arguments are parenthesized so that expression arguments
 * expand safely.
 */
#if 1

#define FX64_NATIVE 1

typedef uint64_t Fx64;

#define FX64_MOV32(a, b) ((a) = (b))
#define FX64_OR32(a, b)  ((a) |= (b))
#define FX64_SHL(a, c)   ((a) <<= (c))

#else

#define FX64_NATIVE 0

typedef struct {
   uint32_t lo, hi;
} Fx64;

/* Clear .hi as well, so later shifts never read an indeterminate value. */
#define FX64_MOV32(a, b)                               \
   do {                                                \
      (a).lo = (b);                                    \
      (a).hi = 0;                                      \
   } while (0)

#define FX64_OR32(a, b)  ((a).lo |= (b))

/* A shift count of 0 is a no-op; without the guard the else branch would
 * evaluate lo >> 32, which is undefined for a 32-bit operand.
 */
#define FX64_SHL(a, c)                                              \
   do {                                                             \
      if ((c) >= 32) {                                              \
         (a).hi = (a).lo << ((c) - 32);                             \
         (a).lo = 0;                                                \
      } else if ((c) > 0) {                                         \
         (a).hi = ((a).hi << (c)) | ((a).lo >> (32 - (c)));         \
         (a).lo <<= (c);                                            \
      }                                                             \
   } while (0)

#endif
106 
107 
#define F(i) (float)1 /* can be used to obtain an oblong metric: 0.30 / 0.59 / 0.11 */
#define SAFECDOT 1 /* for paranoids */

/*
 * MAKEIVEC: build the projection that maps a colour onto the segment
 * V0..V1 divided into NV steps.
 *
 *   NV      highest index on the segment (V0 maps to 0, V1 maps to NV)
 *   NC      number of components to use
 *   IV      out: float[NC] interpolation vector
 *   B       out: float bias, already including the +0.5 rounding term
 *   V0, V1  segment endpoints (arrays with at least NC elements)
 *
 * Uses a loop variable named `i` that must be declared by the caller.
 *
 * If V0 and V1 coincide the squared length d2 is zero; the scale is then
 * forced to 0 instead of dividing by zero, so IV becomes all zeros and B
 * becomes 0.5, which makes CALCCDOT return index 0 for every texel rather
 * than converting a NaN to an integer.
 */
#define MAKEIVEC(NV, NC, IV, B, V0, V1)                      \
   do {                                                      \
      /* compute interpolation vector */                     \
      float d2 = 0.0F;                                       \
      float rd2;                                             \
                                                             \
      for (i = 0; i < (NC); i++) {                           \
         (IV)[i] = ((V1)[i] - (V0)[i]) * F(i);               \
         d2 += (IV)[i] * (IV)[i];                            \
      }                                                      \
      rd2 = (d2 > 0.0F) ? (float)(NV) / d2 : 0.0F;           \
      (B) = 0;                                               \
      for (i = 0; i < (NC); i++) {                           \
         (IV)[i] *= F(i);                                    \
         (B) -= (IV)[i] * (V0)[i];                           \
         (IV)[i] *= rd2;                                     \
      }                                                      \
      (B) = (B) * rd2 + 0.5f;                                \
   } while (0)

/*
 * CALCCDOT: project colour V with the IV / B pair produced by MAKEIVEC and
 * store the resulting index in TEXEL.  With SAFECDOT enabled the index is
 * clamped to [0, NV].
 *
 * Uses a loop variable named `i` that must be declared by the caller.
 */
#define CALCCDOT(TEXEL, NV, NC, IV, B, V)                    \
   do {                                                      \
      float dot = 0.0F;                                      \
      for (i = 0; i < (NC); i++) {                           \
         dot += (V)[i] * (IV)[i];                            \
      }                                                      \
      (TEXEL) = (int32_t)(dot + (B));                        \
      if (SAFECDOT) {                                        \
         if ((TEXEL) < 0) {                                  \
            (TEXEL) = 0;                                     \
         } else if ((TEXEL) > (NV)) {                        \
            (TEXEL) = (NV);                                  \
         }                                                   \
      }                                                      \
   } while (0)
146 
147 
148 static int32_t
fxt1_bestcol(float vec[][MAX_COMP],int32_t nv,uint8_t input[MAX_COMP],int32_t nc)149 fxt1_bestcol (float vec[][MAX_COMP], int32_t nv,
150               uint8_t input[MAX_COMP], int32_t nc)
151 {
152    int32_t i, j, best = -1;
153    float err = 1e9; /* big enough */
154 
155    for (j = 0; j < nv; j++) {
156       float e = 0.0F;
157       for (i = 0; i < nc; i++) {
158          e += (vec[j][i] - input[i]) * (vec[j][i] - input[i]);
159       }
160       if (e < err) {
161          err = e;
162          best = j;
163       }
164    }
165 
166    return best;
167 }
168 
169 
170 static int32_t
fxt1_worst(float vec[MAX_COMP],uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)171 fxt1_worst (float vec[MAX_COMP],
172             uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
173 {
174    int32_t i, k, worst = -1;
175    float err = -1.0F; /* small enough */
176 
177    for (k = 0; k < n; k++) {
178       float e = 0.0F;
179       for (i = 0; i < nc; i++) {
180          e += (vec[i] - input[k][i]) * (vec[i] - input[k][i]);
181       }
182       if (e > err) {
183          err = e;
184          worst = k;
185       }
186    }
187 
188    return worst;
189 }
190 
191 
192 static int32_t
fxt1_variance(uint8_t input[N_TEXELS/2][MAX_COMP],int32_t nc)193 fxt1_variance (uint8_t input[N_TEXELS / 2][MAX_COMP], int32_t nc)
194 {
195    const int n = N_TEXELS / 2;
196    int32_t i, k, best = 0;
197    int32_t sx, sx2;
198    double var, maxvar = -1; /* small enough */
199    double teenth = 1.0 / n;
200 
201    for (i = 0; i < nc; i++) {
202       sx = sx2 = 0;
203       for (k = 0; k < n; k++) {
204          int32_t t = input[k][i];
205          sx += t;
206          sx2 += t * t;
207       }
208       var = sx2 * teenth - sx * sx * teenth * teenth;
209       if (maxvar < var) {
210          maxvar = var;
211          best = i;
212       }
213    }
214 
215    return best;
216 }
217 
218 
219 static int32_t
fxt1_choose(float vec[][MAX_COMP],int32_t nv,uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)220 fxt1_choose (float vec[][MAX_COMP], int32_t nv,
221              uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
222 {
223 #if 0
224    /* Choose colors from a grid.
225     */
226    int32_t i, j;
227 
228    for (j = 0; j < nv; j++) {
229       int32_t m = j * (n - 1) / (nv - 1);
230       for (i = 0; i < nc; i++) {
231          vec[j][i] = input[m][i];
232       }
233    }
234 #else
235    /* Our solution here is to find the darkest and brightest colors in
236     * the 8x4 tile and use those as the two representative colors.
237     * There are probably better algorithms to use (histogram-based).
238     */
239    int32_t i, j, k;
240    int32_t minSum = 2000; /* big enough */
241    int32_t maxSum = -1; /* small enough */
242    int32_t minCol = 0; /* phoudoin: silent compiler! */
243    int32_t maxCol = 0; /* phoudoin: silent compiler! */
244 
245    struct {
246       int32_t flag;
247       int32_t key;
248       int32_t freq;
249       int32_t idx;
250    } hist[N_TEXELS];
251    int32_t lenh = 0;
252 
253    memset(hist, 0, sizeof(hist));
254 
255    for (k = 0; k < n; k++) {
256       int32_t l;
257       int32_t key = 0;
258       int32_t sum = 0;
259       for (i = 0; i < nc; i++) {
260          key <<= 8;
261          key |= input[k][i];
262          sum += input[k][i];
263       }
264       for (l = 0; l < n; l++) {
265          if (!hist[l].flag) {
266             /* alloc new slot */
267             hist[l].flag = !0;
268             hist[l].key = key;
269             hist[l].freq = 1;
270             hist[l].idx = k;
271             lenh = l + 1;
272             break;
273          } else if (hist[l].key == key) {
274             hist[l].freq++;
275             break;
276          }
277       }
278       if (minSum > sum) {
279          minSum = sum;
280          minCol = k;
281       }
282       if (maxSum < sum) {
283          maxSum = sum;
284          maxCol = k;
285       }
286    }
287 
288    if (lenh <= nv) {
289       for (j = 0; j < lenh; j++) {
290          for (i = 0; i < nc; i++) {
291             vec[j][i] = (float)input[hist[j].idx][i];
292          }
293       }
294       for (; j < nv; j++) {
295          for (i = 0; i < nc; i++) {
296             vec[j][i] = vec[0][i];
297          }
298       }
299       return 0;
300    }
301 
302    for (j = 0; j < nv; j++) {
303       for (i = 0; i < nc; i++) {
304          vec[j][i] = ((nv - 1 - j) * input[minCol][i] + j * input[maxCol][i] + (nv - 1) / 2) / (float)(nv - 1);
305       }
306    }
307 #endif
308 
309    return !0;
310 }
311 
312 
313 static int32_t
fxt1_lloyd(float vec[][MAX_COMP],int32_t nv,uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)314 fxt1_lloyd (float vec[][MAX_COMP], int32_t nv,
315             uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
316 {
317    /* Use the generalized lloyd's algorithm for VQ:
318     *     find 4 color vectors.
319     *
320     *     for each sample color
321     *         sort to nearest vector.
322     *
323     *     replace each vector with the centroid of its matching colors.
324     *
325     *     repeat until RMS doesn't improve.
326     *
327     *     if a color vector has no samples, or becomes the same as another
328     *     vector, replace it with the color which is farthest from a sample.
329     *
330     * vec[][MAX_COMP]           initial vectors and resulting colors
331     * nv                        number of resulting colors required
332     * input[N_TEXELS][MAX_COMP] input texels
333     * nc                        number of components in input / vec
334     * n                         number of input samples
335     */
336 
337    int32_t sum[MAX_VECT][MAX_COMP]; /* used to accumulate closest texels */
338    int32_t cnt[MAX_VECT]; /* how many times a certain vector was chosen */
339    float error, lasterror = 1e9;
340 
341    int32_t i, j, k, rep;
342 
343    /* the quantizer */
344    for (rep = 0; rep < LL_N_REP; rep++) {
345       /* reset sums & counters */
346       for (j = 0; j < nv; j++) {
347          for (i = 0; i < nc; i++) {
348             sum[j][i] = 0;
349          }
350          cnt[j] = 0;
351       }
352       error = 0;
353 
354       /* scan whole block */
355       for (k = 0; k < n; k++) {
356 #if 1
357          int32_t best = -1;
358          float err = 1e9; /* big enough */
359          /* determine best vector */
360          for (j = 0; j < nv; j++) {
361             float e = (vec[j][0] - input[k][0]) * (vec[j][0] - input[k][0]) +
362                       (vec[j][1] - input[k][1]) * (vec[j][1] - input[k][1]) +
363                       (vec[j][2] - input[k][2]) * (vec[j][2] - input[k][2]);
364             if (nc == 4) {
365                e += (vec[j][3] - input[k][3]) * (vec[j][3] - input[k][3]);
366             }
367             if (e < err) {
368                err = e;
369                best = j;
370             }
371          }
372 #else
373          int32_t best = fxt1_bestcol(vec, nv, input[k], nc, &err);
374 #endif
375          assert(best >= 0);
376          /* add in closest color */
377          for (i = 0; i < nc; i++) {
378             sum[best][i] += input[k][i];
379          }
380          /* mark this vector as used */
381          cnt[best]++;
382          /* accumulate error */
383          error += err;
384       }
385 
386       /* check RMS */
387       if ((error < LL_RMS_E) ||
388           ((error < lasterror) && ((lasterror - error) < LL_RMS_D))) {
389          return !0; /* good match */
390       }
391       lasterror = error;
392 
393       /* move each vector to the barycenter of its closest colors */
394       for (j = 0; j < nv; j++) {
395          if (cnt[j]) {
396             float div = 1.0F / cnt[j];
397             for (i = 0; i < nc; i++) {
398                vec[j][i] = div * sum[j][i];
399             }
400          } else {
401             /* this vec has no samples or is identical with a previous vec */
402             int32_t worst = fxt1_worst(vec[j], input, nc, n);
403             for (i = 0; i < nc; i++) {
404                vec[j][i] = input[worst][i];
405             }
406          }
407       }
408    }
409 
410    return 0; /* could not converge fast enough */
411 }
412 
413 
414 static void
fxt1_quantize_CHROMA(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])415 fxt1_quantize_CHROMA (uint32_t *cc,
416                       uint8_t input[N_TEXELS][MAX_COMP])
417 {
418    const int32_t n_vect = 4; /* 4 base vectors to find */
419    const int32_t n_comp = 3; /* 3 components: R, G, B */
420    float vec[MAX_VECT][MAX_COMP];
421    int32_t i, j, k;
422    Fx64 hi; /* high quadword */
423    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
424 
425    if (fxt1_choose(vec, n_vect, input, n_comp, N_TEXELS) != 0) {
426       fxt1_lloyd(vec, n_vect, input, n_comp, N_TEXELS);
427    }
428 
429    FX64_MOV32(hi, 4); /* cc-chroma = "010" + unused bit */
430    for (j = n_vect - 1; j >= 0; j--) {
431       for (i = 0; i < n_comp; i++) {
432          /* add in colors */
433          FX64_SHL(hi, 5);
434          FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
435       }
436    }
437    ((Fx64 *)cc)[1] = hi;
438 
439    lohi = lolo = 0;
440    /* right microtile */
441    for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
442       lohi <<= 2;
443       lohi |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
444    }
445    /* left microtile */
446    for (; k >= 0; k--) {
447       lolo <<= 2;
448       lolo |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
449    }
450    cc[1] = lohi;
451    cc[0] = lolo;
452 }
453 
454 
455 static void
fxt1_quantize_ALPHA0(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP],uint8_t reord[N_TEXELS][MAX_COMP],int32_t n)456 fxt1_quantize_ALPHA0 (uint32_t *cc,
457                       uint8_t input[N_TEXELS][MAX_COMP],
458                       uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
459 {
460    const int32_t n_vect = 3; /* 3 base vectors to find */
461    const int32_t n_comp = 4; /* 4 components: R, G, B, A */
462    float vec[MAX_VECT][MAX_COMP];
463    int32_t i, j, k;
464    Fx64 hi; /* high quadword */
465    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
466 
467    /* the last vector indicates zero */
468    for (i = 0; i < n_comp; i++) {
469       vec[n_vect][i] = 0;
470    }
471 
472    /* the first n texels in reord are guaranteed to be non-zero */
473    if (fxt1_choose(vec, n_vect, reord, n_comp, n) != 0) {
474       fxt1_lloyd(vec, n_vect, reord, n_comp, n);
475    }
476 
477    FX64_MOV32(hi, 6); /* alpha = "011" + lerp = 0 */
478    for (j = n_vect - 1; j >= 0; j--) {
479       /* add in alphas */
480       FX64_SHL(hi, 5);
481       FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
482    }
483    for (j = n_vect - 1; j >= 0; j--) {
484       for (i = 0; i < n_comp - 1; i++) {
485          /* add in colors */
486          FX64_SHL(hi, 5);
487          FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
488       }
489    }
490    ((Fx64 *)cc)[1] = hi;
491 
492    lohi = lolo = 0;
493    /* right microtile */
494    for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
495       lohi <<= 2;
496       lohi |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
497    }
498    /* left microtile */
499    for (; k >= 0; k--) {
500       lolo <<= 2;
501       lolo |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
502    }
503    cc[1] = lohi;
504    cc[0] = lolo;
505 }
506 
507 
508 static void
fxt1_quantize_ALPHA1(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])509 fxt1_quantize_ALPHA1 (uint32_t *cc,
510                       uint8_t input[N_TEXELS][MAX_COMP])
511 {
512    const int32_t n_vect = 3; /* highest vector number in each microtile */
513    const int32_t n_comp = 4; /* 4 components: R, G, B, A */
514    float vec[1 + 1 + 1][MAX_COMP]; /* 1.5 extrema for each sub-block */
515    float b, iv[MAX_COMP]; /* interpolation vector */
516    int32_t i, j, k;
517    Fx64 hi; /* high quadword */
518    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
519 
520    int32_t minSum;
521    int32_t maxSum;
522    int32_t minColL = 0, maxColL = 0;
523    int32_t minColR = 0, maxColR = 0;
524    int32_t sumL = 0, sumR = 0;
525    int32_t nn_comp;
526    /* Our solution here is to find the darkest and brightest colors in
527     * the 4x4 tile and use those as the two representative colors.
528     * There are probably better algorithms to use (histogram-based).
529     */
530    nn_comp = n_comp;
531    while ((minColL == maxColL) && nn_comp) {
532        minSum = 2000; /* big enough */
533        maxSum = -1; /* small enough */
534        for (k = 0; k < N_TEXELS / 2; k++) {
535            int32_t sum = 0;
536            for (i = 0; i < nn_comp; i++) {
537                sum += input[k][i];
538            }
539            if (minSum > sum) {
540                minSum = sum;
541                minColL = k;
542            }
543            if (maxSum < sum) {
544                maxSum = sum;
545                maxColL = k;
546            }
547            sumL += sum;
548        }
549 
550        nn_comp--;
551    }
552 
553    nn_comp = n_comp;
554    while ((minColR == maxColR) && nn_comp) {
555        minSum = 2000; /* big enough */
556        maxSum = -1; /* small enough */
557        for (k = N_TEXELS / 2; k < N_TEXELS; k++) {
558            int32_t sum = 0;
559            for (i = 0; i < nn_comp; i++) {
560                sum += input[k][i];
561            }
562            if (minSum > sum) {
563                minSum = sum;
564                minColR = k;
565            }
566            if (maxSum < sum) {
567                maxSum = sum;
568                maxColR = k;
569            }
570            sumR += sum;
571        }
572 
573        nn_comp--;
574    }
575 
576    /* choose the common vector (yuck!) */
577    {
578       int32_t j1, j2;
579       int32_t v1 = 0, v2 = 0;
580       float err = 1e9; /* big enough */
581       float tv[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
582       for (i = 0; i < n_comp; i++) {
583          tv[0][i] = input[minColL][i];
584          tv[1][i] = input[maxColL][i];
585          tv[2][i] = input[minColR][i];
586          tv[3][i] = input[maxColR][i];
587       }
588       for (j1 = 0; j1 < 2; j1++) {
589          for (j2 = 2; j2 < 4; j2++) {
590             float e = 0.0F;
591             for (i = 0; i < n_comp; i++) {
592                e += (tv[j1][i] - tv[j2][i]) * (tv[j1][i] - tv[j2][i]);
593             }
594             if (e < err) {
595                err = e;
596                v1 = j1;
597                v2 = j2;
598             }
599          }
600       }
601       for (i = 0; i < n_comp; i++) {
602          vec[0][i] = tv[1 - v1][i];
603          vec[1][i] = (tv[v1][i] * sumL + tv[v2][i] * sumR) / (sumL + sumR);
604          vec[2][i] = tv[5 - v2][i];
605       }
606    }
607 
608    /* left microtile */
609    cc[0] = 0;
610    if (minColL != maxColL) {
611       /* compute interpolation vector */
612       MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
613 
614       /* add in texels */
615       lolo = 0;
616       for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
617          int32_t texel;
618          /* interpolate color */
619          CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
620          /* add in texel */
621          lolo <<= 2;
622          lolo |= texel;
623       }
624 
625       cc[0] = lolo;
626    }
627 
628    /* right microtile */
629    cc[1] = 0;
630    if (minColR != maxColR) {
631       /* compute interpolation vector */
632       MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[1]);
633 
634       /* add in texels */
635       lohi = 0;
636       for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
637          int32_t texel;
638          /* interpolate color */
639          CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
640          /* add in texel */
641          lohi <<= 2;
642          lohi |= texel;
643       }
644 
645       cc[1] = lohi;
646    }
647 
648    FX64_MOV32(hi, 7); /* alpha = "011" + lerp = 1 */
649    for (j = n_vect - 1; j >= 0; j--) {
650       /* add in alphas */
651       FX64_SHL(hi, 5);
652       FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
653    }
654    for (j = n_vect - 1; j >= 0; j--) {
655       for (i = 0; i < n_comp - 1; i++) {
656          /* add in colors */
657          FX64_SHL(hi, 5);
658          FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
659       }
660    }
661    ((Fx64 *)cc)[1] = hi;
662 }
663 
664 
665 static void
fxt1_quantize_HI(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP],uint8_t reord[N_TEXELS][MAX_COMP],int32_t n)666 fxt1_quantize_HI (uint32_t *cc,
667                   uint8_t input[N_TEXELS][MAX_COMP],
668                   uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
669 {
670    const int32_t n_vect = 6; /* highest vector number */
671    const int32_t n_comp = 3; /* 3 components: R, G, B */
672    float b = 0.0F;       /* phoudoin: silent compiler! */
673    float iv[MAX_COMP];   /* interpolation vector */
674    int32_t i, k;
675    uint32_t hihi; /* high quadword: hi dword */
676 
677    int32_t minSum = 2000; /* big enough */
678    int32_t maxSum = -1; /* small enough */
679    int32_t minCol = 0; /* phoudoin: silent compiler! */
680    int32_t maxCol = 0; /* phoudoin: silent compiler! */
681 
682    /* Our solution here is to find the darkest and brightest colors in
683     * the 8x4 tile and use those as the two representative colors.
684     * There are probably better algorithms to use (histogram-based).
685     */
686    for (k = 0; k < n; k++) {
687       int32_t sum = 0;
688       for (i = 0; i < n_comp; i++) {
689          sum += reord[k][i];
690       }
691       if (minSum > sum) {
692          minSum = sum;
693          minCol = k;
694       }
695       if (maxSum < sum) {
696          maxSum = sum;
697          maxCol = k;
698       }
699    }
700 
701    hihi = 0; /* cc-hi = "00" */
702    for (i = 0; i < n_comp; i++) {
703       /* add in colors */
704       hihi <<= 5;
705       hihi |= reord[maxCol][i] >> 3;
706    }
707    for (i = 0; i < n_comp; i++) {
708       /* add in colors */
709       hihi <<= 5;
710       hihi |= reord[minCol][i] >> 3;
711    }
712    cc[3] = hihi;
713    cc[0] = cc[1] = cc[2] = 0;
714 
715    /* compute interpolation vector */
716    if (minCol != maxCol) {
717       MAKEIVEC(n_vect, n_comp, iv, b, reord[minCol], reord[maxCol]);
718    }
719 
720    /* add in texels */
721    for (k = N_TEXELS - 1; k >= 0; k--) {
722       int32_t t = k * 3;
723       uint32_t *kk = (uint32_t *)((char *)cc + t / 8);
724       int32_t texel = n_vect + 1; /* transparent black */
725 
726       if (!ISTBLACK(input[k])) {
727          if (minCol != maxCol) {
728             /* interpolate color */
729             CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
730             /* add in texel */
731             kk[0] |= texel << (t & 7);
732          }
733       } else {
734          /* add in texel */
735          kk[0] |= texel << (t & 7);
736       }
737    }
738 }
739 
740 
741 static void
fxt1_quantize_MIXED1(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])742 fxt1_quantize_MIXED1 (uint32_t *cc,
743                       uint8_t input[N_TEXELS][MAX_COMP])
744 {
745    const int32_t n_vect = 2; /* highest vector number in each microtile */
746    const int32_t n_comp = 3; /* 3 components: R, G, B */
747    uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
748    float b, iv[MAX_COMP]; /* interpolation vector */
749    int32_t i, j, k;
750    Fx64 hi; /* high quadword */
751    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
752 
753    int32_t minSum;
754    int32_t maxSum;
755    int32_t minColL = 0, maxColL = -1;
756    int32_t minColR = 0, maxColR = -1;
757 
758    /* Our solution here is to find the darkest and brightest colors in
759     * the 4x4 tile and use those as the two representative colors.
760     * There are probably better algorithms to use (histogram-based).
761     */
762    minSum = 2000; /* big enough */
763    maxSum = -1; /* small enough */
764    for (k = 0; k < N_TEXELS / 2; k++) {
765       if (!ISTBLACK(input[k])) {
766          int32_t sum = 0;
767          for (i = 0; i < n_comp; i++) {
768             sum += input[k][i];
769          }
770          if (minSum > sum) {
771             minSum = sum;
772             minColL = k;
773          }
774          if (maxSum < sum) {
775             maxSum = sum;
776             maxColL = k;
777          }
778       }
779    }
780    minSum = 2000; /* big enough */
781    maxSum = -1; /* small enough */
782    for (; k < N_TEXELS; k++) {
783       if (!ISTBLACK(input[k])) {
784          int32_t sum = 0;
785          for (i = 0; i < n_comp; i++) {
786             sum += input[k][i];
787          }
788          if (minSum > sum) {
789             minSum = sum;
790             minColR = k;
791          }
792          if (maxSum < sum) {
793             maxSum = sum;
794             maxColR = k;
795          }
796       }
797    }
798 
799    /* left microtile */
800    if (maxColL == -1) {
801       /* all transparent black */
802       cc[0] = ~0u;
803       for (i = 0; i < n_comp; i++) {
804          vec[0][i] = 0;
805          vec[1][i] = 0;
806       }
807    } else {
808       cc[0] = 0;
809       for (i = 0; i < n_comp; i++) {
810          vec[0][i] = input[minColL][i];
811          vec[1][i] = input[maxColL][i];
812       }
813       if (minColL != maxColL) {
814          /* compute interpolation vector */
815          MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
816 
817          /* add in texels */
818          lolo = 0;
819          for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
820             int32_t texel = n_vect + 1; /* transparent black */
821             if (!ISTBLACK(input[k])) {
822                /* interpolate color */
823                CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
824             }
825             /* add in texel */
826             lolo <<= 2;
827             lolo |= texel;
828          }
829          cc[0] = lolo;
830       }
831    }
832 
833    /* right microtile */
834    if (maxColR == -1) {
835       /* all transparent black */
836       cc[1] = ~0u;
837       for (i = 0; i < n_comp; i++) {
838          vec[2][i] = 0;
839          vec[3][i] = 0;
840       }
841    } else {
842       cc[1] = 0;
843       for (i = 0; i < n_comp; i++) {
844          vec[2][i] = input[minColR][i];
845          vec[3][i] = input[maxColR][i];
846       }
847       if (minColR != maxColR) {
848          /* compute interpolation vector */
849          MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
850 
851          /* add in texels */
852          lohi = 0;
853          for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
854             int32_t texel = n_vect + 1; /* transparent black */
855             if (!ISTBLACK(input[k])) {
856                /* interpolate color */
857                CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
858             }
859             /* add in texel */
860             lohi <<= 2;
861             lohi |= texel;
862          }
863          cc[1] = lohi;
864       }
865    }
866 
867    FX64_MOV32(hi, 9 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
868    for (j = 2 * 2 - 1; j >= 0; j--) {
869       for (i = 0; i < n_comp; i++) {
870          /* add in colors */
871          FX64_SHL(hi, 5);
872          FX64_OR32(hi, vec[j][i] >> 3);
873       }
874    }
875    ((Fx64 *)cc)[1] = hi;
876 }
877 
878 
879 static void
fxt1_quantize_MIXED0(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])880 fxt1_quantize_MIXED0 (uint32_t *cc,
881                       uint8_t input[N_TEXELS][MAX_COMP])
882 {
883    const int32_t n_vect = 3; /* highest vector number in each microtile */
884    const int32_t n_comp = 3; /* 3 components: R, G, B */
885    uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
886    float b, iv[MAX_COMP]; /* interpolation vector */
887    int32_t i, j, k;
888    Fx64 hi; /* high quadword */
889    uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
890 
891    int32_t minColL = 0, maxColL = 0;
892    int32_t minColR = 0, maxColR = 0;
893 #if 0
894    int32_t minSum;
895    int32_t maxSum;
896 
897    /* Our solution here is to find the darkest and brightest colors in
898     * the 4x4 tile and use those as the two representative colors.
899     * There are probably better algorithms to use (histogram-based).
900     */
901    minSum = 2000; /* big enough */
902    maxSum = -1; /* small enough */
903    for (k = 0; k < N_TEXELS / 2; k++) {
904       int32_t sum = 0;
905       for (i = 0; i < n_comp; i++) {
906          sum += input[k][i];
907       }
908       if (minSum > sum) {
909          minSum = sum;
910          minColL = k;
911       }
912       if (maxSum < sum) {
913          maxSum = sum;
914          maxColL = k;
915       }
916    }
917    minSum = 2000; /* big enough */
918    maxSum = -1; /* small enough */
919    for (; k < N_TEXELS; k++) {
920       int32_t sum = 0;
921       for (i = 0; i < n_comp; i++) {
922          sum += input[k][i];
923       }
924       if (minSum > sum) {
925          minSum = sum;
926          minColR = k;
927       }
928       if (maxSum < sum) {
929          maxSum = sum;
930          maxColR = k;
931       }
932    }
933 #else
934    int32_t minVal;
935    int32_t maxVal;
936    int32_t maxVarL = fxt1_variance(input, n_comp);
937    int32_t maxVarR = fxt1_variance(&input[N_TEXELS / 2], n_comp);
938 
939    /* Scan the channel with max variance for lo & hi
940     * and use those as the two representative colors.
941     */
942    minVal = 2000; /* big enough */
943    maxVal = -1; /* small enough */
944    for (k = 0; k < N_TEXELS / 2; k++) {
945       int32_t t = input[k][maxVarL];
946       if (minVal > t) {
947          minVal = t;
948          minColL = k;
949       }
950       if (maxVal < t) {
951          maxVal = t;
952          maxColL = k;
953       }
954    }
955    minVal = 2000; /* big enough */
956    maxVal = -1; /* small enough */
957    for (; k < N_TEXELS; k++) {
958       int32_t t = input[k][maxVarR];
959       if (minVal > t) {
960          minVal = t;
961          minColR = k;
962       }
963       if (maxVal < t) {
964          maxVal = t;
965          maxColR = k;
966       }
967    }
968 #endif
969 
970    /* left microtile */
971    cc[0] = 0;
972    for (i = 0; i < n_comp; i++) {
973       vec[0][i] = input[minColL][i];
974       vec[1][i] = input[maxColL][i];
975    }
976    if (minColL != maxColL) {
977       /* compute interpolation vector */
978       MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
979 
980       /* add in texels */
981       lolo = 0;
982       for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
983          int32_t texel;
984          /* interpolate color */
985          CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
986          /* add in texel */
987          lolo <<= 2;
988          lolo |= texel;
989       }
990 
991       /* funky encoding for LSB of green */
992       if ((int32_t)((lolo >> 1) & 1) != (((vec[1][GCOMP] ^ vec[0][GCOMP]) >> 2) & 1)) {
993          for (i = 0; i < n_comp; i++) {
994             vec[1][i] = input[minColL][i];
995             vec[0][i] = input[maxColL][i];
996          }
997          lolo = ~lolo;
998       }
999 
1000       cc[0] = lolo;
1001    }
1002 
1003    /* right microtile */
1004    cc[1] = 0;
1005    for (i = 0; i < n_comp; i++) {
1006       vec[2][i] = input[minColR][i];
1007       vec[3][i] = input[maxColR][i];
1008    }
1009    if (minColR != maxColR) {
1010       /* compute interpolation vector */
1011       MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
1012 
1013       /* add in texels */
1014       lohi = 0;
1015       for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
1016          int32_t texel;
1017          /* interpolate color */
1018          CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
1019          /* add in texel */
1020          lohi <<= 2;
1021          lohi |= texel;
1022       }
1023 
1024       /* funky encoding for LSB of green */
1025       if ((int32_t)((lohi >> 1) & 1) != (((vec[3][GCOMP] ^ vec[2][GCOMP]) >> 2) & 1)) {
1026          for (i = 0; i < n_comp; i++) {
1027             vec[3][i] = input[minColR][i];
1028             vec[2][i] = input[maxColR][i];
1029          }
1030          lohi = ~lohi;
1031       }
1032 
1033       cc[1] = lohi;
1034    }
1035 
1036    FX64_MOV32(hi, 8 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
1037    for (j = 2 * 2 - 1; j >= 0; j--) {
1038       for (i = 0; i < n_comp; i++) {
1039          /* add in colors */
1040          FX64_SHL(hi, 5);
1041          FX64_OR32(hi, vec[j][i] >> 3);
1042       }
1043    }
1044    ((Fx64 *)cc)[1] = hi;
1045 }
1046 
1047 
1048 static void
fxt1_quantize(uint32_t * cc,const uint8_t * lines[],int32_t comps)1049 fxt1_quantize (uint32_t *cc, const uint8_t *lines[], int32_t comps)
1050 {
1051    int32_t trualpha;
1052    uint8_t reord[N_TEXELS][MAX_COMP];
1053 
1054    uint8_t input[N_TEXELS][MAX_COMP];
1055    int32_t i, k, l;
1056 
1057    if (comps == 3) {
1058       /* make the whole block opaque */
1059       memset(input, -1, sizeof(input));
1060    }
1061 
1062    /* 8 texels each line */
1063    for (l = 0; l < 4; l++) {
1064       for (k = 0; k < 4; k++) {
1065          for (i = 0; i < comps; i++) {
1066             input[k + l * 4][i] = *lines[l]++;
1067          }
1068       }
1069       for (; k < 8; k++) {
1070          for (i = 0; i < comps; i++) {
1071             input[k + l * 4 + 12][i] = *lines[l]++;
1072          }
1073       }
1074    }
1075 
1076    /* block layout:
1077     * 00, 01, 02, 03, 08, 09, 0a, 0b
1078     * 10, 11, 12, 13, 18, 19, 1a, 1b
1079     * 04, 05, 06, 07, 0c, 0d, 0e, 0f
1080     * 14, 15, 16, 17, 1c, 1d, 1e, 1f
1081     */
1082 
1083    /* [dBorca]
1084     * stupidity flows forth from this
1085     */
1086    l = N_TEXELS;
1087    trualpha = 0;
1088    if (comps == 4) {
1089       /* skip all transparent black texels */
1090       l = 0;
1091       for (k = 0; k < N_TEXELS; k++) {
1092          /* test all components against 0 */
1093          if (!ISTBLACK(input[k])) {
1094             /* texel is not transparent black */
1095             memcpy(reord[l], input[k], 4);
1096             if (reord[l][ACOMP] < (255 - ALPHA_TS)) {
1097                /* non-opaque texel */
1098                trualpha = !0;
1099             }
1100             l++;
1101          }
1102       }
1103    }
1104 
1105 #if 0
1106    if (trualpha) {
1107       fxt1_quantize_ALPHA0(cc, input, reord, l);
1108    } else if (l == 0) {
1109       cc[0] = cc[1] = cc[2] = -1;
1110       cc[3] = 0;
1111    } else if (l < N_TEXELS) {
1112       fxt1_quantize_HI(cc, input, reord, l);
1113    } else {
1114       fxt1_quantize_CHROMA(cc, input);
1115    }
1116    (void)fxt1_quantize_ALPHA1;
1117    (void)fxt1_quantize_MIXED1;
1118    (void)fxt1_quantize_MIXED0;
1119 #else
1120    if (trualpha) {
1121       fxt1_quantize_ALPHA1(cc, input);
1122    } else if (l == 0) {
1123       cc[0] = cc[1] = cc[2] = ~0u;
1124       cc[3] = 0;
1125    } else if (l < N_TEXELS) {
1126       fxt1_quantize_MIXED1(cc, input);
1127    } else {
1128       fxt1_quantize_MIXED0(cc, input);
1129    }
1130    (void)fxt1_quantize_ALPHA0;
1131    (void)fxt1_quantize_HI;
1132    (void)fxt1_quantize_CHROMA;
1133 #endif
1134 }
1135 
1136 
1137 
/**
 * Enlarge an image by tiling (wrapping) the source rather than stretching it.
 *
 * Output texel (x, y) is a copy of source texel (x % inWidth, y % inHeight).
 * The destination is written tightly packed: outWidth * comps bytes per row.
 *
 * \param inWidth, inHeight    source size in texels
 * \param outWidth, outHeight  destination size in texels, each >= the source's
 * \param comps                bytes per texel
 * \param src                  source texels
 * \param srcRowStride         bytes between the starts of two source rows
 * \param dest                 destination buffer
 */
static void
upscale_teximage2d(int32_t inWidth, int32_t inHeight,
                   int32_t outWidth, int32_t outHeight,
                   int32_t comps, const uint8_t *src, int32_t srcRowStride,
                   uint8_t *dest )
{
   uint8_t *out = dest;
   int32_t row, col, c;

   assert(outWidth >= inWidth);
   assert(outHeight >= inHeight);
#if 0
   assert(inWidth == 1 || inWidth == 2 || inHeight == 1 || inHeight == 2);
   assert((outWidth & 3) == 0);
   assert((outHeight & 3) == 0);
#endif

   for (row = 0; row < outHeight; row++) {
      const int32_t srcRow = row % inHeight;
      for (col = 0; col < outWidth; col++) {
         const int32_t srcCol = col % inWidth;
         const uint8_t *texel = &src[srcRow * srcRowStride + srcCol * comps];
         /* the destination is dense, so it is filled strictly in order */
         for (c = 0; c < comps; c++) {
            *out++ = texel[c];
         }
      }
   }
}
1170 
1171 
/**
 * Encode an image into FXT1 blocks, one 128-bit block per 8x4 texels.
 *
 * An image whose width is not a multiple of 8 or whose height is not a
 * multiple of 4 is first tiled up to the next block multiple in a temporary
 * buffer.  If that buffer cannot be allocated nothing is written.
 *
 * \param width, height   image size in texels
 * \param comps           bytes per source texel, 3 or 4
 * \param source          source texels
 * \param srcRowStride    bytes between source rows
 * \param dest            output blocks
 * \param destRowStride   bytes between rows of output blocks
 */
static void
fxt1_encode (uint32_t width, uint32_t height, int32_t comps,
             const void *source, int32_t srcRowStride,
             void *dest, int32_t destRowStride)
{
   uint32_t *out = (uint32_t *)dest;
   void *padded = NULL;
   const uint8_t *texels;
   uint32_t bx, by;
   int32_t rowGap;
   int r;

   assert(comps == 3 || comps == 4);

   if ((width & 7) != 0 || (height & 3) != 0) {
      /* not a whole number of blocks: tile the image up to one */
      int32_t paddedWidth = (width + 7) & ~7;
      int32_t paddedHeight = (height + 3) & ~3;

      padded = malloc(comps * paddedWidth * paddedHeight * sizeof(uint8_t));
      if (padded == NULL) {
         return;
      }
      upscale_teximage2d(width, height, paddedWidth, paddedHeight,
                         comps, (const uint8_t *) source,
                         srcRowStride, (uint8_t *) padded);
      source = padded;
      width = paddedWidth;
      height = paddedHeight;
      srcRowStride = comps * paddedWidth;
   }

   texels = (const uint8_t *) source;

   /* A row of blocks occupies width * 2 bytes; whatever remains of the
    * destination stride is skipped after each row, counted in dwords.
    */
   rowGap = (destRowStride - width * 2) / 4;

   for (by = 0; by < height; by += 4) {
      uint32_t offs = by * srcRowStride;

      for (bx = 0; bx < width; bx += 8) {
         const uint8_t *lines[4];

         /* the four texel rows feeding this block */
         lines[0] = &texels[offs];
         for (r = 1; r < 4; r++) {
            lines[r] = lines[r - 1] + srcRowStride;
         }
         offs += 8 * comps;

         fxt1_quantize(out, lines, comps);
         out += 4;   /* 128 bits per 8x4 block */
      }
      out += rowGap;
   }

   free(padded);
}
1220 
1221 
1222 /***************************************************************************\
1223  * FXT1 decoder
1224  *
1225  * The decoder is based on GL_3DFX_texture_compression_FXT1
1226  * specification and serves as a concept for the encoder.
1227 \***************************************************************************/
1228 
1229 
/* 5-bit -> 8-bit channel expansion.
 * Entry i holds i * 255 / 31 rounded to the nearest integer.
 */
static const uint8_t _rgb_scale_5[32] = {
     0,   8,  16,  25,
    33,  41,  49,  58,
    66,  74,  82,  90,
    99, 107, 115, 123,
   132, 140, 148, 156,
   165, 173, 181, 189,
   197, 206, 214, 222,
   230, 239, 247, 255
};
1237 
/* 6-bit -> 8-bit channel expansion.
 * Entry i holds i * 255 / 63 rounded to the nearest integer.
 */
static const uint8_t _rgb_scale_6[64] = {
     0,   4,   8,  12,
    16,  20,  24,  28,
    32,  36,  40,  45,
    49,  53,  57,  61,
    65,  69,  73,  77,
    81,  85,  89,  93,
    97, 101, 105, 109,
   113, 117, 121, 125,
   130, 134, 138, 142,
   146, 150, 154, 158,
   162, 166, 170, 174,
   178, 182, 186, 190,
   194, 198, 202, 206,
   210, 215, 219, 223,
   227, 231, 235, 239,
   243, 247, 251, 255
};
1249 
1250 
/* CC_SEL: view `cc` as an array of 32-bit words and return the word holding
 * bit number `which`, shifted right so that bit ends up in position 0.
 * Bits above the wanted field are NOT masked off; callers mask as needed.
 */
#define CC_SEL(cc, which) \
   (((const uint32_t *)(cc))[(which) / 32] >> ((which) & 31))

/* UP5: expand the low 5 bits of `c` to 8 bits. */
#define UP5(c) (_rgb_scale_5[(c) & 31])

/* UP6: expand a 6-bit value built from the low 5 bits of `c` (upper part)
 * and the low bit of `b` (least significant bit) to 8 bits.
 */
#define UP6(c, b) (_rgb_scale_6[(((c) & 31) << 1) | ((b) & 1)])

/* LERP: rounded interpolation, ((n - t) * c0 + t * c1) / n.
 * The expansion is fully parenthesized so it can be used inside a larger
 * expression without changing how the final division associates.
 */
#define LERP(n, t, c0, c1) \
   ((((n) - (t)) * (c0) + (t) * (c1) + (n) / 2) / (n))
1255 
1256 
1257 static void
fxt1_decode_1HI(const uint8_t * code,int32_t t,uint8_t * rgba)1258 fxt1_decode_1HI (const uint8_t *code, int32_t t, uint8_t *rgba)
1259 {
1260    const uint32_t *cc;
1261 
1262    t *= 3;
1263    cc = (const uint32_t *)(code + t / 8);
1264    t = (cc[0] >> (t & 7)) & 7;
1265 
1266    if (t == 7) {
1267       rgba[RCOMP] = rgba[GCOMP] = rgba[BCOMP] = rgba[ACOMP] = 0;
1268    } else {
1269       uint8_t r, g, b;
1270       cc = (const uint32_t *)(code + 12);
1271       if (t == 0) {
1272          b = UP5(CC_SEL(cc, 0));
1273          g = UP5(CC_SEL(cc, 5));
1274          r = UP5(CC_SEL(cc, 10));
1275       } else if (t == 6) {
1276          b = UP5(CC_SEL(cc, 15));
1277          g = UP5(CC_SEL(cc, 20));
1278          r = UP5(CC_SEL(cc, 25));
1279       } else {
1280          b = LERP(6, t, UP5(CC_SEL(cc, 0)), UP5(CC_SEL(cc, 15)));
1281          g = LERP(6, t, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
1282          r = LERP(6, t, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
1283       }
1284       rgba[RCOMP] = r;
1285       rgba[GCOMP] = g;
1286       rgba[BCOMP] = b;
1287       rgba[ACOMP] = 255;
1288    }
1289 }
1290 
1291 
1292 static void
fxt1_decode_1CHROMA(const uint8_t * code,int32_t t,uint8_t * rgba)1293 fxt1_decode_1CHROMA (const uint8_t *code, int32_t t, uint8_t *rgba)
1294 {
1295    const uint32_t *cc;
1296    uint32_t kk;
1297 
1298    cc = (const uint32_t *)code;
1299    if (t & 16) {
1300       cc++;
1301       t &= 15;
1302    }
1303    t = (cc[0] >> (t * 2)) & 3;
1304 
1305    t *= 15;
1306    cc = (const uint32_t *)(code + 8 + t / 8);
1307    kk = cc[0] >> (t & 7);
1308    rgba[BCOMP] = UP5(kk);
1309    rgba[GCOMP] = UP5(kk >> 5);
1310    rgba[RCOMP] = UP5(kk >> 10);
1311    rgba[ACOMP] = 255;
1312 }
1313 
1314 
1315 static void
fxt1_decode_1MIXED(const uint8_t * code,int32_t t,uint8_t * rgba)1316 fxt1_decode_1MIXED (const uint8_t *code, int32_t t, uint8_t *rgba)
1317 {
1318    const uint32_t *cc;
1319    uint32_t col[2][3];
1320    int32_t glsb, selb;
1321 
1322    cc = (const uint32_t *)code;
1323    if (t & 16) {
1324       t &= 15;
1325       t = (cc[1] >> (t * 2)) & 3;
1326       /* col 2 */
1327       col[0][BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
1328       col[0][GCOMP] = CC_SEL(cc, 99);
1329       col[0][RCOMP] = CC_SEL(cc, 104);
1330       /* col 3 */
1331       col[1][BCOMP] = CC_SEL(cc, 109);
1332       col[1][GCOMP] = CC_SEL(cc, 114);
1333       col[1][RCOMP] = CC_SEL(cc, 119);
1334       glsb = CC_SEL(cc, 126);
1335       selb = CC_SEL(cc, 33);
1336    } else {
1337       t = (cc[0] >> (t * 2)) & 3;
1338       /* col 0 */
1339       col[0][BCOMP] = CC_SEL(cc, 64);
1340       col[0][GCOMP] = CC_SEL(cc, 69);
1341       col[0][RCOMP] = CC_SEL(cc, 74);
1342       /* col 1 */
1343       col[1][BCOMP] = CC_SEL(cc, 79);
1344       col[1][GCOMP] = CC_SEL(cc, 84);
1345       col[1][RCOMP] = CC_SEL(cc, 89);
1346       glsb = CC_SEL(cc, 125);
1347       selb = CC_SEL(cc, 1);
1348    }
1349 
1350    if (CC_SEL(cc, 124) & 1) {
1351       /* alpha[0] == 1 */
1352 
1353       if (t == 3) {
1354          /* zero */
1355          rgba[RCOMP] = rgba[BCOMP] = rgba[GCOMP] = rgba[ACOMP] = 0;
1356       } else {
1357          uint8_t r, g, b;
1358          if (t == 0) {
1359             b = UP5(col[0][BCOMP]);
1360             g = UP5(col[0][GCOMP]);
1361             r = UP5(col[0][RCOMP]);
1362          } else if (t == 2) {
1363             b = UP5(col[1][BCOMP]);
1364             g = UP6(col[1][GCOMP], glsb);
1365             r = UP5(col[1][RCOMP]);
1366          } else {
1367             b = (UP5(col[0][BCOMP]) + UP5(col[1][BCOMP])) / 2;
1368             g = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
1369             r = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
1370          }
1371          rgba[RCOMP] = r;
1372          rgba[GCOMP] = g;
1373          rgba[BCOMP] = b;
1374          rgba[ACOMP] = 255;
1375       }
1376    } else {
1377       /* alpha[0] == 0 */
1378       uint8_t r, g, b;
1379       if (t == 0) {
1380          b = UP5(col[0][BCOMP]);
1381          g = UP6(col[0][GCOMP], glsb ^ selb);
1382          r = UP5(col[0][RCOMP]);
1383       } else if (t == 3) {
1384          b = UP5(col[1][BCOMP]);
1385          g = UP6(col[1][GCOMP], glsb);
1386          r = UP5(col[1][RCOMP]);
1387       } else {
1388          b = LERP(3, t, UP5(col[0][BCOMP]), UP5(col[1][BCOMP]));
1389          g = LERP(3, t, UP6(col[0][GCOMP], glsb ^ selb),
1390                         UP6(col[1][GCOMP], glsb));
1391          r = LERP(3, t, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
1392       }
1393       rgba[RCOMP] = r;
1394       rgba[GCOMP] = g;
1395       rgba[BCOMP] = b;
1396       rgba[ACOMP] = 255;
1397    }
1398 }
1399 
1400 
1401 static void
fxt1_decode_1ALPHA(const uint8_t * code,int32_t t,uint8_t * rgba)1402 fxt1_decode_1ALPHA (const uint8_t *code, int32_t t, uint8_t *rgba)
1403 {
1404    const uint32_t *cc;
1405    uint8_t r, g, b, a;
1406 
1407    cc = (const uint32_t *)code;
1408    if (CC_SEL(cc, 124) & 1) {
1409       /* lerp == 1 */
1410       uint32_t col0[4];
1411 
1412       if (t & 16) {
1413          t &= 15;
1414          t = (cc[1] >> (t * 2)) & 3;
1415          /* col 2 */
1416          col0[BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
1417          col0[GCOMP] = CC_SEL(cc, 99);
1418          col0[RCOMP] = CC_SEL(cc, 104);
1419          col0[ACOMP] = CC_SEL(cc, 119);
1420       } else {
1421          t = (cc[0] >> (t * 2)) & 3;
1422          /* col 0 */
1423          col0[BCOMP] = CC_SEL(cc, 64);
1424          col0[GCOMP] = CC_SEL(cc, 69);
1425          col0[RCOMP] = CC_SEL(cc, 74);
1426          col0[ACOMP] = CC_SEL(cc, 109);
1427       }
1428 
1429       if (t == 0) {
1430          b = UP5(col0[BCOMP]);
1431          g = UP5(col0[GCOMP]);
1432          r = UP5(col0[RCOMP]);
1433          a = UP5(col0[ACOMP]);
1434       } else if (t == 3) {
1435          b = UP5(CC_SEL(cc, 79));
1436          g = UP5(CC_SEL(cc, 84));
1437          r = UP5(CC_SEL(cc, 89));
1438          a = UP5(CC_SEL(cc, 114));
1439       } else {
1440          b = LERP(3, t, UP5(col0[BCOMP]), UP5(CC_SEL(cc, 79)));
1441          g = LERP(3, t, UP5(col0[GCOMP]), UP5(CC_SEL(cc, 84)));
1442          r = LERP(3, t, UP5(col0[RCOMP]), UP5(CC_SEL(cc, 89)));
1443          a = LERP(3, t, UP5(col0[ACOMP]), UP5(CC_SEL(cc, 114)));
1444       }
1445    } else {
1446       /* lerp == 0 */
1447 
1448       if (t & 16) {
1449          cc++;
1450          t &= 15;
1451       }
1452       t = (cc[0] >> (t * 2)) & 3;
1453 
1454       if (t == 3) {
1455          /* zero */
1456          r = g = b = a = 0;
1457       } else {
1458          uint32_t kk;
1459          cc = (const uint32_t *)code;
1460          a = UP5(cc[3] >> (t * 5 + 13));
1461          t *= 15;
1462          cc = (const uint32_t *)(code + 8 + t / 8);
1463          kk = cc[0] >> (t & 7);
1464          b = UP5(kk);
1465          g = UP5(kk >> 5);
1466          r = UP5(kk >> 10);
1467       }
1468    }
1469    rgba[RCOMP] = r;
1470    rgba[GCOMP] = g;
1471    rgba[BCOMP] = b;
1472    rgba[ACOMP] = a;
1473 }
1474 
1475 
/**
 * Decode the texel at (i, j) of an FXT1 image.
 *
 * Locates the 16-byte block covering the texel, reads the 3-bit mode from
 * the top of the block's last dword and hands off to the matching decoder.
 *
 * \param texture  first block of the image
 * \param stride   image row length in pixels (8 pixels per block)
 * \param i, j     texel column and row
 * \param rgba     receives the four decoded bytes
 */
static void
fxt1_decode_1 (const void *texture, int32_t stride, /* in pixels */
               int32_t i, int32_t j, uint8_t *rgba)
{
   const int32_t blocksPerRow = stride / 8;
   const int32_t blockIndex = (j / 4) * blocksPerRow + (i / 8);
   const uint8_t *code = (const uint8_t *)texture + blockIndex * 16;
   const int32_t mode = CC_SEL(code, 125);

   /* Texel number inside the block: the left 4x4 half is numbered 0..15
    * row by row, the right half 16..31.
    */
   const int32_t t = (i & 3) + ((i & 4) ? 16 : 0) + (j & 3) * 4;

   switch (mode) {
   case 0:
   case 1:
      /* cc-high = "00?" */
      fxt1_decode_1HI(code, t, rgba);
      break;
   case 2:
      /* cc-chroma = "010" */
      fxt1_decode_1CHROMA(code, t, rgba);
      break;
   case 3:
      /* alpha = "011" */
      fxt1_decode_1ALPHA(code, t, rgba);
      break;
   default:
      /* mixed = "1??" */
      fxt1_decode_1MIXED(code, t, rgba);
      break;
   }
}
1503 
1504 /*
1505  * Pixel fetch within a block.
1506  */
1507 
/**
 * Fetch texel (i, j) of a single FXT1 RGB block as four 8-bit channels.
 *
 * The RGB format has no alpha, so the result is always opaque.  The block
 * decoder can return alpha 0 for some texels; that value is overridden here,
 * just as util_format_fxt1_rgb_fetch_rgba() forces alpha to 1.0.
 *
 * \param dst   receives R, G, B, A
 * \param src   the 16-byte block
 * \param i, j  texel position inside the block
 */
void
util_format_fxt1_rgb_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
   dst[3] = 0xff;
}
1513 
/**
 * Fetch texel (i, j) of a single FXT1 RGBA block as four 8-bit channels.
 *
 * The decoded alpha is passed through untouched, matching
 * util_format_fxt1_rgba_fetch_rgba(), which also keeps the decoded alpha.
 *
 * \param dst   receives R, G, B, A
 * \param src   the 16-byte block
 * \param i, j  texel position inside the block
 */
void
util_format_fxt1_rgba_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
}
1520 
/**
 * Fetch texel (i, j) of a single FXT1 RGB block as four floats.
 * Alpha is always written as 1.0.
 *
 * \param in_dst  receives four floats: R, G, B, A
 * \param src     the 16-byte block
 * \param i, j    texel position inside the block
 */
void
util_format_fxt1_rgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   float *out = in_dst;
   uint8_t texel[4];
   unsigned c;

   fxt1_decode_1(src, 0, i, j, texel);
   for (c = 0; c < 3; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
   out[3] = 1.0;
}
1532 
/**
 * Fetch texel (i, j) of a single FXT1 RGBA block as four floats,
 * decoded alpha included.
 *
 * \param in_dst  receives four floats: R, G, B, A
 * \param src     the 16-byte block
 * \param i, j    texel position inside the block
 */
void
util_format_fxt1_rgba_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   float *out = in_dst;
   uint8_t texel[4];
   unsigned c;

   fxt1_decode_1(src, 0, i, j, texel);
   for (c = 0; c < 4; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
}
1544 
1545 /*
1546  * Block decompression.
1547  */
1548 
1549 static inline void
util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t * restrict dst_row,unsigned dst_stride,const uint8_t * restrict src_row,unsigned src_stride,unsigned width,unsigned height,bool rgba)1550 util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
1551                                         const uint8_t *restrict src_row, unsigned src_stride,
1552                                         unsigned width, unsigned height,
1553                                         bool rgba)
1554 {
1555    const unsigned bw = 8, bh = 4, comps = 4;
1556    unsigned x, y, i, j;
1557    for (y = 0; y < height; y += bh) {
1558       const uint8_t *src = src_row;
1559       for (x = 0; x < width; x += bw) {
1560          for (j = 0; j < bh; ++j) {
1561             for (i = 0; i < bw; ++i) {
1562                uint8_t *dst = dst_row + (y + j) * dst_stride / sizeof(*dst_row) + (x + i) * comps;
1563                fxt1_decode_1(src, 0, i, j, dst);
1564                if (!rgba)
1565                   dst[3] = 0xff;
1566             }
1567          }
1568          src += FXT1_BLOCK_SIZE;
1569       }
1570       src_row += src_stride;
1571    }
1572 }
1573 
/**
 * Decompress an FXT1 RGB image to 8-bit RGBA; alpha is forced to 0xff.
 * Thin wrapper around util_format_fxtn_rgb_unpack_rgba_8unorm().
 */
void
util_format_fxt1_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                        const uint8_t *restrict src_row, unsigned src_stride,
                                        unsigned width, unsigned height)
{
   const bool keep_alpha = false;

   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride,
                                           width, height, keep_alpha);
}
1584 
/**
 * Decompress an FXT1 RGBA image to 8-bit RGBA, keeping the decoded alpha.
 * Thin wrapper around util_format_fxtn_rgb_unpack_rgba_8unorm().
 */
void
util_format_fxt1_rgba_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                         const uint8_t *restrict src_row, unsigned src_stride,
                                         unsigned width, unsigned height)
{
   const bool keep_alpha = true;

   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride, src_row, src_stride,
                                           width, height, keep_alpha);
}
1595 
1596 static inline void
util_format_fxtn_rgb_unpack_rgba_float(float * dst_row,unsigned dst_stride,const uint8_t * restrict src_row,unsigned src_stride,unsigned width,unsigned height,bool rgba)1597 util_format_fxtn_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride,
1598                                        const uint8_t *restrict src_row, unsigned src_stride,
1599                                        unsigned width, unsigned height,
1600                                        bool rgba)
1601 {
1602    const unsigned bw = 8, bh = 4, comps = 4;
1603    unsigned x, y, i, j;
1604    for (y = 0; y < height; y += 4) {
1605       const uint8_t *src = src_row;
1606       for (x = 0; x < width; x += 8) {
1607          for (j = 0; j < bh; ++j) {
1608             for (i = 0; i < bw; ++i) {
1609                float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i) * comps;
1610                uint8_t tmp[4];
1611                fxt1_decode_1(src, 0, i, j, tmp);
1612                dst[0] = ubyte_to_float(tmp[0]);
1613                dst[1] = ubyte_to_float(tmp[1]);
1614                dst[2] = ubyte_to_float(tmp[2]);
1615                if (rgba)
1616                   dst[3] = ubyte_to_float(tmp[3]);
1617                else
1618                   dst[3] = 1.0;
1619             }
1620          }
1621          src += FXT1_BLOCK_SIZE;
1622       }
1623       src_row += src_stride;
1624    }
1625 }
1626 
/**
 * Decompress an FXT1 RGB image to float RGBA; alpha is written as 1.0.
 * Thin wrapper around util_format_fxtn_rgb_unpack_rgba_float().
 */
void
util_format_fxt1_rgb_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src_row, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   const bool keep_alpha = false;

   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride,
                                          width, height, keep_alpha);
}
1637 
/**
 * Decompress an FXT1 RGBA image to float RGBA, converting the decoded alpha.
 * Thin wrapper around util_format_fxtn_rgb_unpack_rgba_float().
 */
void
util_format_fxt1_rgba_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
                                        const uint8_t *restrict src_row, unsigned src_stride,
                                        unsigned width, unsigned height)
{
   const bool keep_alpha = true;

   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride, src_row, src_stride,
                                          width, height, keep_alpha);
}
1648 
1649 /*
1650  * Block compression.
1651  */
1652 
/**
 * Compress an 8-bit RGBA image to FXT1 RGB.
 *
 * The encoder is fed tightly packed 3-byte texels for this format, so the
 * source is first copied into a temporary buffer with the alpha byte
 * dropped.  If that buffer cannot be allocated nothing is written.
 *
 * \param dst_row     output blocks
 * \param dst_stride  bytes between rows of output blocks
 * \param src         source texels, 4 bytes each
 * \param src_stride  bytes between source rows
 * \param width       image width in texels
 * \param height      image height in texels
 */
void
util_format_fxt1_rgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const uint8_t *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   const int rgb_stride = width * 3;
   uint8_t *rgb = malloc(height * rgb_stride);
   uint8_t *out = rgb;
   unsigned row, col;

   if (rgb == NULL)
      return;

   /* rows of the temporary are contiguous, so one running pointer suffices */
   for (row = 0; row < height; row++) {
      const uint8_t *in = src;
      for (col = 0; col < width; col++, in += 4) {
         *out++ = in[0];
         *out++ = in[1];
         *out++ = in[2];
      }
      src += src_stride;
   }

   fxt1_encode(width, height, 3, rgb, rgb_stride, dst_row, dst_stride);

   free(rgb);
}
1678 
/**
 * Compress an 8-bit RGBA image to FXT1 RGBA.
 * The source is already in the 4-bytes-per-texel form the encoder takes,
 * so it is passed straight through.
 */
void
util_format_fxt1_rgba_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   const int32_t comps = 4;

   fxt1_encode(width, height, comps, src, src_stride, dst_row, dst_stride);
}
1686 
/**
 * Compress a float RGBA image to FXT1 RGB.
 *
 * The image is first converted to 8-bit RGBA in a temporary buffer and then
 * handed to util_format_fxt1_rgb_pack_rgba_8unorm().  If the buffer cannot
 * be allocated nothing is written.
 */
void
util_format_fxt1_rgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                     const float *restrict src, unsigned src_stride,
                                     unsigned width, unsigned height)
{
   const int bytes_per_row = width * 4;
   uint8_t *rgba8 = malloc(height * bytes_per_row);

   if (rgba8 != NULL) {
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, bytes_per_row,
                                                 src, src_stride,
                                                 width, height);
      util_format_fxt1_rgb_pack_rgba_8unorm(dst_row, dst_stride,
                                            rgba8, bytes_per_row,
                                            width, height);
      free(rgba8);
   }
}
1707 
/**
 * Compress a float RGBA image to FXT1 RGBA.
 *
 * The image is first converted to 8-bit RGBA in a temporary buffer and then
 * handed to util_format_fxt1_rgba_pack_rgba_8unorm().  If the buffer cannot
 * be allocated nothing is written.
 */
void
util_format_fxt1_rgba_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const float *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   const int bytes_per_row = width * 4;
   uint8_t *rgba8 = malloc(height * bytes_per_row);

   if (rgba8 != NULL) {
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, bytes_per_row,
                                                 src, src_stride,
                                                 width, height);
      util_format_fxt1_rgba_pack_rgba_8unorm(dst_row, dst_stride,
                                             rgba8, bytes_per_row,
                                             width, height);
      free(rgba8);
   }
}
1728