1 /**************************************************************************
2 *
3 * Copyright (C) 1999-2008 Brian Paul All Rights Reserved.
4 * Copyright (c) 2008 VMware, Inc.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 **************************************************************************/
25
26 #include "util/format/u_format.h"
27 #include "util/format/u_format_fxt1.h"
28 #include "util/format_srgb.h"
29 #include "util/u_math.h"
30
31 #include "u_format_pack.h"
32
/* Indices of the red, green, blue and alpha components inside a texel
 * vector (uint8_t[MAX_COMP] or float[MAX_COMP]).
 */
#define RCOMP 0
#define GCOMP 1
#define BCOMP 2
#define ACOMP 3

/* NOTE(review): presumably the size in bytes of one encoded 8x4 block
 * (the encoder below emits 128 bits per block) -- confirm against users.
 */
#define FXT1_BLOCK_SIZE 16

/* Forward declarations of the file-local encoder and decoder entry points. */
static void
fxt1_encode (uint32_t width, uint32_t height, int32_t comps,
             const void *source, int32_t srcRowStride,
             void *dest, int32_t destRowStride);

static void
fxt1_decode_1 (const void *texture, int32_t stride,
               int32_t i, int32_t j, uint8_t *rgba);
48
49 /***************************************************************************\
50 * FXT1 encoder
51 *
52 * The encoder was built by reversing the decoder,
53 * and is vaguely based on Texus2 by 3dfx. Note that this code
54 * is merely a proof of concept, since it is highly UNoptimized;
55 * moreover, it is sub-optimal due to initial conditions passed
56 * to Lloyd's algorithm (the interpolation modes are even worse).
57 \***************************************************************************/
58
59
#define MAX_COMP 4 /* ever needed maximum number of components in texel */
#define MAX_VECT 4 /* ever needed maximum number of base vectors to find */
#define N_TEXELS 32 /* number of texels in a block (always 32) */
#define LL_N_REP 50 /* number of iterations in lloyd's vq */
#define LL_RMS_D 10 /* fault tolerance (maximum delta) */
#define LL_RMS_E 255 /* fault tolerance (maximum error) */
#define ALPHA_TS 2 /* alpha threshold: (255 - ALPHA_TS) deemed opaque */
/* All-zero reference word used by ISTBLACK. */
static const uint32_t zero = 0;
/* True when the first sizeof(uint32_t) bytes of `v` are all zero, i.e. when
 * a 4-component texel is transparent black.  `v` must be an lvalue because
 * its address is taken.
 */
#define ISTBLACK(v) (memcmp(&(v), &zero, sizeof(zero)) == 0)
69
/*
 * Define a 64-bit unsigned integer type and macros
 *
 * Fx64 holds the high quadword of an encoded block while it is being
 * assembled.  The native variant (selected by the "#if 1" below) maps it
 * directly onto uint64_t; the alternative variant emulates it with two
 * 32-bit halves.
 *
 *   FX64_MOV32(a, b)  store the 32-bit value b into a (native: a = b)
 *   FX64_OR32(a, b)   OR the 32-bit value b into the low bits of a
 *   FX64_SHL(a, c)    shift a left by c bits
 */
#if 1

#define FX64_NATIVE 1

typedef uint64_t Fx64;

#define FX64_MOV32(a, b) a = b
#define FX64_OR32(a, b) a |= b
#define FX64_SHL(a, c) a <<= c

#else

#define FX64_NATIVE 0

typedef struct {
   uint32_t lo, hi;
} Fx64;

/* NOTE(review): only the low half is written here; a.hi is left untouched. */
#define FX64_MOV32(a, b) a.lo = b
#define FX64_OR32(a, b) a.lo |= b

/* Shifts of 32 bits or more move the low half into the high half; smaller
 * shifts carry the top bits of the low half into the high half.
 */
#define FX64_SHL(a, c) \
   do { \
      if ((c) >= 32) { \
         a.hi = a.lo << ((c) - 32); \
         a.lo = 0; \
      } else { \
         a.hi = (a.hi << (c)) | (a.lo >> (32 - (c))); \
         a.lo <<= (c); \
      } \
   } while (0)

#endif
106
107
/* Per-component weight of the colour metric.  Every component currently
 * weighs 1; can be used to obtain an oblong metric: 0.30 / 0.59 / 0.11
 */
#define F(i) (float)1
#define SAFECDOT 1 /* for paranoids: clamp the index produced by CALCCDOT */

/*
 * MAKEIVEC - prepare the projection that maps a colour onto the line
 * running from V0 to V1, scaled so that V0 maps to 0 and V1 maps to NV.
 *
 *   NV      highest interpolation index
 *   NC      number of components to use
 *   IV      float array (at least NC entries) receiving the scaled direction
 *   B       float lvalue receiving the offset (includes +0.5 for rounding)
 *   V0, V1  end points, indexable with []
 *
 * Uses (and clobbers) an integer variable named `i` from the caller's scope.
 *
 * If V0 and V1 coincide the direction has zero length; the scale is then
 * forced to 0 instead of dividing by zero, so that IV becomes all zeros,
 * B becomes 0.5 and CALCCDOT yields index 0 rather than converting a NaN
 * to an integer.
 */
#define MAKEIVEC(NV, NC, IV, B, V0, V1) \
   do { \
      /* compute interpolation vector */ \
      float d2 = 0.0F; \
      float rd2; \
      \
      for (i = 0; i < (NC); i++) { \
         (IV)[i] = ((V1)[i] - (V0)[i]) * F(i); \
         d2 += (IV)[i] * (IV)[i]; \
      } \
      /* guard against coincident end points (d2 == 0) */ \
      rd2 = (d2 > 0.0F) ? (float)(NV) / d2 : 0.0F; \
      (B) = 0; \
      for (i = 0; i < (NC); i++) { \
         (IV)[i] *= F(i); \
         (B) -= (IV)[i] * (V0)[i]; \
         (IV)[i] *= rd2; \
      } \
      (B) = (B) * rd2 + 0.5f; \
   } while (0)

/*
 * CALCCDOT - project colour V onto the line prepared by MAKEIVEC and store
 * the resulting interpolation index in TEXEL.  With SAFECDOT enabled the
 * index is clamped to [0, NV].
 *
 * Uses (and clobbers) an integer variable named `i` from the caller's scope.
 */
#define CALCCDOT(TEXEL, NV, NC, IV, B, V)\
   do { \
      float dot = 0.0F; \
      for (i = 0; i < (NC); i++) { \
         dot += (V)[i] * (IV)[i]; \
      } \
      (TEXEL) = (int32_t)(dot + (B)); \
      if (SAFECDOT) { \
         if ((TEXEL) < 0) { \
            (TEXEL) = 0; \
         } else if ((TEXEL) > (NV)) { \
            (TEXEL) = (NV); \
         } \
      } \
   } while (0)
146
147
148 static int32_t
fxt1_bestcol(float vec[][MAX_COMP],int32_t nv,uint8_t input[MAX_COMP],int32_t nc)149 fxt1_bestcol (float vec[][MAX_COMP], int32_t nv,
150 uint8_t input[MAX_COMP], int32_t nc)
151 {
152 int32_t i, j, best = -1;
153 float err = 1e9; /* big enough */
154
155 for (j = 0; j < nv; j++) {
156 float e = 0.0F;
157 for (i = 0; i < nc; i++) {
158 e += (vec[j][i] - input[i]) * (vec[j][i] - input[i]);
159 }
160 if (e < err) {
161 err = e;
162 best = j;
163 }
164 }
165
166 return best;
167 }
168
169
170 static int32_t
fxt1_worst(float vec[MAX_COMP],uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)171 fxt1_worst (float vec[MAX_COMP],
172 uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
173 {
174 int32_t i, k, worst = -1;
175 float err = -1.0F; /* small enough */
176
177 for (k = 0; k < n; k++) {
178 float e = 0.0F;
179 for (i = 0; i < nc; i++) {
180 e += (vec[i] - input[k][i]) * (vec[i] - input[k][i]);
181 }
182 if (e > err) {
183 err = e;
184 worst = k;
185 }
186 }
187
188 return worst;
189 }
190
191
192 static int32_t
fxt1_variance(uint8_t input[N_TEXELS/2][MAX_COMP],int32_t nc)193 fxt1_variance (uint8_t input[N_TEXELS / 2][MAX_COMP], int32_t nc)
194 {
195 const int n = N_TEXELS / 2;
196 int32_t i, k, best = 0;
197 int32_t sx, sx2;
198 double var, maxvar = -1; /* small enough */
199 double teenth = 1.0 / n;
200
201 for (i = 0; i < nc; i++) {
202 sx = sx2 = 0;
203 for (k = 0; k < n; k++) {
204 int32_t t = input[k][i];
205 sx += t;
206 sx2 += t * t;
207 }
208 var = sx2 * teenth - sx * sx * teenth * teenth;
209 if (maxvar < var) {
210 maxvar = var;
211 best = i;
212 }
213 }
214
215 return best;
216 }
217
218
219 static int32_t
fxt1_choose(float vec[][MAX_COMP],int32_t nv,uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)220 fxt1_choose (float vec[][MAX_COMP], int32_t nv,
221 uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
222 {
223 #if 0
224 /* Choose colors from a grid.
225 */
226 int32_t i, j;
227
228 for (j = 0; j < nv; j++) {
229 int32_t m = j * (n - 1) / (nv - 1);
230 for (i = 0; i < nc; i++) {
231 vec[j][i] = input[m][i];
232 }
233 }
234 #else
235 /* Our solution here is to find the darkest and brightest colors in
236 * the 8x4 tile and use those as the two representative colors.
237 * There are probably better algorithms to use (histogram-based).
238 */
239 int32_t i, j, k;
240 int32_t minSum = 2000; /* big enough */
241 int32_t maxSum = -1; /* small enough */
242 int32_t minCol = 0; /* phoudoin: silent compiler! */
243 int32_t maxCol = 0; /* phoudoin: silent compiler! */
244
245 struct {
246 int32_t flag;
247 int32_t key;
248 int32_t freq;
249 int32_t idx;
250 } hist[N_TEXELS];
251 int32_t lenh = 0;
252
253 memset(hist, 0, sizeof(hist));
254
255 for (k = 0; k < n; k++) {
256 int32_t l;
257 int32_t key = 0;
258 int32_t sum = 0;
259 for (i = 0; i < nc; i++) {
260 key <<= 8;
261 key |= input[k][i];
262 sum += input[k][i];
263 }
264 for (l = 0; l < n; l++) {
265 if (!hist[l].flag) {
266 /* alloc new slot */
267 hist[l].flag = !0;
268 hist[l].key = key;
269 hist[l].freq = 1;
270 hist[l].idx = k;
271 lenh = l + 1;
272 break;
273 } else if (hist[l].key == key) {
274 hist[l].freq++;
275 break;
276 }
277 }
278 if (minSum > sum) {
279 minSum = sum;
280 minCol = k;
281 }
282 if (maxSum < sum) {
283 maxSum = sum;
284 maxCol = k;
285 }
286 }
287
288 if (lenh <= nv) {
289 for (j = 0; j < lenh; j++) {
290 for (i = 0; i < nc; i++) {
291 vec[j][i] = (float)input[hist[j].idx][i];
292 }
293 }
294 for (; j < nv; j++) {
295 for (i = 0; i < nc; i++) {
296 vec[j][i] = vec[0][i];
297 }
298 }
299 return 0;
300 }
301
302 for (j = 0; j < nv; j++) {
303 for (i = 0; i < nc; i++) {
304 vec[j][i] = ((nv - 1 - j) * input[minCol][i] + j * input[maxCol][i] + (nv - 1) / 2) / (float)(nv - 1);
305 }
306 }
307 #endif
308
309 return !0;
310 }
311
312
313 static int32_t
fxt1_lloyd(float vec[][MAX_COMP],int32_t nv,uint8_t input[N_TEXELS][MAX_COMP],int32_t nc,int32_t n)314 fxt1_lloyd (float vec[][MAX_COMP], int32_t nv,
315 uint8_t input[N_TEXELS][MAX_COMP], int32_t nc, int32_t n)
316 {
317 /* Use the generalized lloyd's algorithm for VQ:
318 * find 4 color vectors.
319 *
320 * for each sample color
321 * sort to nearest vector.
322 *
323 * replace each vector with the centroid of its matching colors.
324 *
325 * repeat until RMS doesn't improve.
326 *
327 * if a color vector has no samples, or becomes the same as another
328 * vector, replace it with the color which is farthest from a sample.
329 *
330 * vec[][MAX_COMP] initial vectors and resulting colors
331 * nv number of resulting colors required
332 * input[N_TEXELS][MAX_COMP] input texels
333 * nc number of components in input / vec
334 * n number of input samples
335 */
336
337 int32_t sum[MAX_VECT][MAX_COMP]; /* used to accumulate closest texels */
338 int32_t cnt[MAX_VECT]; /* how many times a certain vector was chosen */
339 float error, lasterror = 1e9;
340
341 int32_t i, j, k, rep;
342
343 /* the quantizer */
344 for (rep = 0; rep < LL_N_REP; rep++) {
345 /* reset sums & counters */
346 for (j = 0; j < nv; j++) {
347 for (i = 0; i < nc; i++) {
348 sum[j][i] = 0;
349 }
350 cnt[j] = 0;
351 }
352 error = 0;
353
354 /* scan whole block */
355 for (k = 0; k < n; k++) {
356 #if 1
357 int32_t best = -1;
358 float err = 1e9; /* big enough */
359 /* determine best vector */
360 for (j = 0; j < nv; j++) {
361 float e = (vec[j][0] - input[k][0]) * (vec[j][0] - input[k][0]) +
362 (vec[j][1] - input[k][1]) * (vec[j][1] - input[k][1]) +
363 (vec[j][2] - input[k][2]) * (vec[j][2] - input[k][2]);
364 if (nc == 4) {
365 e += (vec[j][3] - input[k][3]) * (vec[j][3] - input[k][3]);
366 }
367 if (e < err) {
368 err = e;
369 best = j;
370 }
371 }
372 #else
373 int32_t best = fxt1_bestcol(vec, nv, input[k], nc, &err);
374 #endif
375 assert(best >= 0);
376 /* add in closest color */
377 for (i = 0; i < nc; i++) {
378 sum[best][i] += input[k][i];
379 }
380 /* mark this vector as used */
381 cnt[best]++;
382 /* accumulate error */
383 error += err;
384 }
385
386 /* check RMS */
387 if ((error < LL_RMS_E) ||
388 ((error < lasterror) && ((lasterror - error) < LL_RMS_D))) {
389 return !0; /* good match */
390 }
391 lasterror = error;
392
393 /* move each vector to the barycenter of its closest colors */
394 for (j = 0; j < nv; j++) {
395 if (cnt[j]) {
396 float div = 1.0F / cnt[j];
397 for (i = 0; i < nc; i++) {
398 vec[j][i] = div * sum[j][i];
399 }
400 } else {
401 /* this vec has no samples or is identical with a previous vec */
402 int32_t worst = fxt1_worst(vec[j], input, nc, n);
403 for (i = 0; i < nc; i++) {
404 vec[j][i] = input[worst][i];
405 }
406 }
407 }
408 }
409
410 return 0; /* could not converge fast enough */
411 }
412
413
414 static void
fxt1_quantize_CHROMA(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])415 fxt1_quantize_CHROMA (uint32_t *cc,
416 uint8_t input[N_TEXELS][MAX_COMP])
417 {
418 const int32_t n_vect = 4; /* 4 base vectors to find */
419 const int32_t n_comp = 3; /* 3 components: R, G, B */
420 float vec[MAX_VECT][MAX_COMP];
421 int32_t i, j, k;
422 Fx64 hi; /* high quadword */
423 uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
424
425 if (fxt1_choose(vec, n_vect, input, n_comp, N_TEXELS) != 0) {
426 fxt1_lloyd(vec, n_vect, input, n_comp, N_TEXELS);
427 }
428
429 FX64_MOV32(hi, 4); /* cc-chroma = "010" + unused bit */
430 for (j = n_vect - 1; j >= 0; j--) {
431 for (i = 0; i < n_comp; i++) {
432 /* add in colors */
433 FX64_SHL(hi, 5);
434 FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
435 }
436 }
437 ((Fx64 *)cc)[1] = hi;
438
439 lohi = lolo = 0;
440 /* right microtile */
441 for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
442 lohi <<= 2;
443 lohi |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
444 }
445 /* left microtile */
446 for (; k >= 0; k--) {
447 lolo <<= 2;
448 lolo |= fxt1_bestcol(vec, n_vect, input[k], n_comp);
449 }
450 cc[1] = lohi;
451 cc[0] = lolo;
452 }
453
454
455 static void
fxt1_quantize_ALPHA0(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP],uint8_t reord[N_TEXELS][MAX_COMP],int32_t n)456 fxt1_quantize_ALPHA0 (uint32_t *cc,
457 uint8_t input[N_TEXELS][MAX_COMP],
458 uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
459 {
460 const int32_t n_vect = 3; /* 3 base vectors to find */
461 const int32_t n_comp = 4; /* 4 components: R, G, B, A */
462 float vec[MAX_VECT][MAX_COMP];
463 int32_t i, j, k;
464 Fx64 hi; /* high quadword */
465 uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
466
467 /* the last vector indicates zero */
468 for (i = 0; i < n_comp; i++) {
469 vec[n_vect][i] = 0;
470 }
471
472 /* the first n texels in reord are guaranteed to be non-zero */
473 if (fxt1_choose(vec, n_vect, reord, n_comp, n) != 0) {
474 fxt1_lloyd(vec, n_vect, reord, n_comp, n);
475 }
476
477 FX64_MOV32(hi, 6); /* alpha = "011" + lerp = 0 */
478 for (j = n_vect - 1; j >= 0; j--) {
479 /* add in alphas */
480 FX64_SHL(hi, 5);
481 FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
482 }
483 for (j = n_vect - 1; j >= 0; j--) {
484 for (i = 0; i < n_comp - 1; i++) {
485 /* add in colors */
486 FX64_SHL(hi, 5);
487 FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
488 }
489 }
490 ((Fx64 *)cc)[1] = hi;
491
492 lohi = lolo = 0;
493 /* right microtile */
494 for (k = N_TEXELS - 1; k >= N_TEXELS/2; k--) {
495 lohi <<= 2;
496 lohi |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
497 }
498 /* left microtile */
499 for (; k >= 0; k--) {
500 lolo <<= 2;
501 lolo |= fxt1_bestcol(vec, n_vect + 1, input[k], n_comp);
502 }
503 cc[1] = lohi;
504 cc[0] = lolo;
505 }
506
507
/**
 * Encode one block with three RGBA vectors, interpolating inside each
 * microtile: the left microtile interpolates between vec[0] and vec[1],
 * the right one between vec[2] and vec[1], so vec[1] is shared.
 *
 * \param cc     receives the four 32-bit words of the encoded block
 * \param input  the N_TEXELS texels of the block (first half: left
 *               microtile, second half: right microtile)
 */
static void
fxt1_quantize_ALPHA1 (uint32_t *cc,
                      uint8_t input[N_TEXELS][MAX_COMP])
{
   const int32_t n_vect = 3; /* highest vector number in each microtile */
   const int32_t n_comp = 4; /* 4 components: R, G, B, A */
   float vec[1 + 1 + 1][MAX_COMP]; /* 1.5 extrema for each sub-block */
   float b, iv[MAX_COMP]; /* interpolation vector */
   int32_t i, j, k;
   Fx64 hi; /* high quadword */
   uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */

   int32_t minSum;
   int32_t maxSum;
   int32_t minColL = 0, maxColL = 0;
   int32_t minColR = 0, maxColR = 0;
   int32_t sumL = 0, sumR = 0;
   int32_t nn_comp;
   /* Our solution here is to find the darkest and brightest colors in
    * the 4x4 tile and use those as the two representative colors.
    * There are probably better algorithms to use (histogram-based).
    */
   /* Left microtile.  If the darkest and brightest texel turn out to be the
    * same one, retry with one component fewer in the sum.  Note that sumL
    * keeps accumulating over every pass.
    */
   nn_comp = n_comp;
   while ((minColL == maxColL) && nn_comp) {
      minSum = 2000; /* big enough */
      maxSum = -1; /* small enough */
      for (k = 0; k < N_TEXELS / 2; k++) {
         int32_t sum = 0;
         for (i = 0; i < nn_comp; i++) {
            sum += input[k][i];
         }
         if (minSum > sum) {
            minSum = sum;
            minColL = k;
         }
         if (maxSum < sum) {
            maxSum = sum;
            maxColL = k;
         }
         sumL += sum;
      }

      nn_comp--;
   }

   /* Right microtile: same search, accumulating into sumR. */
   nn_comp = n_comp;
   while ((minColR == maxColR) && nn_comp) {
      minSum = 2000; /* big enough */
      maxSum = -1; /* small enough */
      for (k = N_TEXELS / 2; k < N_TEXELS; k++) {
         int32_t sum = 0;
         for (i = 0; i < nn_comp; i++) {
            sum += input[k][i];
         }
         if (minSum > sum) {
            minSum = sum;
            minColR = k;
         }
         if (maxSum < sum) {
            maxSum = sum;
            maxColR = k;
         }
         sumR += sum;
      }

      nn_comp--;
   }

   /* choose the common vector (yuck!) */
   {
      int32_t j1, j2;
      int32_t v1 = 0, v2 = 0;
      float err = 1e9; /* big enough */
      float tv[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
      for (i = 0; i < n_comp; i++) {
         tv[0][i] = input[minColL][i];
         tv[1][i] = input[maxColL][i];
         tv[2][i] = input[minColR][i];
         tv[3][i] = input[maxColR][i];
      }
      /* find the closest pair made of one left (v1) and one right (v2)
       * extreme
       */
      for (j1 = 0; j1 < 2; j1++) {
         for (j2 = 2; j2 < 4; j2++) {
            float e = 0.0F;
            for (i = 0; i < n_comp; i++) {
               e += (tv[j1][i] - tv[j2][i]) * (tv[j1][i] - tv[j2][i]);
            }
            if (e < err) {
               err = e;
               v1 = j1;
               v2 = j2;
            }
         }
      }
      /* vec[0]: the other left extreme; vec[1]: the closest pair blended
       * with weights sumL and sumR; vec[2]: the other right extreme
       * (5 - v2 maps 2 <-> 3)
       */
      for (i = 0; i < n_comp; i++) {
         vec[0][i] = tv[1 - v1][i];
         vec[1][i] = (tv[v1][i] * sumL + tv[v2][i] * sumR) / (sumL + sumR);
         vec[2][i] = tv[5 - v2][i];
      }
   }

   /* left microtile */
   cc[0] = 0; /* stays 0 (all selectors 0) when the microtile is flat */
   if (minColL != maxColL) {
      /* compute interpolation vector */
      MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);

      /* add in texels */
      lolo = 0;
      for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
         int32_t texel;
         /* interpolate color */
         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
         /* add in texel */
         lolo <<= 2;
         lolo |= texel;
      }

      cc[0] = lolo;
   }

   /* right microtile */
   cc[1] = 0; /* stays 0 (all selectors 0) when the microtile is flat */
   if (minColR != maxColR) {
      /* compute interpolation vector */
      MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[1]);

      /* add in texels */
      lohi = 0;
      for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
         int32_t texel;
         /* interpolate color */
         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
         /* add in texel */
         lohi <<= 2;
         lohi |= texel;
      }

      cc[1] = lohi;
   }

   /* high quadword: mode bits, three 5-bit alphas, then three RGB colours
    * at 5 bits per component
    */
   FX64_MOV32(hi, 7); /* alpha = "011" + lerp = 1 */
   for (j = n_vect - 1; j >= 0; j--) {
      /* add in alphas */
      FX64_SHL(hi, 5);
      FX64_OR32(hi, (uint32_t)(vec[j][ACOMP] / 8.0F));
   }
   for (j = n_vect - 1; j >= 0; j--) {
      for (i = 0; i < n_comp - 1; i++) {
         /* add in colors */
         FX64_SHL(hi, 5);
         FX64_OR32(hi, (uint32_t)(vec[j][i] / 8.0F));
      }
   }
   ((Fx64 *)cc)[1] = hi;
}
663
664
665 static void
fxt1_quantize_HI(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP],uint8_t reord[N_TEXELS][MAX_COMP],int32_t n)666 fxt1_quantize_HI (uint32_t *cc,
667 uint8_t input[N_TEXELS][MAX_COMP],
668 uint8_t reord[N_TEXELS][MAX_COMP], int32_t n)
669 {
670 const int32_t n_vect = 6; /* highest vector number */
671 const int32_t n_comp = 3; /* 3 components: R, G, B */
672 float b = 0.0F; /* phoudoin: silent compiler! */
673 float iv[MAX_COMP]; /* interpolation vector */
674 int32_t i, k;
675 uint32_t hihi; /* high quadword: hi dword */
676
677 int32_t minSum = 2000; /* big enough */
678 int32_t maxSum = -1; /* small enough */
679 int32_t minCol = 0; /* phoudoin: silent compiler! */
680 int32_t maxCol = 0; /* phoudoin: silent compiler! */
681
682 /* Our solution here is to find the darkest and brightest colors in
683 * the 8x4 tile and use those as the two representative colors.
684 * There are probably better algorithms to use (histogram-based).
685 */
686 for (k = 0; k < n; k++) {
687 int32_t sum = 0;
688 for (i = 0; i < n_comp; i++) {
689 sum += reord[k][i];
690 }
691 if (minSum > sum) {
692 minSum = sum;
693 minCol = k;
694 }
695 if (maxSum < sum) {
696 maxSum = sum;
697 maxCol = k;
698 }
699 }
700
701 hihi = 0; /* cc-hi = "00" */
702 for (i = 0; i < n_comp; i++) {
703 /* add in colors */
704 hihi <<= 5;
705 hihi |= reord[maxCol][i] >> 3;
706 }
707 for (i = 0; i < n_comp; i++) {
708 /* add in colors */
709 hihi <<= 5;
710 hihi |= reord[minCol][i] >> 3;
711 }
712 cc[3] = hihi;
713 cc[0] = cc[1] = cc[2] = 0;
714
715 /* compute interpolation vector */
716 if (minCol != maxCol) {
717 MAKEIVEC(n_vect, n_comp, iv, b, reord[minCol], reord[maxCol]);
718 }
719
720 /* add in texels */
721 for (k = N_TEXELS - 1; k >= 0; k--) {
722 int32_t t = k * 3;
723 uint32_t *kk = (uint32_t *)((char *)cc + t / 8);
724 int32_t texel = n_vect + 1; /* transparent black */
725
726 if (!ISTBLACK(input[k])) {
727 if (minCol != maxCol) {
728 /* interpolate color */
729 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
730 /* add in texel */
731 kk[0] |= texel << (t & 7);
732 }
733 } else {
734 /* add in texel */
735 kk[0] |= texel << (t & 7);
736 }
737 }
738 }
739
740
741 static void
fxt1_quantize_MIXED1(uint32_t * cc,uint8_t input[N_TEXELS][MAX_COMP])742 fxt1_quantize_MIXED1 (uint32_t *cc,
743 uint8_t input[N_TEXELS][MAX_COMP])
744 {
745 const int32_t n_vect = 2; /* highest vector number in each microtile */
746 const int32_t n_comp = 3; /* 3 components: R, G, B */
747 uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
748 float b, iv[MAX_COMP]; /* interpolation vector */
749 int32_t i, j, k;
750 Fx64 hi; /* high quadword */
751 uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */
752
753 int32_t minSum;
754 int32_t maxSum;
755 int32_t minColL = 0, maxColL = -1;
756 int32_t minColR = 0, maxColR = -1;
757
758 /* Our solution here is to find the darkest and brightest colors in
759 * the 4x4 tile and use those as the two representative colors.
760 * There are probably better algorithms to use (histogram-based).
761 */
762 minSum = 2000; /* big enough */
763 maxSum = -1; /* small enough */
764 for (k = 0; k < N_TEXELS / 2; k++) {
765 if (!ISTBLACK(input[k])) {
766 int32_t sum = 0;
767 for (i = 0; i < n_comp; i++) {
768 sum += input[k][i];
769 }
770 if (minSum > sum) {
771 minSum = sum;
772 minColL = k;
773 }
774 if (maxSum < sum) {
775 maxSum = sum;
776 maxColL = k;
777 }
778 }
779 }
780 minSum = 2000; /* big enough */
781 maxSum = -1; /* small enough */
782 for (; k < N_TEXELS; k++) {
783 if (!ISTBLACK(input[k])) {
784 int32_t sum = 0;
785 for (i = 0; i < n_comp; i++) {
786 sum += input[k][i];
787 }
788 if (minSum > sum) {
789 minSum = sum;
790 minColR = k;
791 }
792 if (maxSum < sum) {
793 maxSum = sum;
794 maxColR = k;
795 }
796 }
797 }
798
799 /* left microtile */
800 if (maxColL == -1) {
801 /* all transparent black */
802 cc[0] = ~0u;
803 for (i = 0; i < n_comp; i++) {
804 vec[0][i] = 0;
805 vec[1][i] = 0;
806 }
807 } else {
808 cc[0] = 0;
809 for (i = 0; i < n_comp; i++) {
810 vec[0][i] = input[minColL][i];
811 vec[1][i] = input[maxColL][i];
812 }
813 if (minColL != maxColL) {
814 /* compute interpolation vector */
815 MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);
816
817 /* add in texels */
818 lolo = 0;
819 for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
820 int32_t texel = n_vect + 1; /* transparent black */
821 if (!ISTBLACK(input[k])) {
822 /* interpolate color */
823 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
824 }
825 /* add in texel */
826 lolo <<= 2;
827 lolo |= texel;
828 }
829 cc[0] = lolo;
830 }
831 }
832
833 /* right microtile */
834 if (maxColR == -1) {
835 /* all transparent black */
836 cc[1] = ~0u;
837 for (i = 0; i < n_comp; i++) {
838 vec[2][i] = 0;
839 vec[3][i] = 0;
840 }
841 } else {
842 cc[1] = 0;
843 for (i = 0; i < n_comp; i++) {
844 vec[2][i] = input[minColR][i];
845 vec[3][i] = input[maxColR][i];
846 }
847 if (minColR != maxColR) {
848 /* compute interpolation vector */
849 MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);
850
851 /* add in texels */
852 lohi = 0;
853 for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
854 int32_t texel = n_vect + 1; /* transparent black */
855 if (!ISTBLACK(input[k])) {
856 /* interpolate color */
857 CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
858 }
859 /* add in texel */
860 lohi <<= 2;
861 lohi |= texel;
862 }
863 cc[1] = lohi;
864 }
865 }
866
867 FX64_MOV32(hi, 9 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
868 for (j = 2 * 2 - 1; j >= 0; j--) {
869 for (i = 0; i < n_comp; i++) {
870 /* add in colors */
871 FX64_SHL(hi, 5);
872 FX64_OR32(hi, vec[j][i] >> 3);
873 }
874 }
875 ((Fx64 *)cc)[1] = hi;
876 }
877
878
/**
 * Encode one opaque block with two RGB end points per microtile and 2-bit
 * selectors (four interpolation steps per microtile).
 *
 * \param cc     receives the four 32-bit words of the encoded block
 * \param input  the N_TEXELS texels of the block (first half: left
 *               microtile, second half: right microtile)
 */
static void
fxt1_quantize_MIXED0 (uint32_t *cc,
                      uint8_t input[N_TEXELS][MAX_COMP])
{
   const int32_t n_vect = 3; /* highest vector number in each microtile */
   const int32_t n_comp = 3; /* 3 components: R, G, B */
   uint8_t vec[2 * 2][MAX_COMP]; /* 2 extrema for each sub-block */
   float b, iv[MAX_COMP]; /* interpolation vector */
   int32_t i, j, k;
   Fx64 hi; /* high quadword */
   uint32_t lohi, lolo; /* low quadword: hi dword, lo dword */

   int32_t minColL = 0, maxColL = 0;
   int32_t minColR = 0, maxColR = 0;
#if 0
   /* disabled alternative: pick the extremes by component sum */
   int32_t minSum;
   int32_t maxSum;

   /* Our solution here is to find the darkest and brightest colors in
    * the 4x4 tile and use those as the two representative colors.
    * There are probably better algorithms to use (histogram-based).
    */
   minSum = 2000; /* big enough */
   maxSum = -1; /* small enough */
   for (k = 0; k < N_TEXELS / 2; k++) {
      int32_t sum = 0;
      for (i = 0; i < n_comp; i++) {
         sum += input[k][i];
      }
      if (minSum > sum) {
         minSum = sum;
         minColL = k;
      }
      if (maxSum < sum) {
         maxSum = sum;
         maxColL = k;
      }
   }
   minSum = 2000; /* big enough */
   maxSum = -1; /* small enough */
   for (; k < N_TEXELS; k++) {
      int32_t sum = 0;
      for (i = 0; i < n_comp; i++) {
         sum += input[k][i];
      }
      if (minSum > sum) {
         minSum = sum;
         minColR = k;
      }
      if (maxSum < sum) {
         maxSum = sum;
         maxColR = k;
      }
   }
#else
   int32_t minVal;
   int32_t maxVal;
   /* component with the largest variance in each microtile */
   int32_t maxVarL = fxt1_variance(input, n_comp);
   int32_t maxVarR = fxt1_variance(&input[N_TEXELS / 2], n_comp);

   /* Scan the channel with max variance for lo & hi
    * and use those as the two representative colors.
    */
   minVal = 2000; /* big enough */
   maxVal = -1; /* small enough */
   for (k = 0; k < N_TEXELS / 2; k++) {
      int32_t t = input[k][maxVarL];
      if (minVal > t) {
         minVal = t;
         minColL = k;
      }
      if (maxVal < t) {
         maxVal = t;
         maxColL = k;
      }
   }
   minVal = 2000; /* big enough */
   maxVal = -1; /* small enough */
   for (; k < N_TEXELS; k++) {
      int32_t t = input[k][maxVarR];
      if (minVal > t) {
         minVal = t;
         minColR = k;
      }
      if (maxVal < t) {
         maxVal = t;
         maxColR = k;
      }
   }
#endif

   /* left microtile */
   cc[0] = 0; /* stays 0 (all selectors 0) when the microtile is flat */
   for (i = 0; i < n_comp; i++) {
      vec[0][i] = input[minColL][i];
      vec[1][i] = input[maxColL][i];
   }
   if (minColL != maxColL) {
      /* compute interpolation vector */
      MAKEIVEC(n_vect, n_comp, iv, b, vec[0], vec[1]);

      /* add in texels */
      lolo = 0;
      for (k = N_TEXELS / 2 - 1; k >= 0; k--) {
         int32_t texel;
         /* interpolate color */
         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
         /* add in texel */
         lolo <<= 2;
         lolo |= texel;
      }

      /* funky encoding for LSB of green */
      /* Bit 1 of the selector word must equal bit 2 of (green1 ^ green0).
       * If it does not, swap the two end points and invert every selector:
       * that flips bit 1 while each texel still selects the same color.
       */
      if ((int32_t)((lolo >> 1) & 1) != (((vec[1][GCOMP] ^ vec[0][GCOMP]) >> 2) & 1)) {
         for (i = 0; i < n_comp; i++) {
            vec[1][i] = input[minColL][i];
            vec[0][i] = input[maxColL][i];
         }
         lolo = ~lolo;
      }

      cc[0] = lolo;
   }

   /* right microtile */
   cc[1] = 0; /* stays 0 (all selectors 0) when the microtile is flat */
   for (i = 0; i < n_comp; i++) {
      vec[2][i] = input[minColR][i];
      vec[3][i] = input[maxColR][i];
   }
   if (minColR != maxColR) {
      /* compute interpolation vector */
      MAKEIVEC(n_vect, n_comp, iv, b, vec[2], vec[3]);

      /* add in texels */
      lohi = 0;
      for (k = N_TEXELS - 1; k >= N_TEXELS / 2; k--) {
         int32_t texel;
         /* interpolate color */
         CALCCDOT(texel, n_vect, n_comp, iv, b, input[k]);
         /* add in texel */
         lohi <<= 2;
         lohi |= texel;
      }

      /* funky encoding for LSB of green */
      /* same end point swap / selector inversion as for the left microtile */
      if ((int32_t)((lohi >> 1) & 1) != (((vec[3][GCOMP] ^ vec[2][GCOMP]) >> 2) & 1)) {
         for (i = 0; i < n_comp; i++) {
            vec[3][i] = input[minColR][i];
            vec[2][i] = input[maxColR][i];
         }
         lohi = ~lohi;
      }

      cc[1] = lohi;
   }

   /* high quadword: mode bits plus bit 2 of the green of vec[3] and of
    * vec[1], then the four colours at 5 bits per component
    */
   FX64_MOV32(hi, 8 | (vec[3][GCOMP] & 4) | ((vec[1][GCOMP] >> 1) & 2)); /* chroma = "1" */
   for (j = 2 * 2 - 1; j >= 0; j--) {
      for (i = 0; i < n_comp; i++) {
         /* add in colors */
         FX64_SHL(hi, 5);
         FX64_OR32(hi, vec[j][i] >> 3);
      }
   }
   ((Fx64 *)cc)[1] = hi;
}
1046
1047
1048 static void
fxt1_quantize(uint32_t * cc,const uint8_t * lines[],int32_t comps)1049 fxt1_quantize (uint32_t *cc, const uint8_t *lines[], int32_t comps)
1050 {
1051 int32_t trualpha;
1052 uint8_t reord[N_TEXELS][MAX_COMP];
1053
1054 uint8_t input[N_TEXELS][MAX_COMP];
1055 int32_t i, k, l;
1056
1057 if (comps == 3) {
1058 /* make the whole block opaque */
1059 memset(input, -1, sizeof(input));
1060 }
1061
1062 /* 8 texels each line */
1063 for (l = 0; l < 4; l++) {
1064 for (k = 0; k < 4; k++) {
1065 for (i = 0; i < comps; i++) {
1066 input[k + l * 4][i] = *lines[l]++;
1067 }
1068 }
1069 for (; k < 8; k++) {
1070 for (i = 0; i < comps; i++) {
1071 input[k + l * 4 + 12][i] = *lines[l]++;
1072 }
1073 }
1074 }
1075
1076 /* block layout:
1077 * 00, 01, 02, 03, 08, 09, 0a, 0b
1078 * 10, 11, 12, 13, 18, 19, 1a, 1b
1079 * 04, 05, 06, 07, 0c, 0d, 0e, 0f
1080 * 14, 15, 16, 17, 1c, 1d, 1e, 1f
1081 */
1082
1083 /* [dBorca]
1084 * stupidity flows forth from this
1085 */
1086 l = N_TEXELS;
1087 trualpha = 0;
1088 if (comps == 4) {
1089 /* skip all transparent black texels */
1090 l = 0;
1091 for (k = 0; k < N_TEXELS; k++) {
1092 /* test all components against 0 */
1093 if (!ISTBLACK(input[k])) {
1094 /* texel is not transparent black */
1095 memcpy(reord[l], input[k], 4);
1096 if (reord[l][ACOMP] < (255 - ALPHA_TS)) {
1097 /* non-opaque texel */
1098 trualpha = !0;
1099 }
1100 l++;
1101 }
1102 }
1103 }
1104
1105 #if 0
1106 if (trualpha) {
1107 fxt1_quantize_ALPHA0(cc, input, reord, l);
1108 } else if (l == 0) {
1109 cc[0] = cc[1] = cc[2] = -1;
1110 cc[3] = 0;
1111 } else if (l < N_TEXELS) {
1112 fxt1_quantize_HI(cc, input, reord, l);
1113 } else {
1114 fxt1_quantize_CHROMA(cc, input);
1115 }
1116 (void)fxt1_quantize_ALPHA1;
1117 (void)fxt1_quantize_MIXED1;
1118 (void)fxt1_quantize_MIXED0;
1119 #else
1120 if (trualpha) {
1121 fxt1_quantize_ALPHA1(cc, input);
1122 } else if (l == 0) {
1123 cc[0] = cc[1] = cc[2] = ~0u;
1124 cc[3] = 0;
1125 } else if (l < N_TEXELS) {
1126 fxt1_quantize_MIXED1(cc, input);
1127 } else {
1128 fxt1_quantize_MIXED0(cc, input);
1129 }
1130 (void)fxt1_quantize_ALPHA0;
1131 (void)fxt1_quantize_HI;
1132 (void)fxt1_quantize_CHROMA;
1133 #endif
1134 }
1135
1136
1137
/**
 * Enlarge an image by tiling (replicating) it, not by stretching it.
 * Used when the image width or height is smaller than what the block
 * encoder needs.
 *
 * \param inWidth       source width in texels
 * \param inHeight      source height in texels
 * \param outWidth      destination width in texels, >= \p inWidth
 * \param outHeight     destination height in texels, >= \p inHeight
 * \param comps         components (bytes) per texel
 * \param src           source image
 * \param srcRowStride  distance in bytes between two source rows
 * \param dest          destination image, tightly packed
 *                      (outWidth * outHeight * comps bytes)
 */
static void
upscale_teximage2d(int32_t inWidth, int32_t inHeight,
                   int32_t outWidth, int32_t outHeight,
                   int32_t comps, const uint8_t *src, int32_t srcRowStride,
                   uint8_t *dest )
{
   uint8_t *out = dest; /* the destination is filled strictly in order */

   assert(outWidth >= inWidth);
   assert(outHeight >= inHeight);

   for (int32_t y = 0; y < outHeight; y++) {
      /* wrap around vertically */
      const uint8_t *srcRow = src + (y % inHeight) * srcRowStride;

      for (int32_t x = 0; x < outWidth; x++) {
         /* wrap around horizontally */
         const uint8_t *texel = srcRow + (x % inWidth) * comps;

         for (int32_t c = 0; c < comps; c++) {
            *out++ = texel[c];
         }
      }
   }
}
1170
1171
/**
 * Compress an RGB or RGBA image.
 *
 * The image is processed in blocks of 8x4 texels, each producing 128 bits
 * of output.  If the width is not a multiple of 8 or the height not a
 * multiple of 4, the image is first tiled into a temporary copy of the
 * next suitable size.  If that copy cannot be allocated the function
 * returns without writing anything.
 *
 * \param width          image width in texels
 * \param height         image height in texels
 * \param comps          components per source texel (3 or 4)
 * \param source         source image, one byte per component
 * \param srcRowStride   distance in bytes between two source rows
 * \param dest           receives the encoded blocks
 * \param destRowStride  distance in bytes between two rows of encoded blocks
 */
static void
fxt1_encode (uint32_t width, uint32_t height, int32_t comps,
             const void *source, int32_t srcRowStride,
             void *dest, int32_t destRowStride)
{
   uint32_t x, y;
   const uint8_t *data;
   uint32_t *encoded = (uint32_t *)dest;
   void *newSource = NULL;

   assert(comps == 3 || comps == 4);

   /* Replicate image if width is not M8 or height is not M4 */
   if ((width & 7) | (height & 3)) {
      int32_t newWidth = (width + 7) & ~7;
      int32_t newHeight = (height + 3) & ~3;
      /* multiply in size_t so that the byte count cannot overflow int32_t */
      newSource = malloc((size_t)comps * newWidth * newHeight);
      if (!newSource)
         return;
      upscale_teximage2d(width, height, newWidth, newHeight,
                         comps, (const uint8_t *) source,
                         srcRowStride, (uint8_t *) newSource);
      source = newSource;
      width = newWidth;
      height = newHeight;
      srcRowStride = comps * newWidth;
   }

   data = (const uint8_t *) source;
   /* A row of blocks takes width / 8 * 16 = width * 2 bytes; convert the
    * remaining padding of each destination row to a count of 32-bit words.
    */
   destRowStride = (destRowStride - width * 2) / 4;
   for (y = 0; y < height; y += 4) {
      uint32_t offs = 0 + (y + 0) * srcRowStride;
      for (x = 0; x < width; x += 8) {
         /* the four scanlines of the current block */
         const uint8_t *lines[4];
         lines[0] = &data[offs];
         lines[1] = lines[0] + srcRowStride;
         lines[2] = lines[1] + srcRowStride;
         lines[3] = lines[2] + srcRowStride;
         offs += 8 * comps;
         fxt1_quantize(encoded, lines, comps);
         /* 128 bits per 8x4 block */
         encoded += 4;
      }
      encoded += destRowStride;
   }

   free(newSource);
}
1220
1221
1222 /***************************************************************************\
1223 * FXT1 decoder
1224 *
1225 * The decoder is based on GL_3DFX_texture_compression_FXT1
1226 * specification and serves as a concept for the encoder.
1227 \***************************************************************************/
1228
1229
/* lookup table for scaling 5 bit colors up to 8 bits
 *
 * Entry i equals i * 255 / 31 rounded to the nearest integer, so 0 maps to
 * 0 and 31 maps to 255.  Indexed through the UP5() macro, which masks the
 * index to 5 bits.
 */
static const uint8_t _rgb_scale_5[] = {
   0, 8, 16, 25, 33, 41, 49, 58,
   66, 74, 82, 90, 99, 107, 115, 123,
   132, 140, 148, 156, 165, 173, 181, 189,
   197, 206, 214, 222, 230, 239, 247, 255
};
1237
/* lookup table for scaling 6 bit colors up to 8 bits
 *
 * Entry i equals i * 255 / 63 rounded to the nearest integer, so 0 maps to
 * 0 and 63 maps to 255.  Indexed through the UP6() macro, which builds the
 * 6-bit index from a 5-bit value plus one extra low bit.
 */
static const uint8_t _rgb_scale_6[] = {
   0, 4, 8, 12, 16, 20, 24, 28,
   32, 36, 40, 45, 49, 53, 57, 61,
   65, 69, 73, 77, 81, 85, 89, 93,
   97, 101, 105, 109, 113, 117, 121, 125,
   130, 134, 138, 142, 146, 150, 154, 158,
   162, 166, 170, 174, 178, 182, 186, 190,
   194, 198, 202, 206, 210, 215, 219, 223,
   227, 231, 235, 239, 243, 247, 251, 255
};
1249
1250
/* CC_SEL(cc, which): treat cc as an array of 32-bit words, pick the word
 * that contains bit number `which` and shift it right so that bit lands at
 * bit 0.  Nothing is masked off, and a field that crosses a word boundary
 * is NOT stitched together; callers mask the result (usually via UP5/UP6).
 * Note that the cast drops any const qualifier on cc.
 */
#define CC_SEL(cc, which) (((uint32_t *)(cc))[(which) / 32] >> ((which) & 31))
/* UP5(c): expand the low 5 bits of c to an 8-bit value. */
#define UP5(c) _rgb_scale_5[(c) & 31]
/* UP6(c, b): expand a 6-bit value to 8 bits; the low 5 bits of c are the
 * upper five bits of the index and the low bit of b is its least
 * significant bit.
 */
#define UP6(c, b) _rgb_scale_6[(((c) & 31) << 1) | ((b) & 1)]
/* LERP(n, t, c0, c1): integer interpolation from c0 (t == 0) to c1
 * (t == n) in n steps, rounded to nearest by the + n/2 term.
 */
#define LERP(n, t, c0, c1) (((n) - (t)) * (c0) + (t) * (c1) + (n) / 2) / (n)
1255
1256
1257 static void
fxt1_decode_1HI(const uint8_t * code,int32_t t,uint8_t * rgba)1258 fxt1_decode_1HI (const uint8_t *code, int32_t t, uint8_t *rgba)
1259 {
1260 const uint32_t *cc;
1261
1262 t *= 3;
1263 cc = (const uint32_t *)(code + t / 8);
1264 t = (cc[0] >> (t & 7)) & 7;
1265
1266 if (t == 7) {
1267 rgba[RCOMP] = rgba[GCOMP] = rgba[BCOMP] = rgba[ACOMP] = 0;
1268 } else {
1269 uint8_t r, g, b;
1270 cc = (const uint32_t *)(code + 12);
1271 if (t == 0) {
1272 b = UP5(CC_SEL(cc, 0));
1273 g = UP5(CC_SEL(cc, 5));
1274 r = UP5(CC_SEL(cc, 10));
1275 } else if (t == 6) {
1276 b = UP5(CC_SEL(cc, 15));
1277 g = UP5(CC_SEL(cc, 20));
1278 r = UP5(CC_SEL(cc, 25));
1279 } else {
1280 b = LERP(6, t, UP5(CC_SEL(cc, 0)), UP5(CC_SEL(cc, 15)));
1281 g = LERP(6, t, UP5(CC_SEL(cc, 5)), UP5(CC_SEL(cc, 20)));
1282 r = LERP(6, t, UP5(CC_SEL(cc, 10)), UP5(CC_SEL(cc, 25)));
1283 }
1284 rgba[RCOMP] = r;
1285 rgba[GCOMP] = g;
1286 rgba[BCOMP] = b;
1287 rgba[ACOMP] = 255;
1288 }
1289 }
1290
1291
1292 static void
fxt1_decode_1CHROMA(const uint8_t * code,int32_t t,uint8_t * rgba)1293 fxt1_decode_1CHROMA (const uint8_t *code, int32_t t, uint8_t *rgba)
1294 {
1295 const uint32_t *cc;
1296 uint32_t kk;
1297
1298 cc = (const uint32_t *)code;
1299 if (t & 16) {
1300 cc++;
1301 t &= 15;
1302 }
1303 t = (cc[0] >> (t * 2)) & 3;
1304
1305 t *= 15;
1306 cc = (const uint32_t *)(code + 8 + t / 8);
1307 kk = cc[0] >> (t & 7);
1308 rgba[BCOMP] = UP5(kk);
1309 rgba[GCOMP] = UP5(kk >> 5);
1310 rgba[RCOMP] = UP5(kk >> 10);
1311 rgba[ACOMP] = 255;
1312 }
1313
1314
/**
 * Decode one texel of a block handled by the "mixed" decoder.
 *
 * The block holds four 5:5:5 colors.  Texels 0..15 (t & 16 clear) use
 * colors 0 and 1 with selectors from word 0; texels 16..31 use colors 2
 * and 3 with selectors from word 1.  Bit 124 chooses between two ways of
 * interpreting the 2-bit selector (see the two branches below).
 *
 * \param code  the 16-byte block
 * \param t     texel index within the block
 * \param rgba  receives the four decoded 8-bit components
 */
static void
fxt1_decode_1MIXED (const uint8_t *code, int32_t t, uint8_t *rgba)
{
   const uint32_t *cc;
   uint32_t col[2][3];
   int32_t glsb, selb;

   cc = (const uint32_t *)code;
   if (t & 16) {
      t &= 15;
      t = (cc[1] >> (t * 2)) & 3;
      /* col 2 */
      /* Blue of color 2 starts at bit 94 (11 * 8 + 6) and crosses the
       * boundary between words 2 and 3, which CC_SEL cannot handle, hence
       * the byte-offset load.
       */
      col[0][BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
      col[0][GCOMP] = CC_SEL(cc, 99);
      col[0][RCOMP] = CC_SEL(cc, 104);
      /* col 3 */
      col[1][BCOMP] = CC_SEL(cc, 109);
      col[1][GCOMP] = CC_SEL(cc, 114);
      col[1][RCOMP] = CC_SEL(cc, 119);
      /* Only bit 0 of glsb/selb matters; UP6 masks the rest. */
      glsb = CC_SEL(cc, 126);
      selb = CC_SEL(cc, 33);
   } else {
      t = (cc[0] >> (t * 2)) & 3;
      /* col 0 */
      col[0][BCOMP] = CC_SEL(cc, 64);
      col[0][GCOMP] = CC_SEL(cc, 69);
      col[0][RCOMP] = CC_SEL(cc, 74);
      /* col 1 */
      col[1][BCOMP] = CC_SEL(cc, 79);
      col[1][GCOMP] = CC_SEL(cc, 84);
      col[1][RCOMP] = CC_SEL(cc, 89);
      /* Only bit 0 of glsb/selb matters; UP6 masks the rest. */
      glsb = CC_SEL(cc, 125);
      selb = CC_SEL(cc, 1);
   }

   if (CC_SEL(cc, 124) & 1) {
      /* alpha[0] == 1 */
      /* Selector meaning: 0 = first color, 1 = average of both colors,
       * 2 = second color, 3 = transparent black.  Only the second color's
       * green gets the extra low bit (glsb).
       */

      if (t == 3) {
         /* zero */
         rgba[RCOMP] = rgba[BCOMP] = rgba[GCOMP] = rgba[ACOMP] = 0;
      } else {
         uint8_t r, g, b;
         if (t == 0) {
            b = UP5(col[0][BCOMP]);
            g = UP5(col[0][GCOMP]);
            r = UP5(col[0][RCOMP]);
         } else if (t == 2) {
            b = UP5(col[1][BCOMP]);
            g = UP6(col[1][GCOMP], glsb);
            r = UP5(col[1][RCOMP]);
         } else {
            b = (UP5(col[0][BCOMP]) + UP5(col[1][BCOMP])) / 2;
            g = (UP5(col[0][GCOMP]) + UP6(col[1][GCOMP], glsb)) / 2;
            r = (UP5(col[0][RCOMP]) + UP5(col[1][RCOMP])) / 2;
         }
         rgba[RCOMP] = r;
         rgba[GCOMP] = g;
         rgba[BCOMP] = b;
         rgba[ACOMP] = 255;
      }
   } else {
      /* alpha[0] == 0 */
      /* Selector 0..3 interpolates in thirds from the first color to the
       * second; the result is always opaque.  Green is 6 bits wide for
       * both colors: the second color's low bit is glsb, the first
       * color's is glsb ^ selb.
       */
      uint8_t r, g, b;
      if (t == 0) {
         b = UP5(col[0][BCOMP]);
         g = UP6(col[0][GCOMP], glsb ^ selb);
         r = UP5(col[0][RCOMP]);
      } else if (t == 3) {
         b = UP5(col[1][BCOMP]);
         g = UP6(col[1][GCOMP], glsb);
         r = UP5(col[1][RCOMP]);
      } else {
         b = LERP(3, t, UP5(col[0][BCOMP]), UP5(col[1][BCOMP]));
         g = LERP(3, t, UP6(col[0][GCOMP], glsb ^ selb),
                        UP6(col[1][GCOMP], glsb));
         r = LERP(3, t, UP5(col[0][RCOMP]), UP5(col[1][RCOMP]));
      }
      rgba[RCOMP] = r;
      rgba[GCOMP] = g;
      rgba[BCOMP] = b;
      rgba[ACOMP] = 255;
   }
}
1399
1400
/**
 * Decode one texel of a block handled by the "alpha" decoder.
 *
 * The block holds three 5:5:5 colors (from bit 64 on) and three 5-bit
 * alpha values (bits 109, 114 and 119).  Bit 124 ("lerp") selects how the
 * per-texel 2-bit selector is interpreted:
 *  - lerp == 1: interpolate in thirds between an endpoint that depends on
 *    the half of the block (color/alpha 0 for texels 0..15, color/alpha 2
 *    for texels 16..31) and color/alpha 1;
 *  - lerp == 0: selector 0..2 picks color/alpha 0..2 directly, selector 3
 *    is transparent black.
 *
 * \param code  the 16-byte block
 * \param t     texel index within the block
 * \param rgba  receives the four decoded 8-bit components
 */
static void
fxt1_decode_1ALPHA (const uint8_t *code, int32_t t, uint8_t *rgba)
{
   const uint32_t *cc;
   uint8_t r, g, b, a;

   cc = (const uint32_t *)code;
   if (CC_SEL(cc, 124) & 1) {
      /* lerp == 1 */
      uint32_t col0[4];

      if (t & 16) {
         t &= 15;
         t = (cc[1] >> (t * 2)) & 3;
         /* col 2 */
         /* Blue of color 2 starts at bit 94 (11 * 8 + 6) and crosses a
          * word boundary, so it is fetched with a byte-offset load.
          */
         col0[BCOMP] = (*(const uint32_t *)(code + 11)) >> 6;
         col0[GCOMP] = CC_SEL(cc, 99);
         col0[RCOMP] = CC_SEL(cc, 104);
         col0[ACOMP] = CC_SEL(cc, 119);
      } else {
         t = (cc[0] >> (t * 2)) & 3;
         /* col 0 */
         col0[BCOMP] = CC_SEL(cc, 64);
         col0[GCOMP] = CC_SEL(cc, 69);
         col0[RCOMP] = CC_SEL(cc, 74);
         col0[ACOMP] = CC_SEL(cc, 109);
      }

      /* The far endpoint is always color 1 (bits 79/84/89, alpha 114). */
      if (t == 0) {
         b = UP5(col0[BCOMP]);
         g = UP5(col0[GCOMP]);
         r = UP5(col0[RCOMP]);
         a = UP5(col0[ACOMP]);
      } else if (t == 3) {
         b = UP5(CC_SEL(cc, 79));
         g = UP5(CC_SEL(cc, 84));
         r = UP5(CC_SEL(cc, 89));
         a = UP5(CC_SEL(cc, 114));
      } else {
         b = LERP(3, t, UP5(col0[BCOMP]), UP5(CC_SEL(cc, 79)));
         g = LERP(3, t, UP5(col0[GCOMP]), UP5(CC_SEL(cc, 84)));
         r = LERP(3, t, UP5(col0[RCOMP]), UP5(CC_SEL(cc, 89)));
         a = LERP(3, t, UP5(col0[ACOMP]), UP5(CC_SEL(cc, 114)));
      }
   } else {
      /* lerp == 0 */

      if (t & 16) {
         cc++;
         t &= 15;
      }
      t = (cc[0] >> (t * 2)) & 3;

      if (t == 3) {
         /* zero */
         r = g = b = a = 0;
      } else {
         uint32_t kk;
         cc = (const uint32_t *)code;
         /* Alpha t sits at bit 109 + 5 * t, i.e. bit 13 + 5 * t of word 3. */
         a = UP5(cc[3] >> (t * 5 + 13));
         /* Color t sits at bit 64 + 15 * t; load from the byte containing
          * its first bit and shift off the leftover bits.  With t <= 2 the
          * load ends at byte 14, inside the block.
          */
         t *= 15;
         cc = (const uint32_t *)(code + 8 + t / 8);
         kk = cc[0] >> (t & 7);
         b = UP5(kk);
         g = UP5(kk >> 5);
         r = UP5(kk >> 10);
      }
   }
   rgba[RCOMP] = r;
   rgba[GCOMP] = g;
   rgba[BCOMP] = b;
   rgba[ACOMP] = a;
}
1474
1475
/**
 * Decode a single texel of an FXT1 image.
 *
 * Finds the 16-byte block that contains texel (i, j), turns the position
 * inside that 8x4 block into a texel index and hands off to the decoder
 * selected by the top three bits (125..127) of the block.
 *
 * \param texture  first block of the image
 * \param stride   image width in pixels; stride / 8 blocks make up one row
 * \param i, j     texel column and row
 * \param rgba     receives the four decoded 8-bit components
 */
static void
fxt1_decode_1 (const void *texture, int32_t stride, /* in pixels */
               int32_t i, int32_t j, uint8_t *rgba)
{
   const int32_t blocksPerRow = stride / 8;
   const int32_t blockIndex = (j / 4) * blocksPerRow + (i / 8);
   const uint8_t *code = (const uint8_t *)texture + blockIndex * 16;

   /* Texel index layout: bits 0..1 = column within a 4-wide half,
    * bits 2..3 = row, bit 4 = right half of the block (columns 4..7).
    */
   const int32_t t = (i & 3) | ((j & 3) << 2) | ((i & 4) << 2);

   switch (CC_SEL(code, 125)) {
   case 0:
   case 1:
      /* cc-high = "00?" */
      fxt1_decode_1HI(code, t, rgba);
      break;
   case 2:
      /* cc-chroma = "010" */
      fxt1_decode_1CHROMA(code, t, rgba);
      break;
   case 3:
      /* alpha = "011" */
      fxt1_decode_1ALPHA(code, t, rgba);
      break;
   default:
      /* mixed = "1??" */
      fxt1_decode_1MIXED(code, t, rgba);
      break;
   }
}
1503
1504 /*
1505 * Pixel fetch within a block.
1506 */
1507
/**
 * Fetch one texel of an FXT1 RGB block as four 8-bit components.
 *
 * \param dst   receives R, G, B, A
 * \param src   the block to read from
 * \param i, j  texel column and row relative to src
 *
 * The RGB format has no alpha channel, so alpha is forced to opaque even
 * when the block decoder produces a transparent texel.  This matches the
 * float fetch and the block unpack path of the same format.
 */
void
util_format_fxt1_rgb_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
   dst[3] = 0xff;
}
1513
/**
 * Fetch one texel of an FXT1 RGBA block as four 8-bit components.
 *
 * \param dst   receives R, G, B, A
 * \param src   the block to read from
 * \param i, j  texel column and row relative to src
 *
 * The decoded alpha is returned as is.  Overwriting it with 0xff (as this
 * function used to do) discarded the alpha channel of the RGBA format and
 * disagreed with the float fetch and the block unpack path.
 */
void
util_format_fxt1_rgba_fetch_rgba_8unorm(uint8_t *restrict dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   fxt1_decode_1(src, 0, i, j, dst);
}
1520
/**
 * Fetch one texel of an FXT1 RGB block as four floats.
 *
 * \param in_dst  receives R, G, B, A as float
 * \param src     the block to read from
 * \param i, j    texel column and row relative to src
 *
 * Alpha is always written as 1.0; the decoded alpha is ignored.
 */
void
util_format_fxt1_rgb_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   uint8_t texel[4];
   float *out = in_dst;

   fxt1_decode_1(src, 0, i, j, texel);

   for (unsigned c = 0; c < 3; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
   out[3] = 1.0;
}
1532
/**
 * Fetch one texel of an FXT1 RGBA block as four floats.
 *
 * \param in_dst  receives R, G, B, A as float
 * \param src     the block to read from
 * \param i, j    texel column and row relative to src
 *
 * All four decoded components, alpha included, are converted.
 */
void
util_format_fxt1_rgba_fetch_rgba(void *restrict in_dst, const uint8_t *restrict src, unsigned i, unsigned j)
{
   uint8_t texel[4];
   float *out = in_dst;

   fxt1_decode_1(src, 0, i, j, texel);

   for (unsigned c = 0; c < 4; c++) {
      out[c] = ubyte_to_float(texel[c]);
   }
}
1544
1545 /*
1546 * Block decompression.
1547 */
1548
1549 static inline void
util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t * restrict dst_row,unsigned dst_stride,const uint8_t * restrict src_row,unsigned src_stride,unsigned width,unsigned height,bool rgba)1550 util_format_fxtn_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
1551 const uint8_t *restrict src_row, unsigned src_stride,
1552 unsigned width, unsigned height,
1553 bool rgba)
1554 {
1555 const unsigned bw = 8, bh = 4, comps = 4;
1556 unsigned x, y, i, j;
1557 for (y = 0; y < height; y += bh) {
1558 const uint8_t *src = src_row;
1559 for (x = 0; x < width; x += bw) {
1560 for (j = 0; j < bh; ++j) {
1561 for (i = 0; i < bw; ++i) {
1562 uint8_t *dst = dst_row + (y + j) * dst_stride / sizeof(*dst_row) + (x + i) * comps;
1563 fxt1_decode_1(src, 0, i, j, dst);
1564 if (!rgba)
1565 dst[3] = 0xff;
1566 }
1567 }
1568 src += FXT1_BLOCK_SIZE;
1569 }
1570 src_row += src_stride;
1571 }
1572 }
1573
/**
 * Decompress FXT1 RGB blocks into 8-bit RGBA texels.
 *
 * Forwards to util_format_fxtn_rgb_unpack_rgba_8unorm() with rgba = false,
 * which makes the helper overwrite every alpha value with 0xff.
 */
void
util_format_fxt1_rgb_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                        const uint8_t *restrict src_row, unsigned src_stride,
                                        unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride,
                                           src_row, src_stride,
                                           width, height,
                                           false);
}
1584
/**
 * Decompress FXT1 RGBA blocks into 8-bit RGBA texels.
 *
 * Forwards to util_format_fxtn_rgb_unpack_rgba_8unorm() with rgba = true,
 * so the alpha produced by the block decoder is kept.
 */
void
util_format_fxt1_rgba_unpack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                         const uint8_t *restrict src_row, unsigned src_stride,
                                         unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_8unorm(dst_row, dst_stride,
                                           src_row, src_stride,
                                           width, height,
                                           true);
}
1595
1596 static inline void
util_format_fxtn_rgb_unpack_rgba_float(float * dst_row,unsigned dst_stride,const uint8_t * restrict src_row,unsigned src_stride,unsigned width,unsigned height,bool rgba)1597 util_format_fxtn_rgb_unpack_rgba_float(float *dst_row, unsigned dst_stride,
1598 const uint8_t *restrict src_row, unsigned src_stride,
1599 unsigned width, unsigned height,
1600 bool rgba)
1601 {
1602 const unsigned bw = 8, bh = 4, comps = 4;
1603 unsigned x, y, i, j;
1604 for (y = 0; y < height; y += 4) {
1605 const uint8_t *src = src_row;
1606 for (x = 0; x < width; x += 8) {
1607 for (j = 0; j < bh; ++j) {
1608 for (i = 0; i < bw; ++i) {
1609 float *dst = dst_row + (y + j)*dst_stride/sizeof(*dst_row) + (x + i) * comps;
1610 uint8_t tmp[4];
1611 fxt1_decode_1(src, 0, i, j, tmp);
1612 dst[0] = ubyte_to_float(tmp[0]);
1613 dst[1] = ubyte_to_float(tmp[1]);
1614 dst[2] = ubyte_to_float(tmp[2]);
1615 if (rgba)
1616 dst[3] = ubyte_to_float(tmp[3]);
1617 else
1618 dst[3] = 1.0;
1619 }
1620 }
1621 src += FXT1_BLOCK_SIZE;
1622 }
1623 src_row += src_stride;
1624 }
1625 }
1626
/**
 * Decompress FXT1 RGB blocks into float RGBA texels.
 *
 * Forwards to util_format_fxtn_rgb_unpack_rgba_float() with rgba = false,
 * which makes the helper write 1.0 as alpha for every texel.
 */
void
util_format_fxt1_rgb_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src_row, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride,
                                          src_row, src_stride,
                                          width, height,
                                          false);
}
1637
/**
 * Decompress FXT1 RGBA blocks into float RGBA texels.
 *
 * Forwards to util_format_fxtn_rgb_unpack_rgba_float() with rgba = true,
 * so the alpha produced by the block decoder is converted and kept.
 */
void
util_format_fxt1_rgba_unpack_rgba_float(void *restrict dst_row, unsigned dst_stride,
                                        const uint8_t *restrict src_row, unsigned src_stride,
                                        unsigned width, unsigned height)
{
   util_format_fxtn_rgb_unpack_rgba_float(dst_row, dst_stride,
                                          src_row, src_stride,
                                          width, height,
                                          true);
}
1648
1649 /*
1650 * Block compression.
1651 */
1652
/**
 * Compress 8-bit RGBA texels to FXT1 RGB.
 *
 * \param dst_row     destination for the encoded blocks
 * \param dst_stride  distance in bytes between rows of blocks
 * \param src         source texels, four bytes each
 * \param src_stride  distance in bytes between source rows
 * \param width       image width in texels
 * \param height      image height in texels
 *
 * If the temporary buffer cannot be allocated the function returns without
 * writing anything to dst_row.
 */
void
util_format_fxt1_rgb_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const uint8_t *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   /* The encoder for FXT1_RGB wants 24bpp packed rgb, so make a temporary to do that.
    */
   const unsigned temp_stride = width * 3;
   /* Size and offsets are computed in size_t and the counters are unsigned
    * like the dimensions: this avoids signed/unsigned comparisons and
    * int overflow in the index arithmetic for large images.
    */
   uint8_t *temp = malloc((size_t)height * temp_stride);
   if (!temp)
      return;

   for (unsigned y = 0; y < height; y++) {
      uint8_t *packed = temp + (size_t)y * temp_stride;
      for (unsigned x = 0; x < width; x++) {
         /* Copy R, G, B and drop the fourth source byte. */
         packed[(size_t)x * 3 + 0] = src[(size_t)x * 4 + 0];
         packed[(size_t)x * 3 + 1] = src[(size_t)x * 4 + 1];
         packed[(size_t)x * 3 + 2] = src[(size_t)x * 4 + 2];
      }
      src += src_stride;
   }

   fxt1_encode(width, height, 3, temp, temp_stride, dst_row, dst_stride);

   free(temp);
}
1678
/**
 * Compress 8-bit RGBA texels to FXT1 RGBA.
 *
 * The source already has the four-components-per-texel layout the encoder
 * accepts, so it is passed to fxt1_encode() directly with comps = 4.
 */
void
util_format_fxt1_rgba_pack_rgba_8unorm(uint8_t *restrict dst_row, unsigned dst_stride,
                                       const uint8_t *restrict src, unsigned src_stride,
                                       unsigned width, unsigned height)
{
   fxt1_encode(width, height, 4, src, src_stride, dst_row, dst_stride);
}
1686
/**
 * Compress float RGBA texels to FXT1 RGB.
 *
 * The floats are first packed into a temporary four-bytes-per-texel image
 * with util_format_r8g8b8a8_unorm_pack_rgba_float(), which is then handed
 * to the 8-bit FXT1 RGB packer.  Nothing is written to dst_row if the
 * temporary image cannot be allocated.
 */
void
util_format_fxt1_rgb_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                     const float *restrict src, unsigned src_stride,
                                     unsigned width, unsigned height)
{
   const int rgba8_stride = width * 4;
   uint8_t *rgba8 = malloc(height * rgba8_stride);

   if (rgba8) {
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, rgba8_stride,
                                                 src, src_stride,
                                                 width, height);

      util_format_fxt1_rgb_pack_rgba_8unorm(dst_row, dst_stride,
                                            rgba8, rgba8_stride,
                                            width, height);

      free(rgba8);
   }
}
1707
/**
 * Compress float RGBA texels to FXT1 RGBA.
 *
 * The floats are first packed into a temporary four-bytes-per-texel image
 * with util_format_r8g8b8a8_unorm_pack_rgba_float(), which is then handed
 * to the 8-bit FXT1 RGBA packer.  Nothing is written to dst_row if the
 * temporary image cannot be allocated.
 */
void
util_format_fxt1_rgba_pack_rgba_float(uint8_t *restrict dst_row, unsigned dst_stride,
                                      const float *restrict src, unsigned src_stride,
                                      unsigned width, unsigned height)
{
   const int rgba8_stride = width * 4;
   uint8_t *rgba8 = malloc(height * rgba8_stride);

   if (rgba8) {
      util_format_r8g8b8a8_unorm_pack_rgba_float(rgba8, rgba8_stride,
                                                 src, src_stride,
                                                 width, height);

      util_format_fxt1_rgba_pack_rgba_8unorm(dst_row, dst_stride,
                                             rgba8, rgba8_stride,
                                             width, height);

      free(rgba8);
   }
}
1728