/* libs/pixelflinger/codeflinger/texturing.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#define LOG_TAG "pixelflinger-code"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

#include <log/log.h>

#include "GGLAssembler.h"

namespace android {

// ---------------------------------------------------------------------------

// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// ((x<<16 + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ( (x*dx) + dx>>1 ) + x0
// (x*dx) + (dx>>1 + x0)
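//
// A plain-C sketch of the same initialization (illustrative only):
//   c = x*dx + c0;    // with c0 = (dx>>1) + x0 folded into one constant
// which is what the MLA in init_iterated_color() below computes, assuming
// the ydady/ydrdy/... context slots hold the precombined constant.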

void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;

    if (mSmooth) {
        // NOTE: we could take this case in the mDithering + !mSmooth case,
        // but this would use up to 4 more registers for the color components
        // for only a little added quality.
        // Currently, this causes the system to run out of registers in
        // some cases (see issue #719496)

        comment("compute initial iterated color (smooth and/or dither case)");

        parts.iterated_packed = 0;
        parts.packed = 0;

        // 0x1: color component
        // 0x2: iterators
        const int optReload = mOptLevel >> 1;
        if (optReload >= 3)         parts.reload = 0; // reload nothing
        else if (optReload == 2)    parts.reload = 2; // reload iterators
        else if (optReload == 1)    parts.reload = 1; // reload colors
        else if (optReload <= 0)    parts.reload = 3; // reload both

        if (!mSmooth) {
            // we're not smoothing (just dithering), so we never have to
            // reload the iterators
            parts.reload &= ~2;
        }

        Scratch scratches(registerFile());
        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
        for (int i=0 ; i<4 ; i++) {
            if (!mInfo[i].iterated)
                continue;

            // this component exists in the destination and is not replaced
            // by a texture unit.
            const int c = (parts.reload & 1) ? t0 : obtainReg();
            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
            parts.argb[i].reg = c;

            if (mInfo[i].smooth) {
                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
                const int dvdx = parts.argb_dx[i].reg;
                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
                MLA(AL, 0, c, x.reg, dvdx, c);

                // adjust the color iterator to make sure it won't overflow
                if (!mAA) {
                    // this is not needed when we're using anti-aliasing
                    // because we will (have to) clamp the components
                    // anyway.
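                    // In C terms (sketch):
                    //   end = c + dvdx*(count>>16);   // value at span end
                    //   if (end < 0) c -= end;        // shift so end == 0
                    //   c = max(c, 0);                // BIC with sign bits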
                    int end = scratches.obtain();
                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
                    MLA(AL, 1, end, dvdx, end, c);
                    SUB(MI, 0, c, c, end);
                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
                    scratches.recycle(end);
                }
            }

            if (parts.reload & 1) {
                CONTEXT_STORE(c, generated_vars.argb[i].c);
            }
        }
    } else {
        // We're not smooth-shaded, so we can
        // just use a packed version of the color and extract the
        // components as needed (or not at all if we don't blend)

        // figure out if we need the iterated color
        int load = 0;
        for (int i=0 ; i<4 ; i++) {
            component_info_t& info = mInfo[i];
            if ((info.inDest || info.needed) && !info.replaced)
                load |= 1;
        }

        parts.iterated_packed = 1;
        parts.packed = (!mTextureMachine.mask && !mBlending
                && !mFog && !mDithering);
        parts.reload = 0;
        if (load || parts.packed) {
            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
                comment("load initial iterated color (8888 packed)");
                parts.iterated.setTo(obtainReg(),
                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
                CONTEXT_LOAD(parts.iterated.reg, packed8888);
            } else {
                comment("load initial iterated color (dest format packed)");

                parts.iterated.setTo(obtainReg(), &mCbFormat);

                // pre-mask the iterated color
                const int bits = parts.iterated.size();
                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
                uint32_t mask = 0;
                if (mMasking) {
                    for (int i=0 ; i<4 ; i++) {
                        const int component_mask = 1<<i;
                        const int h = parts.iterated.format.c[i].h;
                        const int l = parts.iterated.format.c[i].l;
                        if (h && (!(mMasking & component_mask))) {
                            mask |= ((1<<(h-l))-1) << l;
                        }
                    }
                }

                if (mMasking && ((mask & size)==0)) {
                    // none of the components are present in the mask
                } else {
                    CONTEXT_LOAD(parts.iterated.reg, packed);
                    if (mCbFormat.size == 1) {
                        AND(AL, 0, parts.iterated.reg,
                                parts.iterated.reg, imm(0xFF));
                    } else if (mCbFormat.size == 2) {
                        MOV(AL, 0, parts.iterated.reg,
                                reg_imm(parts.iterated.reg, LSR, 16));
                    }
                }

                // pre-mask the iterated color
                if (mMasking) {
                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
                            mask, bits);
                }
            }
        }
    }
}

void GGLAssembler::build_iterated_color(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);

    if (!mInfo[component].iterated)
        return;

    if (parts.iterated_packed) {
        // iterated colors are packed, extract the one we need
        extract(fragment, parts.iterated, component);
    } else {
        fragment.h = GGL_COLOR_BITS;
        fragment.l = GGL_COLOR_BITS - 8;
        fragment.flags |= CLEAR_LO;
        // iterated colors are held in their own register,
        // (smooth and/or dithering case)
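        // reload==3 (sketch): fetch the current iterated value from the
        // context and store back value+dx, stepping the iterator in place
        // for the next pixel.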
        if (parts.reload==3) {
            // this implies mSmooth
            Scratch scratches(registerFile());
            int dx = scratches.obtain();
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
            ADD(AL, 0, dx, fragment.reg, dx);
            CONTEXT_STORE(dx, generated_vars.argb[component].c);
        } else if (parts.reload & 1) {
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
        } else {
            // we don't reload, so simply rename the register and mark as
            // non CORRUPTIBLE so that the texture env or blending code
            // won't modify this (renamed) register
            regs.recycle(fragment.reg);
            fragment.reg = parts.argb[component].reg;
            fragment.flags &= ~CORRUPTIBLE;
        }
        if (mInfo[component].smooth && mAA) {
            // when using smooth shading AND anti-aliasing, we need to clamp
            // the iterators because there is always an extra pixel on the
            // edges, which most of the time will cause an overflow
            // (since technically it's outside of the domain).
            BIC(AL, 0, fragment.reg, fragment.reg,
                    reg_imm(fragment.reg, ASR, 31));
            component_sat(fragment);
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
{
    // gather some information about the components we need to process...
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
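    // LOGIC_OP_SRC / LOGIC_OP_DST presumably record whether the op needs to
    // read the source / destination pixel: CLEAR and SET produce a constant
    // (neither), NOOP and INVERT ignore the source, COPY_INVERTED ignores
    // the destination, and the remaining ops need both.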
    switch(opcode) {
    case GGL_COPY:
        mLogicOp = 0;
        break;
    case GGL_CLEAR:
    case GGL_SET:
        mLogicOp = LOGIC_OP;
        break;
    case GGL_AND:
    case GGL_AND_REVERSE:
    case GGL_AND_INVERTED:
    case GGL_XOR:
    case GGL_OR:
    case GGL_NOR:
    case GGL_EQUIV:
    case GGL_OR_REVERSE:
    case GGL_OR_INVERTED:
    case GGL_NAND:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
        break;
    case GGL_NOOP:
    case GGL_INVERT:
        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
        break;
    case GGL_COPY_INVERTED:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
        break;
    };
}

void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
{
    uint8_t replaced=0;
    mTextureMachine.mask = 0;
    mTextureMachine.activeUnits = 0;
    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (replaced == 0xF) {
            // all components are replaced, skip this TMU.
            tmu.format_idx = 0;
            tmu.mask = 0;
            tmu.replaced = replaced;
            continue;
        }
        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
        tmu.format = c->formats[tmu.format_idx];
        tmu.bits = tmu.format.size*8;
        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
                && tmu.format.size!=3; // XXX: only 8, 16 and 32-bit modes for now

        // 5551 linear filtering is not supported
        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
            tmu.linear = 0;

        tmu.mask = 0;
        tmu.replaced = replaced;

        if (tmu.format_idx) {
            mTextureMachine.activeUnits++;
            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
            if (tmu.env == GGL_REPLACE) {
                replaced |= tmu.mask;
            } else if (tmu.env == GGL_DECAL) {
                if (!tmu.format.c[GGLFormat::ALPHA].h) {
                    // if we don't have alpha, decal does nothing
                    tmu.mask = 0;
                } else {
                    // decal always ignores At
                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
                }
            }
        }
        mTextureMachine.mask |= tmu.mask;
        //printf("%d: mask=%08lx, replaced=%08lx\n",
        //    i, int(tmu.mask), int(tmu.replaced));
    }
    mTextureMachine.replaced = replaced;
    mTextureMachine.directTexture = 0;
    //printf("replaced=%08lx\n", mTextureMachine.replaced);
}


void GGLAssembler::init_textures(
        tex_coord_t* coords,
        const reg_t& x, const reg_t& y)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;
    int Rx = x.reg;
    int Ry = y.reg;

    if (mTextureMachine.mask) {
        comment("compute texture coordinates");
    }

    // init texture coordinates for each tmu
    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        {
            // 1:1 texture
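            // In effect (sketch):
            //   ptr = data + (x + (ydsdy>>16) + (y + (ydtdy>>16))*stride)
            // with the scaling to bytes done by base_offset().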
            pointer_t& txPtr = coords[i].ptr;
            txPtr.setTo(obtainReg(), tmu.bits);
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
            // merge base & offset
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);                  // x+y*stride
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            base_offset(txPtr, txPtr, Rx);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = coords[i].s;
            reg_t& t = coords[i].t;
            // s = (x * dsdx)>>16 + ydsdy
            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
            // t = (x * dtdx)>>16 + ydtdy
            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
            s.setTo(obtainReg());
            t.setTo(obtainReg());
            const int need_w = GGL_READ_NEEDS(W, needs.n);
            if (need_w) {
                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
            } else {
                int ydsdy = scratches.obtain();
                int ydtdy = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
            }

            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                recycleReg(s.reg);
                recycleReg(t.reg);
            }
        }

        // direct texture?
        if (!multiTexture && !mBlending && !mDithering && !mFog &&
            cb_format_idx == tmu.format_idx && !tmu.linear &&
            mTextureMachine.replaced == tmu.mask)
        {
            mTextureMachine.directTexture = i + 1;
        }
    }
}

void GGLAssembler::build_textures(fragment_parts_t& parts,
                                  Scratch& regs)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;

    // We don't have a way to spill registers automatically, so we
    // spill the depth and AA registers ourselves when we know we may
    // have to. Build the spill list...
    uint32_t spill_list = 0;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if (tmu.linear) {
            // we may run out of registers if we have linear filtering
            // at 1 or 4 bytes / pixel on any texture unit.
            if (tmu.format.size == 1) {
                // if depth and AA are enabled, we'll be one register short
                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
            if (tmu.format.size == 4) {
                // if depth or AA is enabled, we'll be one or two
                // registers short
                if (parts.z.reg > 0)
                    spill_list |= 1<<parts.z.reg;
                if (parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
        }
    }

    Spill spill(registerFile(), *this, spill_list);

    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        pointer_t& txPtr = parts.coords[i].ptr;
        pixel_t& texel = parts.texel[i];

        // repeat...
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            comment("fetch texel");
            texel.setTo(regs.obtain(), &tmu.format);
            load(txPtr, texel, WRITE_BACK);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = parts.coords[i].s;
            reg_t& t = parts.coords[i].t;
            if ((mOptLevel&1)==0) {
                comment("reload s/t (multitexture or linear filtering)");
                s.reg = scratches.obtain();
                t.reg = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
            }

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            comment("compute repeat/clamp");
            int u      = scratches.obtain();
            int v      = scratches.obtain();
            int width  = scratches.obtain();
            int height = scratches.obtain();
            int U = 0;
            int V = 0;

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
            CONTEXT_LOAD(height, generated_vars.texture[i].height);

            int FRAC_BITS = 0;
            if (tmu.linear) {
                // linear interpolation
                if (tmu.format.size == 1) {
                    // for 8-bit textures, we can afford
                    // 7 bits of fractional precision at no
                    // additional cost (we can't do 8 bits
                    // because filter8 uses signed 16-bit muls)
                    FRAC_BITS = 7;
                } else if (tmu.format.size == 2) {
                    // filter16() is internally limited to 4 bits, so:
                    // FRAC_BITS=2 generates fewer instructions,
                    // FRAC_BITS=3,4,5 create unpleasant artifacts,
                    // FRAC_BITS=6+ looks good
                    FRAC_BITS = 6;
                } else if (tmu.format.size == 4) {
                    // filter32() is internally limited to 8 bits, so:
                    // FRAC_BITS=4 looks good
                    // FRAC_BITS=5+ looks better, but generates 3 extra
                    // instructions per pixel
                    FRAC_BITS = 6;
                } else {
                    // for all other cases we use 4 bits.
                    FRAC_BITS = 4;
                }
            }
            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);

            if (tmu.linear) {
                comment("compute linear filtering offsets");
                // pixel size scale
                const int shift = 31 - gglClz(tmu.format.size);
                U = scratches.obtain();
                V = scratches.obtain();

                if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                    return;

                // sample the texel center
                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));

                // get the fractional part of U,V
                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));

                // compute width-1 and height-1
                SUB(AL, 0, width,  width,  imm(1));
                SUB(AL, 0, height, height, imm(1));

                // get the integer part of U,V and clamp/wrap
                // and compute offset to the next texel
                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
                    // u has already been REPEATed
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, width);
                    CMP(AL, u, width);
                    MOV(LT, 0, width, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
                    RSB(GE, 0, width, width, imm(0));
                } else {
                    // u has not been CLAMPed yet
                    // algorithm:
                    //  if ((u>>4) >= width)
                    //      u = width<<4
                    //      width = 0
                    //  else
                    //      width = 1<<shift
                    //  u = u>>4; // get integer part
                    //  if (u<0)
                    //      u = 0
                    //      width = 0
                    //  generated_vars.rt = width

                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
                    MOV(LE, 0, width, imm(0));
                    MOV(GT, 0, width, imm(1 << shift));
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, imm(0));
                    MOV(MI, 0, width, imm(0));
                }
                CONTEXT_STORE(width, generated_vars.rt);

                const int stride = width;
                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
                    // v has already been REPEATed
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, height);
                    CMP(AL, v, height);
                    MOV(LT, 0, height, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
                    RSB(GE, 0, height, height, imm(0));
                    MUL(AL, 0, height, stride, height);
                } else {
                    // v has not been CLAMPed yet
                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
                    MOV(LE, 0, height, imm(0));
                    if (shift) {
                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
                    } else {
                        MOV(GT, 0, height, stride);
                    }
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, imm(0));
                    MOV(MI, 0, height, imm(0));
                }
                CONTEXT_STORE(height, generated_vars.lb);
            }

            scratches.recycle(width);
            scratches.recycle(height);

            // iterate texture coordinates...
            comment("iterate s,t");
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s.reg, s.reg, dsdx);
            ADD(AL, 0, t.reg, t.reg, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                scratches.recycle(s.reg);
                scratches.recycle(t.reg);
            }
            scratches.recycle(dsdx);
            scratches.recycle(dtdx);

            // merge base & offset...
            comment("merge base & offset");
            texel.setTo(regs.obtain(), &tmu.format);
            txPtr.setTo(texel.reg, tmu.bits);
            int stride = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            SMLABB(AL, u, v, stride, u);    // u+v*stride
            base_offset(txPtr, txPtr, u);

            // load texel
            if (!tmu.linear) {
                comment("fetch texel");
                load(txPtr, texel, 0);
            } else {
                // recycle registers we don't need anymore
                scratches.recycle(u);
                scratches.recycle(v);
                scratches.recycle(stride);

                comment("fetch texel, bilinear");
                switch (tmu.format.size) {
                case 1: filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                }
            }
        }
    }
}

void GGLAssembler::build_iterate_texture_coordinates(
    const fragment_parts_t& parts)
{
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            const pointer_t& txPtr = parts.coords[i].ptr;
            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
        } else {
            Scratch scratches(registerFile());
            int s = parts.coords[i].s.reg;
            int t = parts.coords[i].t.reg;
            if ((mOptLevel&1)==0) {
                s = scratches.obtain();
                t = scratches.obtain();
                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
            }
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();
            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s, s, dsdx);
            ADD(AL, 0, t, t, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
            }
        }
    }
}

void GGLAssembler::filter8(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    if (tmu.format.components != GGL_ALPHA &&
        tmu.format.components != GGL_LUMINANCE)
    {
        // this is a packed format, and we don't support
        // linear filtering (it's probably RGB 332)
        // Should not happen with OpenGL|ES
        LDRB(AL, texel.reg, txPtr.reg);
        return;
    }

    // ------------------------
    // about ~22 cycles / pixel
    Scratch scratches(registerFile());

    int pixel = scratches.obtain();
    int d     = scratches.obtain();
    int u     = scratches.obtain();
    int k     = scratches.obtain();
    int rt    = scratches.obtain();
    int lb    = scratches.obtain();
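
    // Bilinear blend (sketch): with U,V the fractional offsets in FRAC_BITS
    // fixed point, and the four taps LT (at txPtr), RT, LB, RB:
    //   texel = RB*U*V + LB*(1-U)*V + LT*(1-U)*(1-V) + RT*U*(1-V)
    // The weights sum to 1<<(2*FRAC_BITS); 'k' tracks the weight not yet
    // spent, so the last tap (RT) gets it without an extra multiply.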

    // RB -> U * V

    CONTEXT_LOAD(rt, generated_vars.rt);
    CONTEXT_LOAD(lb, generated_vars.lb);

    int offset = pixel;
    ADD(AL, 0, offset, lb, rt);
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    SMULBB(AL, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));

    // LB -> (1-U) * V
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);

    // RT -> U*(1-V)
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
    SUB(AL, 0, u, k, u);
    SMLABB(AL, texel.reg, pixel, u, d);

    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        texel.format.c[i].h = FRAC_BITS*2+8;
        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
    }
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_LO;
}

void GGLAssembler::filter16(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // compute the mask
    // XXX: it would be nice if the mask below could be computed
    // automatically.
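    // The trick (sketch): replicate the 16-bit pixel into a 32-bit word
    // (pixel | pixel<<shift, then AND with the mask) so that every
    // component ends up with at least 'prec' zero bits above it. One
    // 32-bit multiply by a prec-bit weight then scales all components
    // in parallel, without carries crossing component boundaries.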
    uint32_t mask = 0;
    int shift = 0;
    int prec = 0;
    switch (tmu.format_idx) {
        case GGL_PIXEL_FORMAT_RGB_565:
            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
            mask = 0x07E0F81F;
            shift = 16;
            prec = 5;
            break;
        case GGL_PIXEL_FORMAT_RGBA_4444:
            // 0000,1111,0000,1111 | 0000,1111,0000,1111
            mask = 0x0F0F0F0F;
            shift = 12;
            prec = 4;
            break;
        case GGL_PIXEL_FORMAT_LA_88:
            // 0000,0000,1111,1111 | 0000,0000,1111,1111
            // AALL -> 00AA | 00LL
            mask = 0x00FF00FF;
            shift = 8;
            prec = 8;
            break;
        default:
            // unsupported format, do something sensible...
            ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
            LDRH(AL, texel.reg, txPtr.reg);
            return;
    }

    const int adjust = FRAC_BITS*2 - prec;
    const int round  = 0;

    // update the texel format
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_HI|CLEAR_LO;
    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
    }

    // ------------------------
    // about ~40 cycles / pixel
    Scratch scratches(registerFile());

    int pixel = scratches.obtain();
    int d     = scratches.obtain();
    int u     = scratches.obtain();
    int k     = scratches.obtain();

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<prec));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SUB(AL, 0, u, k, u);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    MLA(AL, 0, texel.reg, pixel, u, d);
}

void GGLAssembler::filter24(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int /*U*/, int /*V*/, pointer_t& txPtr,
        int /*FRAC_BITS*/)
{
    // not supported yet (currently disabled)
    load(txPtr, texel, 0);
}

void GGLAssembler::filter32(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;

    // ------------------------
    // about ~38 cycles / pixel
    Scratch scratches(registerFile());

    int pixel = scratches.obtain();
    int dh    = scratches.obtain();
    int u     = scratches.obtain();
    int k     = scratches.obtain();

    int temp  = scratches.obtain();
    int dl    = scratches.obtain();
    int mask  = scratches.obtain();

    MOV(AL, 0, mask, imm(0xFF));
    ORR(AL, 0, mask, mask, imm(0xFF0000));
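
    // mask = 0x00FF00FF selects the even bytes of the 8888 pixel;
    // 'pixel & mask' and '(pixel>>8) & mask' split the four components
    // into two words of two components each, so an 8-bit weight can scale
    // them with MUL/MLA without carries crossing into a neighboring
    // component (accumulated in dh and dl, recombined at the end).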

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, dh, temp, u);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SUB(AL, 0, u, k, u);
    AND(AL, 0, temp, mask, pixel);
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
    ORR(AL, 0, texel.reg, dh, dl);
}

void GGLAssembler::build_texture_environment(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    const uint32_t component_mask = 1<<component;
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];

        if (tmu.mask & component_mask) {
            // replace or modulate with this texture
            if ((tmu.replaced & component_mask) == 0) {
                // not replaced by a later tmu...

                Scratch scratches(registerFile());
                pixel_t texel(parts.texel[i]);

                if (multiTexture &&
                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
                    tmu.twrap == GGL_NEEDS_WRAP_11)
                {
                    texel.reg = scratches.obtain();
                    texel.flags |= CORRUPTIBLE;
                    comment("fetch texel (multitexture 1:1)");
                    load(parts.coords[i].ptr, texel, WRITE_BACK);
                }

                component_t incoming(fragment);
                modify(fragment, regs);

                switch (tmu.env) {
                case GGL_REPLACE:
                    extract(fragment, texel, component);
                    break;
                case GGL_MODULATE:
                    modulate(fragment, incoming, texel, component);
                    break;
                case GGL_DECAL:
                    decal(fragment, incoming, texel, component);
                    break;
                case GGL_BLEND:
                    blend(fragment, incoming, texel, component, i);
                    break;
                case GGL_ADD:
                    add(fragment, incoming, texel, component);
                    break;
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::wrapping(
        int d,
        int coord, int size,
        int tx_wrap, int tx_linear)
{
    // notes:
    // if tx_linear is set, we need 4 extra bits of precision on the result
    // SMULL/UMULL is 3 cycles
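    //
    // In C terms (sketch), with coord in 16.16 fixed point:
    //   REPEAT: d = (coord * size) >> 16;            // wraps naturally
    //   CLAMP : d = clamp(coord >> 16, 0, size-1);
    // with tx_linear keeping a few extra fraction bits in d.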
    Scratch scratches(registerFile());
    int c = coord;
    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
        // UMULL takes 4 cycles (interlocked), and we can get away with
        // 2 cycles using SMULWB, but we're losing 16 bits of precision
        // out of 32 (this is not a problem because the iterator keeps
        // its full precision)
        // UMULL(AL, 0, size, d, c, size);
        // note: we can't use SMULTB because it's signed.
        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
        SMULWB(AL, d, d, size);
    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
        if (tx_linear) {
            // 1 cycle
            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
        } else {
            // 4 cycles (common case)
            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
            CMP(AL, d, size);
            SUB(GE, 0, d, size, imm(1));
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::modulate(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);

    const int Nt = texel.size();
        // Nt should always be less than 10 bits because it comes
        // from the TMU.

    int Ni = incoming.size();
        // Ni could be big because it comes from previous MODULATEs
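
    // The products below approximate x*y/((1<<N)-1) (a multiply by a
    // normalized N-bit factor) as x*(y + (y>>(N-1))) >> N, which is exact
    // for y == 0 and y == (1<<N)-1, so full intensity maps to full
    // intensity.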

    if (Nt == 1) {
        // texel acts as a bit-mask
        // dest = incoming & ((texel << incoming.h)-texel)
        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
        dest.l = incoming.l;
        dest.h = incoming.h;
        dest.flags |= (incoming.flags & CLEAR_LO);
    } else if (Ni == 1) {
        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
        dest.l = 0;
        dest.h = Nt;
    } else {
        int inReg = incoming.reg;
        int shift = incoming.l;
        if ((Nt + Ni) > 32) {
            // we will overflow, reduce the precision of Ni to 8 bits
            // (Note Nt cannot be more than 10 bits which happens with
            // 565 textures and GGL_LINEAR)
            shift += Ni-8;
            Ni = 8;
        }

        // modulate by the component with the lowest precision
        if (Nt >= Ni) {
            if (shift) {
                // XXX: we should be able to avoid this shift
                // when shift==16 && Nt<16 && Ni<16, in which
                // case we could use SMULBT below.
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Ni)-1)
            // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
            // this operation doesn't change texel's size
            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
            dest.l = Ni;
            dest.h = Nt + Ni;
        } else {
            if (shift && (shift != 16)) {
                // if shift==16, we can use 16-bit mul instructions later
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Nt)-1)
            // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
            // this operation doesn't change incoming's size
            Scratch scratches(registerFile());
            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
            if (t == inReg)
                t = scratches.obtain();
            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
            if (Nt<16 && Ni<16) {
                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
                else            SMULBB(AL, dest.reg, t, inReg);
            } else              MUL(AL, 0, dest.reg, t, inReg);
            dest.l = Nt;
            dest.h = Nt + Ni;
        }

        // low bits are not valid
        dest.flags |= CLEAR_LO;

        // no need to keep more than 8 bits/component
        if (dest.size() > 8)
            dest.l = dest.h-8;
    }
}

void GGLAssembler::decal(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
    // Av = Af
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);
    extract(factor, incomingTexel, GGLFormat::ALPHA);

    // no need to keep more than 8 bits for decal
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
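    // normalize the blend factor: v + (v >> (s-1)) maps the all-ones
    // s-bit value to exactly 1<<s, so full alpha selects the texel
    // entirely (the same endpoint-exact trick as in modulate()).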
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
}

void GGLAssembler::blend(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component, int tmu)
{
    // RGBA:
    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
    // Av = At*Af

    if (component == GGLFormat::ALPHA) {
        modulate(dest, incoming, incomingTexel, component);
        return;
    }

    Scratch locals(registerFile());
    integer_t color(locals.obtain(), 8, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    LDRB(AL, color.reg, mBuilderContext.Rctx,
            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
    extract(factor, incomingTexel, component);

    // no need to keep more than 8 bits for blend
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, color);
}

void GGLAssembler::add(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf + Ct;
    Scratch locals(registerFile());

    component_t incomingTemp(incoming);

    // use "dest" as a temporary for extracting the texel, unless "dest"
    // overlaps "incoming".
    integer_t texel(dest.reg, 32, CORRUPTIBLE);
    if (dest.reg == incomingTemp.reg)
        texel.reg = locals.obtain();
    extract(texel, incomingTexel, component);

    if (texel.s < incomingTemp.size()) {
        expand(texel, texel, incomingTemp.size());
    } else if (texel.s > incomingTemp.size()) {
        if (incomingTemp.flags & CORRUPTIBLE) {
            expand(incomingTemp, incomingTemp, texel.s);
        } else {
            incomingTemp.reg = locals.obtain();
            expand(incomingTemp, incoming, texel.s);
        }
    }

    if (incomingTemp.l) {
        ADD(AL, 0, dest.reg, texel.reg,
                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
    } else {
        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
    }
    dest.l = 0;
    dest.h = texel.size();
    component_sat(dest);
}

// ----------------------------------------------------------------------------

}; // namespace android