/* libs/pixelflinger/codeflinger/texturing.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>

#include <cutils/log.h>

#include "GGLAssembler.h"

namespace android {

// ---------------------------------------------------------------------------

// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// (((x<<16) + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ((x*dx) + (dx>>1)) + x0
// (x*dx) + ((dx>>1) + x0)

void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;

    if (mSmooth) {
        // NOTE: we could also take this path in the mDithering + !mSmooth
        // case, but it would use up to 4 more registers for the color
        // components for only a little added quality.
        // Currently, this causes the system to run out of registers in
        // some cases (see issue #719496)

        comment("compute initial iterated color (smooth and/or dither case)");

        parts.iterated_packed = 0;
        parts.packed = 0;

        // 0x1: color component
        // 0x2: iterators
        const int optReload = mOptLevel >> 1;
        if (optReload >= 3)         parts.reload = 0; // reload nothing
        else if (optReload == 2)    parts.reload = 2; // reload iterators
        else if (optReload == 1)    parts.reload = 1; // reload colors
        else if (optReload <= 0)    parts.reload = 3; // reload both

        if (!mSmooth) {
            // we're not smoothing (just dithering), we never have to
            // reload the iterators
            parts.reload &= ~2;
        }

        Scratch scratches(registerFile());
        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
        for (int i=0 ; i<4 ; i++) {
            if (!mInfo[i].iterated)
                continue;

            // this component exists in the destination and is not replaced
            // by a texture unit.
            const int c = (parts.reload & 1) ? t0 : obtainReg();
            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
            parts.argb[i].reg = c;

            if (mInfo[i].smooth) {
                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
                const int dvdx = parts.argb_dx[i].reg;
                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
                MLA(AL, 0, c, x.reg, dvdx, c);

                // adjust the color iterator to make sure it won't overflow
                if (!mAA) {
                    // this is not needed when we're using anti-aliasing
                    // because we will (have to) clamp the components
                    // anyway.
                    int end = scratches.obtain();
                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
                    MLA(AL, 1, end, dvdx, end, c);
                    SUB(MI, 0, c, c, end);
                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
                    scratches.recycle(end);
                }
            }

            if (parts.reload & 1) {
                CONTEXT_STORE(c, generated_vars.argb[i].c);
            }
        }
    } else {
        // We're not smoothing, so we can
        // just use a packed version of the color and extract the
        // components as needed (or not at all if we don't blend)

        // figure out if we need the iterated color
        int load = 0;
        for (int i=0 ; i<4 ; i++) {
            component_info_t& info = mInfo[i];
            if ((info.inDest || info.needed) && !info.replaced)
                load |= 1;
        }

        parts.iterated_packed = 1;
        parts.packed = (!mTextureMachine.mask && !mBlending
                && !mFog && !mDithering);
        parts.reload = 0;
        if (load || parts.packed) {
            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
                comment("load initial iterated color (8888 packed)");
                parts.iterated.setTo(obtainReg(),
                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
                CONTEXT_LOAD(parts.iterated.reg, packed8888);
            } else {
                comment("load initial iterated color (dest format packed)");

                parts.iterated.setTo(obtainReg(), &mCbFormat);

                // pre-mask the iterated color
                const int bits = parts.iterated.size();
                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
                uint32_t mask = 0;
                if (mMasking) {
                    for (int i=0 ; i<4 ; i++) {
                        const int component_mask = 1<<i;
                        const int h = parts.iterated.format.c[i].h;
                        const int l = parts.iterated.format.c[i].l;
                        if (h && (!(mMasking & component_mask))) {
                            mask |= ((1<<(h-l))-1) << l;
                        }
                    }
                }

                if (mMasking && ((mask & size)==0)) {
                    // none of the components are present in the mask
                } else {
                    CONTEXT_LOAD(parts.iterated.reg, packed);
                    if (mCbFormat.size == 1) {
                        AND(AL, 0, parts.iterated.reg,
                                parts.iterated.reg, imm(0xFF));
                    } else if (mCbFormat.size == 2) {
                        MOV(AL, 0, parts.iterated.reg,
                                reg_imm(parts.iterated.reg, LSR, 16));
                    }
                }

                // pre-mask the iterated color
                if (mMasking) {
                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
                            mask, bits);
                }
            }
        }
    }
}

void GGLAssembler::build_iterated_color(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);

    if (!mInfo[component].iterated)
        return;

    if (parts.iterated_packed) {
        // iterated colors are packed, extract the one we need
        extract(fragment, parts.iterated, component);
    } else {
        fragment.h = GGL_COLOR_BITS;
        fragment.l = GGL_COLOR_BITS - 8;
        fragment.flags |= CLEAR_LO;
        // iterated colors are held in their own register,
        // (smooth and/or dithering case)
        if (parts.reload==3) {
            // this implies mSmooth
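            // fetch the current iterated value and write the stepped
            // value (c + dx) back, so the next pixel reloads it already
            // advanced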
            Scratch scratches(registerFile());
            int dx = scratches.obtain();
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
            ADD(AL, 0, dx, fragment.reg, dx);
            CONTEXT_STORE(dx, generated_vars.argb[component].c);
        } else if (parts.reload & 1) {
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
        } else {
            // we don't reload, so simply rename the register and mark as
            // non CORRUPTIBLE so that the texture env or blending code
            // won't modify this (renamed) register
            regs.recycle(fragment.reg);
            fragment.reg = parts.argb[component].reg;
            fragment.flags &= ~CORRUPTIBLE;
        }
        if (mInfo[component].smooth && mAA) {
            // when using smooth shading AND anti-aliasing, we need to clamp
            // the iterators because there is always an extra pixel on the
            // edges, which most of the time will cause an overflow
            // (since technically it's outside of the domain).
            BIC(AL, 0, fragment.reg, fragment.reg,
                    reg_imm(fragment.reg, ASR, 31));
            component_sat(fragment);
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
{
    // gather some information about the components we need to process...
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
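    // mLogicOp records that a logic op is needed and which operands
    // (source and/or destination) it actually consumes, so later stages
    // know what to fetch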
    switch(opcode) {
    case GGL_COPY:
        mLogicOp = 0;
        break;
    case GGL_CLEAR:
    case GGL_SET:
        mLogicOp = LOGIC_OP;
        break;
    case GGL_AND:
    case GGL_AND_REVERSE:
    case GGL_AND_INVERTED:
    case GGL_XOR:
    case GGL_OR:
    case GGL_NOR:
    case GGL_EQUIV:
    case GGL_OR_REVERSE:
    case GGL_OR_INVERTED:
    case GGL_NAND:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
        break;
    case GGL_NOOP:
    case GGL_INVERT:
        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
        break;
    case GGL_COPY_INVERTED:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
        break;
    };
}

void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
{
    uint8_t replaced=0;
    mTextureMachine.mask = 0;
    mTextureMachine.activeUnits = 0;
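    // walk the TMUs from last to first: 'replaced' accumulates the
    // components that a later GGL_REPLACE unit overwrites, so an earlier
    // unit whose entire output is replaced can be skipped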
    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (replaced == 0xF) {
            // all components are replaced, skip this TMU.
            tmu.format_idx = 0;
            tmu.mask = 0;
            tmu.replaced = replaced;
            continue;
        }
        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
        tmu.format = c->formats[tmu.format_idx];
        tmu.bits = tmu.format.size*8;
        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
                && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now

        // 5551 linear filtering is not supported
        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
            tmu.linear = 0;

        tmu.mask = 0;
        tmu.replaced = replaced;

        if (tmu.format_idx) {
            mTextureMachine.activeUnits++;
            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
            if (tmu.env == GGL_REPLACE) {
                replaced |= tmu.mask;
            } else if (tmu.env == GGL_DECAL) {
                if (!tmu.format.c[GGLFormat::ALPHA].h) {
                    // if we don't have alpha, decal does nothing
                    tmu.mask = 0;
                } else {
                    // decal always ignores At
                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
                }
            }
        }
        mTextureMachine.mask |= tmu.mask;
        //printf("%d: mask=%08lx, replaced=%08lx\n",
        //    i, int(tmu.mask), int(tmu.replaced));
    }
    mTextureMachine.replaced = replaced;
    mTextureMachine.directTexture = 0;
    //printf("replaced=%08lx\n", mTextureMachine.replaced);
}


void GGLAssembler::init_textures(
        tex_coord_t* coords,
        const reg_t& x, const reg_t& y)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;
    int Rx = x.reg;
    int Ry = y.reg;

    if (mTextureMachine.mask) {
        comment("compute texture coordinates");
    }

    // init texture coordinates for each tmu
    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        {
            // 1:1 texture
            pointer_t& txPtr = coords[i].ptr;
            txPtr.setTo(obtainReg(), tmu.bits);
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
            // merge base & offset
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            base_offset(txPtr, txPtr, Rx);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = coords[i].s;
            reg_t& t = coords[i].t;
            // s = (x * dsdx)>>16 + ydsdy
            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
            // t = (x * dtdx)>>16 + ydtdy
            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
            s.setTo(obtainReg());
            t.setTo(obtainReg());
            const int need_w = GGL_READ_NEEDS(W, needs.n);
            if (need_w) {
                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
            } else {
                int ydsdy = scratches.obtain();
                int ydtdy = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
            }

            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                recycleReg(s.reg);
                recycleReg(t.reg);
            }
        }

        // direct texture?
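        // i.e. a single nearest-filtered texture whose format matches the
        // color buffer, with no blending, dithering or fog, and which
        // replaces every component it produces: texels can then be copied
        // straight to the framebuffer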
        if (!multiTexture && !mBlending && !mDithering && !mFog &&
            cb_format_idx == tmu.format_idx && !tmu.linear &&
            mTextureMachine.replaced == tmu.mask)
        {
            mTextureMachine.directTexture = i + 1;
        }
    }
}

void GGLAssembler::build_textures(  fragment_parts_t& parts,
                                    Scratch& regs)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;

    // We don't have a way to spill registers automatically, so we
    // spill the depth and AA registers ourselves when we know we
    // may have to. Build the spill list...
    uint32_t spill_list = 0;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if (tmu.linear) {
            // we may run out of registers if we have linear filtering
            // at 1 or 4 bytes / pixel on any texture unit.
            if (tmu.format.size == 1) {
                // with depth and AA enabled, we'll be one register short
                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
            if (tmu.format.size == 4) {
                // with depth or AA enabled, we'll be one or two registers short
                if (parts.z.reg > 0)
                    spill_list |= 1<<parts.z.reg;
                if (parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
        }
    }

    Spill spill(registerFile(), *this, spill_list);

    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        pointer_t& txPtr = parts.coords[i].ptr;
        pixel_t& texel = parts.texel[i];

        // repeat...
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            comment("fetch texel");
            texel.setTo(regs.obtain(), &tmu.format);
            load(txPtr, texel, WRITE_BACK);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = parts.coords[i].s;
            reg_t& t = parts.coords[i].t;
            if ((mOptLevel&1)==0) {
                comment("reload s/t (multitexture or linear filtering)");
                s.reg = scratches.obtain();
                t.reg = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
            }

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            comment("compute repeat/clamp");
            int u       = scratches.obtain();
            int v       = scratches.obtain();
            int width   = scratches.obtain();
            int height  = scratches.obtain();
            int U = 0;
            int V = 0;

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
            CONTEXT_LOAD(height, generated_vars.texture[i].height);

            int FRAC_BITS = 0;
            if (tmu.linear) {
                // linear interpolation
                if (tmu.format.size == 1) {
                    // for 8-bit textures, we can afford
                    // 7 bits of fractional precision at no
                    // additional cost (we can't do 8 bits
                    // because filter8 uses signed 16-bit muls)
                    FRAC_BITS = 7;
                } else if (tmu.format.size == 2) {
                    // filter16() is internally limited to 4 bits, so:
                    // FRAC_BITS=2 generates fewer instructions,
                    // FRAC_BITS=3,4,5 create unpleasant artifacts,
                    // FRAC_BITS=6+ looks good
                    FRAC_BITS = 6;
                } else if (tmu.format.size == 4) {
                    // filter32() is internally limited to 8 bits, so:
                    // FRAC_BITS=4 looks good
                    // FRAC_BITS=5+ looks better, but generates 3 extra
                    // instructions per pixel
                    FRAC_BITS = 6;
                } else {
                    // for all other cases we use 4 bits.
                    FRAC_BITS = 4;
                }
            }
            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);

            if (tmu.linear) {
                comment("compute linear filtering offsets");
                // pixel size scale
                const int shift = 31 - gglClz(tmu.format.size);
                U = scratches.obtain();
                V = scratches.obtain();

                if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                    return;

                // sample the texel center
                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));

                // get the fractional part of U,V
                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));

                // compute width-1 and height-1
                SUB(AL, 0, width,  width,  imm(1));
                SUB(AL, 0, height, height, imm(1));

                // get the integer part of U,V and clamp/wrap
                // and compute offset to the next texel
                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
                    // u has already been REPEATed
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, width);
                    CMP(AL, u, width);
                    MOV(LT, 0, width, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
                    RSB(GE, 0, width, width, imm(0));
                } else {
                    // u has not been CLAMPed yet
                    // algorithm:
                    // if ((u>>FRAC_BITS) >= width)
                    //      u = width<<FRAC_BITS
                    //      width = 0
                    // else
                    //      width = 1<<shift
                    // u = u>>FRAC_BITS; // get integer part
                    // if (u<0)
                    //      u = 0
                    //      width = 0
                    // generated_vars.rt = width

                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
                    MOV(LE, 0, width, imm(0));
                    MOV(GT, 0, width, imm(1 << shift));
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, imm(0));
                    MOV(MI, 0, width, imm(0));
                }
                CONTEXT_STORE(width, generated_vars.rt);

                const int stride = width;
                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
                    // v has already been REPEATed
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, height);
                    CMP(AL, v, height);
                    MOV(LT, 0, height, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
                    RSB(GE, 0, height, height, imm(0));
                    MUL(AL, 0, height, stride, height);
                } else {
                    // v has not been CLAMPed yet
                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
                    MOV(LE, 0, height, imm(0));
                    if (shift) {
                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
                    } else {
                        MOV(GT, 0, height, stride);
                    }
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, imm(0));
                    MOV(MI, 0, height, imm(0));
                }
                CONTEXT_STORE(height, generated_vars.lb);
            }

            scratches.recycle(width);
            scratches.recycle(height);

            // iterate texture coordinates...
            comment("iterate s,t");
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s.reg, s.reg, dsdx);
            ADD(AL, 0, t.reg, t.reg, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                scratches.recycle(s.reg);
                scratches.recycle(t.reg);
            }
            scratches.recycle(dsdx);
            scratches.recycle(dtdx);

            // merge base & offset...
            comment("merge base & offset");
            texel.setTo(regs.obtain(), &tmu.format);
            txPtr.setTo(texel.reg, tmu.bits);
            int stride = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            SMLABB(AL, u, v, stride, u);    // u+v*stride
            base_offset(txPtr, txPtr, u);

            // load texel
            if (!tmu.linear) {
                comment("fetch texel");
                load(txPtr, texel, 0);
            } else {
                // recycle registers we don't need anymore
                scratches.recycle(u);
                scratches.recycle(v);
                scratches.recycle(stride);

                comment("fetch texel, bilinear");
                switch (tmu.format.size) {
                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                }
            }
        }
    }
}

void GGLAssembler::build_iterate_texture_coordinates(
    const fragment_parts_t& parts)
{
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            const pointer_t& txPtr = parts.coords[i].ptr;
            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
        } else {
            Scratch scratches(registerFile());
            int s = parts.coords[i].s.reg;
            int t = parts.coords[i].t.reg;
            if ((mOptLevel&1)==0) {
                s = scratches.obtain();
                t = scratches.obtain();
                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
            }
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();
            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s, s, dsdx);
            ADD(AL, 0, t, t, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
            }
        }
    }
}

void GGLAssembler::filter8(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    if (tmu.format.components != GGL_ALPHA &&
        tmu.format.components != GGL_LUMINANCE)
    {
        // this is a packed format, and we don't support
        // linear filtering (it's probably RGB 332)
        // Should not happen with OpenGL|ES
        LDRB(AL, texel.reg, txPtr.reg);
        return;
    }

    // ------------------------
    // about ~22 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();
    int rt   = scratches.obtain();
    int lb   = scratches.obtain();

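    // the four taps are weighted in FRAC_BITS*2 fixed point; 'k' tracks
    // the weight not yet assigned, so the last tap (RT) receives exactly
    // what is left and the four weights always sum to 1<<(FRAC_BITS*2)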
    // RB -> U * V

    CONTEXT_LOAD(rt, generated_vars.rt);
    CONTEXT_LOAD(lb, generated_vars.lb);

    int offset = pixel;
    ADD(AL, 0, offset, lb, rt);
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    SMULBB(AL, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));

    // LB -> (1-U) * V
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);

    // RT -> U*(1-V)
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
    SUB(AL, 0, u, k, u);
    SMLABB(AL, texel.reg, pixel, u, d);

    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        texel.format.c[i].h = FRAC_BITS*2+8;
        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
    }
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_LO;
}

void GGLAssembler::filter16(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // compute the mask
    // XXX: it would be nice if the mask below could be computed
    // automatically.
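    // the trick: each 16-bit texel is replicated into a 32-bit word
    // (pixel | pixel<<shift) and masked so that every component lands in
    // its own non-overlapping field; a single multiply then scales all
    // components at once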
    uint32_t mask = 0;
    int shift = 0;
    int prec = 0;
    switch (tmu.format_idx) {
        case GGL_PIXEL_FORMAT_RGB_565:
            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
            mask = 0x07E0F81F;
            shift = 16;
            prec = 5;
            break;
        case GGL_PIXEL_FORMAT_RGBA_4444:
            // 0000,1111,0000,1111 | 0000,1111,0000,1111
            mask = 0x0F0F0F0F;
            shift = 12;
            prec = 4;
            break;
        case GGL_PIXEL_FORMAT_LA_88:
            // 0000,0000,1111,1111 | 0000,0000,1111,1111
            // AALL -> 00AA | 00LL
            mask = 0x00FF00FF;
            shift = 8;
            prec = 8;
            break;
        default:
            // unsupported format, do something sensible...
            ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
            LDRH(AL, texel.reg, txPtr.reg);
            return;
    }

    const int adjust = FRAC_BITS*2 - prec;
    const int round  = 0;

    // update the texel format
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_HI|CLEAR_LO;
    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
    }

    // ------------------------
    // about ~40 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<prec));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SUB(AL, 0, u, k, u);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    MLA(AL, 0, texel.reg, pixel, u, d);
}

void GGLAssembler::filter24(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int /*U*/, int /*V*/, pointer_t& txPtr,
        int /*FRAC_BITS*/)
{
    // not supported yet (currently disabled)
    load(txPtr, texel, 0);
}

void GGLAssembler::filter32(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;

    // ------------------------
    // about ~38 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int dh   = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    int temp = scratches.obtain();
    int dl   = scratches.obtain();
    int mask = scratches.obtain();

    MOV(AL, 0, mask, imm(0xFF));
    ORR(AL, 0, mask, mask, imm(0xFF0000));
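    // mask = 0x00FF00FF selects two of the four 8-bit components at a
    // time, so each multiply below blends two components in parallel
    // (dh accumulates the even bytes, dl the odd ones)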

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, dh, temp, u);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SUB(AL, 0, u, k, u);
    AND(AL, 0, temp, mask, pixel);
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
    ORR(AL, 0, texel.reg, dh, dl);
}

void GGLAssembler::build_texture_environment(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    const uint32_t component_mask = 1<<component;
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];

        if (tmu.mask & component_mask) {
            // replace or modulate with this texture
            if ((tmu.replaced & component_mask) == 0) {
                // not replaced by a later tmu...

                Scratch scratches(registerFile());
                pixel_t texel(parts.texel[i]);

                if (multiTexture &&
                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
                    tmu.twrap == GGL_NEEDS_WRAP_11)
                {
                    texel.reg = scratches.obtain();
                    texel.flags |= CORRUPTIBLE;
                    comment("fetch texel (multitexture 1:1)");
                    load(parts.coords[i].ptr, texel, WRITE_BACK);
                }

                component_t incoming(fragment);
                modify(fragment, regs);

                switch (tmu.env) {
                case GGL_REPLACE:
                    extract(fragment, texel, component);
                    break;
                case GGL_MODULATE:
                    modulate(fragment, incoming, texel, component);
                    break;
                case GGL_DECAL:
                    decal(fragment, incoming, texel, component);
                    break;
                case GGL_BLEND:
                    blend(fragment, incoming, texel, component, i);
                    break;
                case GGL_ADD:
                    add(fragment, incoming, texel, component);
                    break;
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::wrapping(
            int d,
            int coord, int size,
            int tx_wrap, int tx_linear)
{
    // notes:
    // if tx_linear is set, we need 4 extra bits of precision on the result
    // SMULL/UMULL is 3 cycles
    Scratch scratches(registerFile());
    int c = coord;
    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
        // UMULL takes 4 cycles (interlocked), and we can get away with
        // 2 cycles using SMULWB, but we're losing 16 bits of precision
        // out of 32 (this is not a problem because the iterator keeps
        // its full precision)
        // UMULL(AL, 0, size, d, c, size);
        // note: we can't use SMULTB because it's signed.
        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
        SMULWB(AL, d, d, size);
    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
        if (tx_linear) {
            // 1 cycle
            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
        } else {
            // 4 cycles (common case)
            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
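            // d &= ~(d>>31): clears d in a single instruction when it
            // is negative (clamp to zero)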
            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
            CMP(AL, d, size);
            SUB(GE, 0, d, size, imm(1));
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::modulate(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);

    const int Nt = texel.size();
        // Nt should always be less than 10 bits because it comes
        // from the TMU.

    int Ni = incoming.size();
        // Ni could be big because it comes from previous MODULATEs

    if (Nt == 1) {
        // texel acts as a bit-mask
        // dest = incoming & ((texel << incoming.h)-texel)
        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
        dest.l = incoming.l;
        dest.h = incoming.h;
        dest.flags |= (incoming.flags & CLEAR_LO);
    } else if (Ni == 1) {
        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
        dest.l = 0;
        dest.h = Nt;
    } else {
        int inReg = incoming.reg;
        int shift = incoming.l;
        if ((Nt + Ni) > 32) {
            // we will overflow, reduce the precision of Ni to 8 bits
            // (Note Nt cannot be more than 10 bits which happens with
            // 565 textures and GGL_LINEAR)
            shift += Ni-8;
            Ni = 8;
        }

        // modulate by the component with the lowest precision
        if (Nt >= Ni) {
            if (shift) {
                // XXX: we should be able to avoid this shift
                // when shift==16 && Nt<16 && Ni<16, in which
                // case we could use SMULBT below.
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Ni)-1)
            // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
            // this operation doesn't change texel's size
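            // (renormalizing the Ni-bit factor this way makes a saturated
            // value (1<<Ni)-1 act as exactly 1<<Ni, so modulating by a
            // fully saturated component is lossless)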
            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
            dest.l = Ni;
            dest.h = Nt + Ni;
        } else {
            if (shift && (shift != 16)) {
                // if shift==16, we can use 16-bit mul instructions later
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Nt)-1)
            // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
            // this operation doesn't change incoming's size
            Scratch scratches(registerFile());
            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
            if (t == inReg)
                t = scratches.obtain();
            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
            if (Nt<16 && Ni<16) {
                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
                else            SMULBB(AL, dest.reg, t, inReg);
            } else              MUL(AL, 0, dest.reg, t, inReg);
            dest.l = Nt;
            dest.h = Nt + Ni;
        }

        // low bits are not valid
        dest.flags |= CLEAR_LO;

        // no need to keep more than 8 bits/component
        if (dest.size() > 8)
            dest.l = dest.h-8;
    }
}

void GGLAssembler::decal(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
    // Av = Af
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);
    extract(factor, incomingTexel, GGLFormat::ALPHA);

    // no need to keep more than 8-bits for decal
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
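    // renormalize At so that a saturated alpha ((1<<factor.s)-1) acts as
    // exactly 1.0 (same trick as in modulate())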
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
}

void GGLAssembler::blend(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component, int tmu)
{
    // RGBA:
    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
    // Av = At*Af

    if (component == GGLFormat::ALPHA) {
        modulate(dest, incoming, incomingTexel, component);
        return;
    }

    Scratch locals(registerFile());
    integer_t color(locals.obtain(), 8, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    LDRB(AL, color.reg, mBuilderContext.Rctx,
            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
    extract(factor, incomingTexel, component);

    // no need to keep more than 8-bits for blend
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, color);
}

void GGLAssembler::add(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf + Ct;
    Scratch locals(registerFile());

    component_t incomingTemp(incoming);

    // use "dest" as a temporary for extracting the texel, unless "dest"
    // overlaps "incoming".
    integer_t texel(dest.reg, 32, CORRUPTIBLE);
    if (dest.reg == incomingTemp.reg)
        texel.reg = locals.obtain();
    extract(texel, incomingTexel, component);

    if (texel.s < incomingTemp.size()) {
        expand(texel, texel, incomingTemp.size());
    } else if (texel.s > incomingTemp.size()) {
        if (incomingTemp.flags & CORRUPTIBLE) {
            expand(incomingTemp, incomingTemp, texel.s);
        } else {
            incomingTemp.reg = locals.obtain();
            expand(incomingTemp, incoming, texel.s);
        }
    }

    if (incomingTemp.l) {
        ADD(AL, 0, dest.reg, texel.reg,
                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
    } else {
        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
    }
    dest.l = 0;
    dest.h = texel.size();
    component_sat(dest);
}

// ----------------------------------------------------------------------------

}; // namespace android