/* libs/pixelflinger/codeflinger/texturing.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

#define LOG_TAG "pixelflinger-code"

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>

#include <log/log.h>

#include "GGLAssembler.h"

namespace android {

// ---------------------------------------------------------------------------

// iterators are initialized like this:
// (intToFixedCenter(x) * dx)>>16 + x0
// ((x<<16 + 0x8000) * dx)>>16 + x0
// ((x<<16)*dx + (0x8000*dx))>>16 + x0
// ( (x*dx) + dx>>1 ) + x0
// (x*dx) + (dx>>1 + x0)
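//
// i.e. the constant term (dx>>1 + x0) can be folded into the initial
// value, so initializing an iterator costs a single MLA against x.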

void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;

    if (mSmooth) {
        // NOTE: we could also take this path in the mDithering + !mSmooth
        // case, but it would use up to 4 more registers for the color
        // components for only a little added quality.
        // Currently, this causes the system to run out of registers in
        // some cases (see issue #719496)

        comment("compute initial iterated color (smooth and/or dither case)");

        parts.iterated_packed = 0;
        parts.packed = 0;

        // 0x1: color component
        // 0x2: iterators
        const int optReload = mOptLevel >> 1;
        if (optReload >= 3)         parts.reload = 0; // reload nothing
        else if (optReload == 2)    parts.reload = 2; // reload iterators
        else if (optReload == 1)    parts.reload = 1; // reload colors
        else if (optReload <= 0)    parts.reload = 3; // reload both

        if (!mSmooth) {
            // we're not smoothing (just dithering), so we never have to
            // reload the iterators
            parts.reload &= ~2;
        }

        Scratch scratches(registerFile());
        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
        for (int i=0 ; i<4 ; i++) {
            if (!mInfo[i].iterated)
                continue;

            // this component exists in the destination and is not replaced
            // by a texture unit.
            const int c = (parts.reload & 1) ? t0 : obtainReg();
            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
            parts.argb[i].reg = c;

            if (mInfo[i].smooth) {
                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
                const int dvdx = parts.argb_dx[i].reg;
                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
                MLA(AL, 0, c, x.reg, dvdx, c);

                // adjust the color iterator to make sure it won't overflow
                if (!mAA) {
                    // this is not needed when we're using anti-aliasing
                    // because we will (have to) clamp the components
                    // anyway.
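                    // the sequence below computes the iterated value at the
                    // end of the span (c + dvdx*count); if that underflows
                    // (MI after the flag-setting MLA), the start value is
                    // pulled back by the same amount, and the final BIC
                    // (c & ~(c>>31)) clamps a negative start value to zero.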
                    int end = scratches.obtain();
                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
                    MLA(AL, 1, end, dvdx, end, c);
                    SUB(MI, 0, c, c, end);
                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
                    scratches.recycle(end);
                }
            }

            if (parts.reload & 1) {
                CONTEXT_STORE(c, generated_vars.argb[i].c);
            }
        }
    } else {
        // We're not smoothing, so we can
        // just use a packed version of the color and extract the
        // components as needed (or not at all if we don't blend)

        // figure out if we need the iterated color
        int load = 0;
        for (int i=0 ; i<4 ; i++) {
            component_info_t& info = mInfo[i];
            if ((info.inDest || info.needed) && !info.replaced)
                load |= 1;
        }

        parts.iterated_packed = 1;
        parts.packed = (!mTextureMachine.mask && !mBlending
                && !mFog && !mDithering);
        parts.reload = 0;
        if (load || parts.packed) {
            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
                comment("load initial iterated color (8888 packed)");
                parts.iterated.setTo(obtainReg(),
                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
                CONTEXT_LOAD(parts.iterated.reg, packed8888);
            } else {
                comment("load initial iterated color (dest format packed)");

                parts.iterated.setTo(obtainReg(), &mCbFormat);

                // pre-mask the iterated color
                const int bits = parts.iterated.size();
                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
                uint32_t mask = 0;
                if (mMasking) {
                    for (int i=0 ; i<4 ; i++) {
                        const int component_mask = 1<<i;
                        const int h = parts.iterated.format.c[i].h;
                        const int l = parts.iterated.format.c[i].l;
                        if (h && (!(mMasking & component_mask))) {
                            mask |= ((1<<(h-l))-1) << l;
                        }
                    }
                }

                if (mMasking && ((mask & size)==0)) {
                    // none of the components are present in the mask
                } else {
                    CONTEXT_LOAD(parts.iterated.reg, packed);
                    if (mCbFormat.size == 1) {
                        AND(AL, 0, parts.iterated.reg,
                                parts.iterated.reg, imm(0xFF));
                    } else if (mCbFormat.size == 2) {
                        MOV(AL, 0, parts.iterated.reg,
                                reg_imm(parts.iterated.reg, LSR, 16));
                    }
                }

                // pre-mask the iterated color
                if (mMasking) {
                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
                            mask, bits);
                }
            }
        }
    }
}

void GGLAssembler::build_iterated_color(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);

    if (!mInfo[component].iterated)
        return;

    if (parts.iterated_packed) {
        // iterated colors are packed, extract the one we need
        extract(fragment, parts.iterated, component);
    } else {
        fragment.h = GGL_COLOR_BITS;
        fragment.l = GGL_COLOR_BITS - 8;
        fragment.flags |= CLEAR_LO;
        // iterated colors are held in their own register,
        // (smooth and/or dithering case)
        if (parts.reload==3) {
            // this implies mSmooth
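            // the current value lives in the context rather than in a
            // register: fetch it, advance it by dx for the next pixel,
            // and store it back in place.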
            Scratch scratches(registerFile());
            int dx = scratches.obtain();
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
            ADD(AL, 0, dx, fragment.reg, dx);
            CONTEXT_STORE(dx, generated_vars.argb[component].c);
        } else if (parts.reload & 1) {
            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
        } else {
            // we don't reload, so simply rename the register and mark as
            // non CORRUPTIBLE so that the texture env or blending code
            // won't modify this (renamed) register
            regs.recycle(fragment.reg);
            fragment.reg = parts.argb[component].reg;
            fragment.flags &= ~CORRUPTIBLE;
        }
        if (mInfo[component].smooth && mAA) {
            // when using smooth shading AND anti-aliasing, we need to clamp
            // the iterators because there is always an extra pixel on the
            // edges, which most of the time will cause an overflow
            // (since technically it's outside of the domain).
            BIC(AL, 0, fragment.reg, fragment.reg,
                    reg_imm(fragment.reg, ASR, 31));
            component_sat(fragment);
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
{
    // gather some information about the components we need to process...
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
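
    // classify the logic op by which operands it actually reads: COPY needs
    // neither source nor destination, CLEAR/SET produce a constant, NOOP and
    // INVERT read only the destination, COPY_INVERTED only the source, and
    // the remaining ops need both.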
    switch(opcode) {
    case GGL_COPY:
        mLogicOp = 0;
        break;
    case GGL_CLEAR:
    case GGL_SET:
        mLogicOp = LOGIC_OP;
        break;
    case GGL_AND:
    case GGL_AND_REVERSE:
    case GGL_AND_INVERTED:
    case GGL_XOR:
    case GGL_OR:
    case GGL_NOR:
    case GGL_EQUIV:
    case GGL_OR_REVERSE:
    case GGL_OR_INVERTED:
    case GGL_NAND:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
        break;
    case GGL_NOOP:
    case GGL_INVERT:
        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
        break;
    case GGL_COPY_INVERTED:
        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
        break;
    }
}

void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
{
    uint8_t replaced=0;
    mTextureMachine.mask = 0;
    mTextureMachine.activeUnits = 0;
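    // walk the texture units from last to first: units are applied in
    // increasing order later on, so accumulating the components that a
    // later unit REPLACEs tells each earlier unit which of its components
    // are dead on arrival.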
    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (replaced == 0xF) {
            // all components are replaced, skip this TMU.
            tmu.format_idx = 0;
            tmu.mask = 0;
            tmu.replaced = replaced;
            continue;
        }
        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
        tmu.format = c->formats[tmu.format_idx];
        tmu.bits = tmu.format.size*8;
        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
                && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now

        // 5551 linear filtering is not supported
        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
            tmu.linear = 0;

        tmu.mask = 0;
        tmu.replaced = replaced;

        if (tmu.format_idx) {
            mTextureMachine.activeUnits++;
            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
            if (tmu.env == GGL_REPLACE) {
                replaced |= tmu.mask;
            } else if (tmu.env == GGL_DECAL) {
                if (!tmu.format.c[GGLFormat::ALPHA].h) {
                    // if we don't have alpha, decal does nothing
                    tmu.mask = 0;
                } else {
                    // decal always ignores At
                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
                }
            }
        }
        mTextureMachine.mask |= tmu.mask;
        //printf("%d: mask=%08lx, replaced=%08lx\n",
        //    i, int(tmu.mask), int(tmu.replaced));
    }
    mTextureMachine.replaced = replaced;
    mTextureMachine.directTexture = 0;
    //printf("replaced=%08lx\n", mTextureMachine.replaced);
}


void GGLAssembler::init_textures(
        tex_coord_t* coords,
        const reg_t& x, const reg_t& y)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;
    int Rx = x.reg;
    int Ry = y.reg;

    if (mTextureMachine.mask) {
        comment("compute texture coordinates");
    }

    // init texture coordinates for each tmu
    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        {
            // 1:1 texture
            pointer_t& txPtr = coords[i].ptr;
            txPtr.setTo(obtainReg(), tmu.bits);
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
            // merge base & offset
            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            base_offset(txPtr, txPtr, Rx);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = coords[i].s;
            reg_t& t = coords[i].t;
            // s = (x * dsdx)>>16 + ydsdy
            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
            // t = (x * dtdx)>>16 + ydtdy
            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
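            // note: with perspective correction (need_w), ydsdy/ydtdy are
            // loaded as-is, presumably because the perspective-correct
            // iterators are maintained elsewhere per scanline; in the
            // affine case the x term is folded in here with an MLA.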
            s.setTo(obtainReg());
            t.setTo(obtainReg());
            const int need_w = GGL_READ_NEEDS(W, needs.n);
            if (need_w) {
                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
            } else {
                int ydsdy = scratches.obtain();
                int ydtdy = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
            }

            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                recycleReg(s.reg);
                recycleReg(t.reg);
            }
        }

        // direct texture?
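        // (i.e. can the texel be copied to the framebuffer as-is: a single
        // texture unit replacing every component, same pixel format as the
        // color buffer, and no blending, dithering, fog or filtering)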
        if (!multiTexture && !mBlending && !mDithering && !mFog &&
            cb_format_idx == tmu.format_idx && !tmu.linear &&
            mTextureMachine.replaced == tmu.mask)
        {
            mTextureMachine.directTexture = i + 1;
        }
    }
}

void GGLAssembler::build_textures(  fragment_parts_t& parts,
                                    Scratch& regs)
{
    context_t const* c = mBuilderContext.c;
    const needs_t& needs = mBuilderContext.needs;
    int Rctx = mBuilderContext.Rctx;

    // We don't have a way to spill registers automatically, so we
    // spill the depth and AA registers ahead of time, when we know
    // we may have to. Build the spill list...
    uint32_t spill_list = 0;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;
        if (tmu.linear) {
            // we may run out of registers if we have linear filtering
            // at 1 or 4 bytes / pixel on any texture unit.
            if (tmu.format.size == 1) {
                // if depth and AA are enabled, we'll be one register short
                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
            if (tmu.format.size == 4) {
                // if depth or AA is enabled, we'll be one or two
                // registers short
                if (parts.z.reg > 0)
                    spill_list |= 1<<parts.z.reg;
                if (parts.covPtr.reg > 0)
                    spill_list |= 1<<parts.covPtr.reg;
            }
        }
    }

    Spill spill(registerFile(), *this, spill_list);

    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        pointer_t& txPtr = parts.coords[i].ptr;
        pixel_t& texel = parts.texel[i];

        // repeat...
        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            comment("fetch texel");
            texel.setTo(regs.obtain(), &tmu.format);
            load(txPtr, texel, WRITE_BACK);
        } else {
            Scratch scratches(registerFile());
            reg_t& s = parts.coords[i].s;
            reg_t& t = parts.coords[i].t;
            if ((mOptLevel&1)==0) {
                comment("reload s/t (multitexture or linear filtering)");
                s.reg = scratches.obtain();
                t.reg = scratches.obtain();
                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
            }

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            comment("compute repeat/clamp");
            int u       = scratches.obtain();
            int v       = scratches.obtain();
            int width   = scratches.obtain();
            int height  = scratches.obtain();
            int U = 0;
            int V = 0;

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
            CONTEXT_LOAD(height, generated_vars.texture[i].height);

            int FRAC_BITS = 0;
            if (tmu.linear) {
                // linear interpolation
                if (tmu.format.size == 1) {
                    // for 8-bit textures, we can afford
                    // 7 bits of fractional precision at no
                    // additional cost (we can't do 8 bits
                    // because filter8 uses signed 16-bit muls)
                    FRAC_BITS = 7;
                } else if (tmu.format.size == 2) {
                    // filter16() is internally limited to 4 bits, so:
                    // FRAC_BITS=2 generates fewer instructions,
                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
                    // FRAC_BITS=6+ looks good
                    FRAC_BITS = 6;
                } else if (tmu.format.size == 4) {
                    // filter32() is internally limited to 8 bits, so:
                    // FRAC_BITS=4 looks good
                    // FRAC_BITS=5+ looks better, but generates 3 extra
                    // instructions per pixel
                    FRAC_BITS = 6;
                } else {
                    // for all other cases we use 4 bits.
                    FRAC_BITS = 4;
                }
            }
            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);

            if (tmu.linear) {
                comment("compute linear filtering offsets");
                // pixel size scale
                const int shift = 31 - gglClz(tmu.format.size);
                U = scratches.obtain();
                V = scratches.obtain();

                if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                    return;

                // sample the texel center
                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));

                // get the fractional part of U,V
                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));

                // compute width-1 and height-1
                SUB(AL, 0, width,  width,  imm(1));
                SUB(AL, 0, height, height, imm(1));

                // get the integer part of U,V and clamp/wrap
                // and compute offset to the next texel
                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
                    // u has already been REPEATed
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, width);
                    CMP(AL, u, width);
                    MOV(LT, 0, width, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
                    RSB(GE, 0, width, width, imm(0));
                } else {
                    // u has not been CLAMPed yet
                    // algorithm (with FRAC_BITS==4 as an example):
                    // if ((u>>4) >= width)
                    //      u = width<<4
                    //      width = 0
                    // else
                    //      width = 1<<shift
                    // u = u>>4; // get integer part
                    // if (u<0)
                    //      u = 0
                    //      width = 0
                    // generated_vars.rt = width

                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
                    MOV(LE, 0, width, imm(0));
                    MOV(GT, 0, width, imm(1 << shift));
                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
                    MOV(MI, 0, u, imm(0));
                    MOV(MI, 0, width, imm(0));
                }
                CONTEXT_STORE(width, generated_vars.rt);

                const int stride = width;
                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
                    // v has already been REPEATed
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, height);
                    CMP(AL, v, height);
                    MOV(LT, 0, height, imm(1 << shift));
                    if (shift)
                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
                    RSB(GE, 0, height, height, imm(0));
                    MUL(AL, 0, height, stride, height);
                } else {
                    // v has not been CLAMPed yet
                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
                    MOV(LE, 0, height, imm(0));
                    if (shift) {
                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
                    } else {
                        MOV(GT, 0, height, stride);
                    }
                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
                    MOV(MI, 0, v, imm(0));
                    MOV(MI, 0, height, imm(0));
                }
                CONTEXT_STORE(height, generated_vars.lb);
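
                // at this point generated_vars.rt and generated_vars.lb hold
                // the byte offsets from the current texel to its right and
                // bottom neighbours (0 at a clamped edge, negative when
                // wrapping around); the bilinear filters below fetch the
                // 2x2 quad as txPtr[0], txPtr[rt], txPtr[lb], txPtr[lb+rt].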
            }

            scratches.recycle(width);
            scratches.recycle(height);

            // iterate texture coordinates...
            comment("iterate s,t");
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s.reg, s.reg, dsdx);
            ADD(AL, 0, t.reg, t.reg, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
                scratches.recycle(s.reg);
                scratches.recycle(t.reg);
            }
            scratches.recycle(dsdx);
            scratches.recycle(dtdx);

            // merge base & offset...
            comment("merge base & offset");
            texel.setTo(regs.obtain(), &tmu.format);
            txPtr.setTo(texel.reg, tmu.bits);
            int stride = scratches.obtain();

            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
                return;

            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
            SMLABB(AL, u, v, stride, u);    // u+v*stride
            base_offset(txPtr, txPtr, u);

            // load texel
            if (!tmu.linear) {
                comment("fetch texel");
                load(txPtr, texel, 0);
            } else {
                // recycle registers we don't need anymore
                scratches.recycle(u);
                scratches.recycle(v);
                scratches.recycle(stride);

                comment("fetch texel, bilinear");
                switch (tmu.format.size) {
                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
                }
            }
        }
    }
}

void GGLAssembler::build_iterate_texture_coordinates(
    const fragment_parts_t& parts)
{
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
        const texture_unit_t& tmu = mTextureMachine.tmu[i];
        if (tmu.format_idx == 0)
            continue;

        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
            (tmu.twrap == GGL_NEEDS_WRAP_11))
        { // 1:1 textures
            const pointer_t& txPtr = parts.coords[i].ptr;
            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
        } else {
            Scratch scratches(registerFile());
            int s = parts.coords[i].s.reg;
            int t = parts.coords[i].t.reg;
            if ((mOptLevel&1)==0) {
                s = scratches.obtain();
                t = scratches.obtain();
                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
            }
            int dsdx = scratches.obtain();
            int dtdx = scratches.obtain();
            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
            ADD(AL, 0, s, s, dsdx);
            ADD(AL, 0, t, t, dtdx);
            if ((mOptLevel&1)==0) {
                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
            }
        }
    }
}

void GGLAssembler::filter8(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    if (tmu.format.components != GGL_ALPHA &&
        tmu.format.components != GGL_LUMINANCE)
    {
        // this is a packed format and we don't support
        // linear filtering (it's probably RGB 332).
        // Should not happen with OpenGL|ES.
        LDRB(AL, texel.reg, txPtr.reg);
        return;
    }

    // ------------------------
    // about ~22 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();
    int rt   = scratches.obtain();
    int lb   = scratches.obtain();
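
    // The four bilinear weights are U*V (right-bottom), (1-U)*V (left-bottom),
    // (1-U)*(1-V) (left-top) and U*(1-V) (right-top). The running total k
    // starts at 1<<(FRAC_BITS*2) and has the computed weights subtracted from
    // it, so the last weight (RT) is obtained as k - u, and the four weights
    // sum exactly to 1<<(FRAC_BITS*2).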

    // RB -> U * V

    CONTEXT_LOAD(rt, generated_vars.rt);
    CONTEXT_LOAD(lb, generated_vars.lb);

    int offset = pixel;
    ADD(AL, 0, offset, lb, rt);
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    SMULBB(AL, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));

    // LB -> (1-U) * V
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRB(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    SMLABB(AL, d, pixel, u, d);

    // RT -> U*(1-V)
    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
    SUB(AL, 0, u, k, u);
    SMLABB(AL, texel.reg, pixel, u, d);

    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        texel.format.c[i].h = FRAC_BITS*2+8;
        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
    }
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_LO;
}

void GGLAssembler::filter16(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& tmu,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    // compute the mask
    // XXX: it would be nice if the mask below could be computed
    // automatically.
    uint32_t mask = 0;
    int shift = 0;
    int prec = 0;
    switch (tmu.format_idx) {
        case GGL_PIXEL_FORMAT_RGB_565:
            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
            mask = 0x07E0F81F;
            shift = 16;
            prec = 5;
            break;
        case GGL_PIXEL_FORMAT_RGBA_4444:
            // 0000,1111,0000,1111 | 0000,1111,0000,1111
            mask = 0x0F0F0F0F;
            shift = 12;
            prec = 4;
            break;
        case GGL_PIXEL_FORMAT_LA_88:
            // 0000,0000,1111,1111 | 0000,0000,1111,1111
            // AALL -> 00AA | 00LL
            mask = 0x00FF00FF;
            shift = 8;
            prec = 8;
            break;
        default:
            // unsupported format, do something sensible...
            ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
            LDRH(AL, texel.reg, txPtr.reg);
            return;
    }

    const int adjust = FRAC_BITS*2 - prec;
    const int round  = 0;

    // update the texel format
    texel.format.size = 4;
    texel.format.bitsPerPixel = 32;
    texel.flags |= CLEAR_HI|CLEAR_LO;
    for (int i=0 ; i<4 ; i++) {
        if (!texel.format.c[i].h) continue;
        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
    }
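
    // The trick below: ORRing the 16-bit pixel with a copy of itself shifted
    // left by `shift` and then ANDing with `mask` spreads the packed
    // components into disjoint fields of one 32-bit word, each with at least
    // `prec` empty bits above it. A single 32-bit multiply by a weight (at
    // most 1<<prec) then scales every component at once without them
    // bleeding into each other.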

    // ------------------------
    // about ~40 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int d    = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, d, pixel, u);
    RSB(AL, 0, k, u, imm(1<<prec));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDRH(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, d, pixel, u, d);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
    SUB(AL, 0, u, k, u);
    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
    build_and_immediate(pixel, pixel, mask, 32);
    MLA(AL, 0, texel.reg, pixel, u, d);
}

void GGLAssembler::filter24(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int /*U*/, int /*V*/, pointer_t& txPtr,
        int /*FRAC_BITS*/)
{
    // not supported yet (currently disabled)
    load(txPtr, texel, 0);
}

void GGLAssembler::filter32(
        const fragment_parts_t& /*parts*/,
        pixel_t& texel, const texture_unit_t& /*tmu*/,
        int U, int V, pointer_t& txPtr,
        int FRAC_BITS)
{
    const int adjust = FRAC_BITS*2 - 8;
    const int round  = 0;

    // ------------------------
    // about ~38 cycles / pixel
    Scratch scratches(registerFile());

    int pixel= scratches.obtain();
    int dh   = scratches.obtain();
    int u    = scratches.obtain();
    int k    = scratches.obtain();

    int temp = scratches.obtain();
    int dl   = scratches.obtain();
    int mask = scratches.obtain();

    MOV(AL, 0, mask, imm(0xFF));
    ORR(AL, 0, mask, mask, imm(0xFF0000));
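
    // mask = 0x00FF00FF. ANDing an 8888 pixel with it yields bytes 0 and 2,
    // and ANDing pixel>>8 yields bytes 1 and 3; each pair is accumulated
    // separately (dh and dl). Since an 8-bit component times an 8-bit weight
    // fits in 16 bits, two components can share one 32-bit accumulator
    // without overflowing into each other.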

    // RB -> U * V
    int offset = pixel;
    CONTEXT_LOAD(offset, generated_vars.rt);
    CONTEXT_LOAD(u, generated_vars.lb);
    ADD(AL, 0, offset, offset, u);

    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MUL(AL, 0, dh, temp, u);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MUL(AL, 0, dl, temp, u);
    RSB(AL, 0, k, u, imm(0x100));

    // LB -> (1-U) * V
    CONTEXT_LOAD(offset, generated_vars.lb);
    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);
    SUB(AL, 0, k, k, u);

    // LT -> (1-U)*(1-V)
    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
    LDR(AL, pixel, txPtr.reg);
    SMULBB(AL, u, U, V);
    AND(AL, 0, temp, mask, pixel);
    if (adjust) {
        if (round)
            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
    }
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    // RT -> U*(1-V)
    CONTEXT_LOAD(offset, generated_vars.rt);
    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
    SUB(AL, 0, u, k, u);
    AND(AL, 0, temp, mask, pixel);
    MLA(AL, 0, dh, temp, u, dh);
    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
    MLA(AL, 0, dl, temp, u, dl);

    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
    ORR(AL, 0, texel.reg, dh, dl);
}

void GGLAssembler::build_texture_environment(
        component_t& fragment,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    const uint32_t component_mask = 1<<component;
    const bool multiTexture = mTextureMachine.activeUnits > 1;
    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
        texture_unit_t& tmu = mTextureMachine.tmu[i];

        if (tmu.mask & component_mask) {
            // replace or modulate with this texture
            if ((tmu.replaced & component_mask) == 0) {
                // not replaced by a later tmu...

                Scratch scratches(registerFile());
                pixel_t texel(parts.texel[i]);

                if (multiTexture &&
                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
                    tmu.twrap == GGL_NEEDS_WRAP_11)
                {
                    texel.reg = scratches.obtain();
                    texel.flags |= CORRUPTIBLE;
                    comment("fetch texel (multitexture 1:1)");
                    load(parts.coords[i].ptr, texel, WRITE_BACK);
                }

                component_t incoming(fragment);
                modify(fragment, regs);

                switch (tmu.env) {
                case GGL_REPLACE:
                    extract(fragment, texel, component);
                    break;
                case GGL_MODULATE:
                    modulate(fragment, incoming, texel, component);
                    break;
                case GGL_DECAL:
                    decal(fragment, incoming, texel, component);
                    break;
                case GGL_BLEND:
                    blend(fragment, incoming, texel, component, i);
                    break;
                case GGL_ADD:
                    add(fragment, incoming, texel, component);
                    break;
                }
            }
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::wrapping(
            int d,
            int coord, int size,
            int tx_wrap, int tx_linear)
{
    // notes:
    // if tx_linear is set, we need 4 extra bits of precision on the result
    // SMULL/UMULL is 3 cycles
    Scratch scratches(registerFile());
    int c = coord;
    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
        // UMULL takes 4 cycles (interlocked), and we can get away with
        // 2 cycles using SMULWB, but we're losing 16 bits of precision
        // out of 32 (this is not a problem because the iterator keeps
        // its full precision)
        // UMULL(AL, 0, size, d, c, size);
        // note: we can't use SMULTB because it's signed.
        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
        SMULWB(AL, d, d, size);
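        // result: d = (c * size) >> (32 - tx_linear), i.e. the wrapped
        // coordinate in texels with tx_linear fractional bits (c being
        // treated as a 32-bit fraction of the texture size).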
    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
        if (tx_linear) {
            // 1 cycle
            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
        } else {
            // 4 cycles (common case)
            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
            CMP(AL, d, size);
            SUB(GE, 0, d, size, imm(1));
        }
    }
}

// ---------------------------------------------------------------------------

void GGLAssembler::modulate(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);

    const int Nt = texel.size();
        // Nt should always be less than 10 bits because it comes
        // from the TMU.

    int Ni = incoming.size();
        // Ni could be big because it comes from previous MODULATEs

    if (Nt == 1) {
        // texel acts as a bit-mask
        // dest = incoming & ((texel << incoming.h)-texel)
        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
        dest.l = incoming.l;
        dest.h = incoming.h;
        dest.flags |= (incoming.flags & CLEAR_LO);
    } else if (Ni == 1) {
        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
        dest.l = 0;
        dest.h = Nt;
    } else {
        int inReg = incoming.reg;
        int shift = incoming.l;
        if ((Nt + Ni) > 32) {
            // we will overflow, reduce the precision of Ni to 8 bits
            // (Note Nt cannot be more than 10 bits, which happens with
            // 565 textures and GGL_LINEAR)
            shift += Ni-8;
            Ni = 8;
        }

        // modulate by the component with the lowest precision
        if (Nt >= Ni) {
            if (shift) {
                // XXX: we should be able to avoid this shift
                // when shift==16 && Nt<16 && Ni<16, in which case
                // we could use SMULBT below.
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Ni)-1)
            // approximated with:   Ct*(Cf + Cf>>(Ni-1))>>Ni
            // this operation doesn't change texel's size
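            // (dividing by ((1<<Ni)-1) normalizes Cf: the approximation is
            // exact at full scale, since Cf + Cf>>(Ni-1) == 1<<Ni when
            // Cf == (1<<Ni)-1, so modulating by a saturated component is a
            // no-op after the implicit >>Ni)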
            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
            dest.l = Ni;
            dest.h = Nt + Ni;
        } else {
            if (shift && (shift != 16)) {
                // if shift==16, we can use 16-bit mul instructions later
                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
                inReg = dest.reg;
                shift = 0;
            }
            // operation:           (Cf*Ct)/((1<<Nt)-1)
            // approximated with:   Cf*(Ct + Ct>>(Nt-1))>>Nt
            // this operation doesn't change incoming's size
            Scratch scratches(registerFile());
            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
            if (t == inReg)
                t = scratches.obtain();
            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
            if (Nt<16 && Ni<16) {
                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
                else            SMULBB(AL, dest.reg, t, inReg);
            } else              MUL(AL, 0, dest.reg, t, inReg);
            dest.l = Nt;
            dest.h = Nt + Ni;
        }

        // low bits are not valid
        dest.flags |= CLEAR_LO;

        // no need to keep more than 8 bits/component
        if (dest.size() > 8)
            dest.l = dest.h-8;
    }
}

void GGLAssembler::decal(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
    // Av = Af
    Scratch locals(registerFile());
    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    extract(texel, incomingTexel, component);
    extract(factor, incomingTexel, GGLFormat::ALPHA);

    // no need to keep more than 8-bits for decal
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
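    // normalize the blend factor: after this ADD, a saturated alpha
    // (factor == (1<<s)-1) becomes exactly 1<<s, so full alpha behaves as
    // exactly 1.0 in the blend below (same normalization trick as in
    // modulate()).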
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
}

void GGLAssembler::blend(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component, int tmu)
{
    // RGBA:
    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
    // Av = At*Af

    if (component == GGLFormat::ALPHA) {
        modulate(dest, incoming, incomingTexel, component);
        return;
    }

    Scratch locals(registerFile());
    integer_t color(locals.obtain(), 8, CORRUPTIBLE);
    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
    LDRB(AL, color.reg, mBuilderContext.Rctx,
            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
    extract(factor, incomingTexel, component);

    // no need to keep more than 8-bits for blend
    int Ni = incoming.size();
    int shift = incoming.l;
    if (Ni > 8) {
        shift += Ni-8;
        Ni = 8;
    }
    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
    if (shift) {
        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
        incomingNorm.reg = dest.reg;
        incomingNorm.flags |= CORRUPTIBLE;
    }
    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
    build_blendOneMinusFF(dest, factor, incomingNorm, color);
}

void GGLAssembler::add(
        component_t& dest,
        const component_t& incoming,
        const pixel_t& incomingTexel, int component)
{
    // RGBA:
    // Cv = Cf + Ct;
    Scratch locals(registerFile());

    component_t incomingTemp(incoming);

    // use "dest" as a temporary for extracting the texel, unless "dest"
    // overlaps "incoming".
    integer_t texel(dest.reg, 32, CORRUPTIBLE);
    if (dest.reg == incomingTemp.reg)
        texel.reg = locals.obtain();
    extract(texel, incomingTexel, component);

    if (texel.s < incomingTemp.size()) {
        expand(texel, texel, incomingTemp.size());
    } else if (texel.s > incomingTemp.size()) {
        if (incomingTemp.flags & CORRUPTIBLE) {
            expand(incomingTemp, incomingTemp, texel.s);
        } else {
            incomingTemp.reg = locals.obtain();
            expand(incomingTemp, incoming, texel.s);
        }
    }

    if (incomingTemp.l) {
        ADD(AL, 0, dest.reg, texel.reg,
                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
    } else {
        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
    }
    dest.l = 0;
    dest.h = texel.size();
    component_sat(dest);
}

// ----------------------------------------------------------------------------

} // namespace android