1 /* libs/pixelflinger/codeflinger/GGLAssembler.cpp
2 **
3 ** Copyright 2006, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 **     http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17 
18 #define LOG_TAG "GGLAssembler"
19 
20 #include <assert.h>
21 #include <stdint.h>
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <sys/types.h>
25 #include <cutils/log.h>
26 
27 #include "GGLAssembler.h"
28 
29 namespace android {
30 
31 // ----------------------------------------------------------------------------
32 
GGLAssembler(ARMAssemblerInterface * target)33 GGLAssembler::GGLAssembler(ARMAssemblerInterface* target)
34     : ARMAssemblerProxy(target),
35       RegisterAllocator(ARMAssemblerProxy::getCodegenArch()), mOptLevel(7)
36 {
37 }
38 
~GGLAssembler()39 GGLAssembler::~GGLAssembler()
40 {
41 }
42 
prolog()43 void GGLAssembler::prolog()
44 {
45     ARMAssemblerProxy::prolog();
46 }
47 
epilog(uint32_t touched)48 void GGLAssembler::epilog(uint32_t touched)
49 {
50     ARMAssemblerProxy::epilog(touched);
51 }
52 
reset(int opt_level)53 void GGLAssembler::reset(int opt_level)
54 {
55     ARMAssemblerProxy::reset();
56     RegisterAllocator::reset();
57     mOptLevel = opt_level;
58 }
59 
60 // ---------------------------------------------------------------------------
61 
scanline(const needs_t & needs,context_t const * c)62 int GGLAssembler::scanline(const needs_t& needs, context_t const* c)
63 {
64     int err = 0;
65     int opt_level = mOptLevel;
66     while (opt_level >= 0) {
67         reset(opt_level);
68         err = scanline_core(needs, c);
69         if (err == 0)
70             break;
71         opt_level--;
72     }
73 
74     // XXX: in theory, pcForLabel is not valid before generate()
75     uint32_t* fragment_start_pc = pcForLabel("fragment_loop");
76     uint32_t* fragment_end_pc = pcForLabel("epilog");
77     const int per_fragment_ops = int(fragment_end_pc - fragment_start_pc);
78 
79     // build a name for our pipeline
80     char name[64];
81     sprintf(name,
82             "scanline__%08X:%08X_%08X_%08X [%3d ipp]",
83             needs.p, needs.n, needs.t[0], needs.t[1], per_fragment_ops);
84 
85     if (err) {
86         ALOGE("Error while generating ""%s""\n", name);
87         disassemble(name);
88         return -1;
89     }
90 
91     return generate(name);
92 }
93 
scanline_core(const needs_t & needs,context_t const * c)94 int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
95 {
96     int64_t duration = ggl_system_time();
97 
98     mBlendFactorCached = 0;
99     mBlending = 0;
100     mMasking = 0;
101     mAA        = GGL_READ_NEEDS(P_AA, needs.p);
102     mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
103     mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
104     mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
105     mFog       = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
106     mSmooth    = GGL_READ_NEEDS(SHADE, needs.n) != 0;
107     mBuilderContext.needs = needs;
108     mBuilderContext.c = c;
109     mBuilderContext.Rctx = reserveReg(R0); // context always in R0
110     mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];
111 
112     // ------------------------------------------------------------------------
113 
114     decodeLogicOpNeeds(needs);
115 
116     decodeTMUNeeds(needs, c);
117 
118     mBlendSrc  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
119     mBlendDst  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
120     mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
121     mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));
122 
123     if (!mCbFormat.c[GGLFormat::ALPHA].h) {
124         if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
125             (mBlendSrc == GGL_DST_ALPHA)) {
126             mBlendSrc = GGL_ONE;
127         }
128         if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
129             (mBlendSrcA == GGL_DST_ALPHA)) {
130             mBlendSrcA = GGL_ONE;
131         }
132         if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
133             (mBlendDst == GGL_DST_ALPHA)) {
134             mBlendDst = GGL_ONE;
135         }
136         if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
137             (mBlendDstA == GGL_DST_ALPHA)) {
138             mBlendDstA = GGL_ONE;
139         }
140     }
141 
142     // if we need the framebuffer, read it now
143     const int blending =    blending_codes(mBlendSrc, mBlendDst) |
144                             blending_codes(mBlendSrcA, mBlendDstA);
145 
146     // XXX: handle special cases, destination not modified...
147     if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
148         (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
149         // Destination unmodified (beware of logic ops)
150     } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
151         (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
152         // Destination is zero (beware of logic ops)
153     }
154 
155     int fbComponents = 0;
156     const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
157     for (int i=0 ; i<4 ; i++) {
158         const int mask = 1<<i;
159         component_info_t& info = mInfo[i];
160         int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
161         int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
162         if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
163             fs = GGL_ONE;
164         info.masked =   !!(masking & mask);
165         info.inDest =   !info.masked && mCbFormat.c[i].h &&
166                         ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
167         if (mCbFormat.components >= GGL_LUMINANCE &&
168                 (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
169             info.inDest = false;
170         }
171         info.needed =   (i==GGLFormat::ALPHA) &&
172                         (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
173         info.replaced = !!(mTextureMachine.replaced & mask);
174         info.iterated = (!info.replaced && (info.inDest || info.needed));
175         info.smooth =   mSmooth && info.iterated;
176         info.fog =      mFog && info.inDest && (i != GGLFormat::ALPHA);
177         info.blend =    (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
178 
179         mBlending |= (info.blend ? mask : 0);
180         mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
181         fbComponents |= mCbFormat.c[i].h ? mask : 0;
182     }
183 
184     mAllMasked = (mMasking == fbComponents);
185     if (mAllMasked) {
186         mDithering = 0;
187     }
188 
189     fragment_parts_t parts;
190 
191     // ------------------------------------------------------------------------
192     prolog();
193     // ------------------------------------------------------------------------
194 
195     build_scanline_prolog(parts, needs);
196 
197     if (registerFile().status())
198         return registerFile().status();
199 
200     // ------------------------------------------------------------------------
201     label("fragment_loop");
202     // ------------------------------------------------------------------------
203     {
204         Scratch regs(registerFile());
205 
206         if (mDithering) {
207             // update the dither index.
208             MOV(AL, 0, parts.count.reg,
209                     reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
210             ADD(AL, 0, parts.count.reg, parts.count.reg,
211                     imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
212             MOV(AL, 0, parts.count.reg,
213                     reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
214         }
215 
216         // XXX: could we do an early alpha-test here in some cases?
217         // It would probaly be used only with smooth-alpha and no texture
218         // (or no alpha component in the texture).
219 
220         // Early z-test
221         if (mAlphaTest==GGL_ALWAYS) {
222             build_depth_test(parts, Z_TEST|Z_WRITE);
223         } else {
224             // we cannot do the z-write here, because
225             // it might be killed by the alpha-test later
226             build_depth_test(parts, Z_TEST);
227         }
228 
229         { // texture coordinates
230             Scratch scratches(registerFile());
231 
232             // texel generation
233             build_textures(parts, regs);
234             if (registerFile().status())
235                 return registerFile().status();
236         }
237 
238         if ((blending & (FACTOR_DST|BLEND_DST)) ||
239                 (mMasking && !mAllMasked) ||
240                 (mLogicOp & LOGIC_OP_DST))
241         {
242             // blending / logic_op / masking need the framebuffer
243             mDstPixel.setTo(regs.obtain(), &mCbFormat);
244 
245             // load the framebuffer pixel
246             comment("fetch color-buffer");
247             load(parts.cbPtr, mDstPixel);
248         }
249 
250         if (registerFile().status())
251             return registerFile().status();
252 
253         pixel_t pixel;
254         int directTex = mTextureMachine.directTexture;
255         if (directTex | parts.packed) {
256             // note: we can't have both here
257             // iterated color or direct texture
258             pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
259             pixel.flags &= ~CORRUPTIBLE;
260         } else {
261             if (mDithering) {
262                 const int ctxtReg = mBuilderContext.Rctx;
263                 const int mask = GGL_DITHER_SIZE-1;
264                 parts.dither = reg_t(regs.obtain());
265                 AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
266                 ADDR_ADD(AL, 0, parts.dither.reg, ctxtReg, parts.dither.reg);
267                 LDRB(AL, parts.dither.reg, parts.dither.reg,
268                         immed12_pre(GGL_OFFSETOF(ditherMatrix)));
269             }
270 
271             // allocate a register for the resulting pixel
272             pixel.setTo(regs.obtain(), &mCbFormat, FIRST);
273 
274             build_component(pixel, parts, GGLFormat::ALPHA,    regs);
275 
276             if (mAlphaTest!=GGL_ALWAYS) {
277                 // only handle the z-write part here. We know z-test
278                 // was successful, as well as alpha-test.
279                 build_depth_test(parts, Z_WRITE);
280             }
281 
282             build_component(pixel, parts, GGLFormat::RED,      regs);
283             build_component(pixel, parts, GGLFormat::GREEN,    regs);
284             build_component(pixel, parts, GGLFormat::BLUE,     regs);
285 
286             pixel.flags |= CORRUPTIBLE;
287         }
288 
289         if (registerFile().status())
290             return registerFile().status();
291 
292         if (pixel.reg == -1) {
293             // be defensive here. if we're here it's probably
294             // that this whole fragment is a no-op.
295             pixel = mDstPixel;
296         }
297 
298         if (!mAllMasked) {
299             // logic operation
300             build_logic_op(pixel, regs);
301 
302             // masking
303             build_masking(pixel, regs);
304 
305             comment("store");
306             store(parts.cbPtr, pixel, WRITE_BACK);
307         }
308     }
309 
310     if (registerFile().status())
311         return registerFile().status();
312 
313     // update the iterated color...
314     if (parts.reload != 3) {
315         build_smooth_shade(parts);
316     }
317 
318     // update iterated z
319     build_iterate_z(parts);
320 
321     // update iterated fog
322     build_iterate_f(parts);
323 
324     SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
325     B(PL, "fragment_loop");
326     label("epilog");
327     epilog(registerFile().touched());
328 
329     if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
330         if (mDepthTest!=GGL_ALWAYS) {
331             label("discard_before_textures");
332             build_iterate_texture_coordinates(parts);
333         }
334         label("discard_after_textures");
335         build_smooth_shade(parts);
336         build_iterate_z(parts);
337         build_iterate_f(parts);
338         if (!mAllMasked) {
339             ADDR_ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
340         }
341         SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
342         B(PL, "fragment_loop");
343         epilog(registerFile().touched());
344     }
345 
346     return registerFile().status();
347 }
348 
349 // ---------------------------------------------------------------------------
350 
build_scanline_prolog(fragment_parts_t & parts,const needs_t & needs)351 void GGLAssembler::build_scanline_prolog(
352     fragment_parts_t& parts, const needs_t& needs)
353 {
354     Scratch scratches(registerFile());
355     int Rctx = mBuilderContext.Rctx;
356 
357     // compute count
358     comment("compute ct (# of pixels to process)");
359     parts.count.setTo(obtainReg());
360     int Rx = scratches.obtain();
361     int Ry = scratches.obtain();
362     CONTEXT_LOAD(Rx, iterators.xl);
363     CONTEXT_LOAD(parts.count.reg, iterators.xr);
364     CONTEXT_LOAD(Ry, iterators.y);
365 
366     // parts.count = iterators.xr - Rx
367     SUB(AL, 0, parts.count.reg, parts.count.reg, Rx);
368     SUB(AL, 0, parts.count.reg, parts.count.reg, imm(1));
369 
370     if (mDithering) {
371         // parts.count.reg = 0xNNNNXXDD
372         // NNNN = count-1
373         // DD   = dither offset
374         // XX   = 0xxxxxxx (x = garbage)
375         Scratch scratches(registerFile());
376         int tx = scratches.obtain();
377         int ty = scratches.obtain();
378         AND(AL, 0, tx, Rx, imm(GGL_DITHER_MASK));
379         AND(AL, 0, ty, Ry, imm(GGL_DITHER_MASK));
380         ADD(AL, 0, tx, tx, reg_imm(ty, LSL, GGL_DITHER_ORDER_SHIFT));
381         ORR(AL, 0, parts.count.reg, tx, reg_imm(parts.count.reg, LSL, 16));
382     } else {
383         // parts.count.reg = 0xNNNN0000
384         // NNNN = count-1
385         MOV(AL, 0, parts.count.reg, reg_imm(parts.count.reg, LSL, 16));
386     }
387 
388     if (!mAllMasked) {
389         // compute dst ptr
390         comment("compute color-buffer pointer");
391         const int cb_bits = mCbFormat.size*8;
392         int Rs = scratches.obtain();
393         parts.cbPtr.setTo(obtainReg(), cb_bits);
394         CONTEXT_LOAD(Rs, state.buffers.color.stride);
395         CONTEXT_ADDR_LOAD(parts.cbPtr.reg, state.buffers.color.data);
396         SMLABB(AL, Rs, Ry, Rs, Rx);  // Rs = Rx + Ry*Rs
397         base_offset(parts.cbPtr, parts.cbPtr, Rs);
398         scratches.recycle(Rs);
399     }
400 
401     // init fog
402     const int need_fog = GGL_READ_NEEDS(P_FOG, needs.p);
403     if (need_fog) {
404         comment("compute initial fog coordinate");
405         Scratch scratches(registerFile());
406         int dfdx = scratches.obtain();
407         int ydfdy = scratches.obtain();
408         int f = ydfdy;
409         CONTEXT_LOAD(dfdx,  generated_vars.dfdx);
410         CONTEXT_LOAD(ydfdy, iterators.ydfdy);
411         MLA(AL, 0, f, Rx, dfdx, ydfdy);
412         CONTEXT_STORE(f, generated_vars.f);
413     }
414 
415     // init Z coordinate
416     if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
417         parts.z = reg_t(obtainReg());
418         comment("compute initial Z coordinate");
419         Scratch scratches(registerFile());
420         int dzdx = scratches.obtain();
421         int ydzdy = parts.z.reg;
422         CONTEXT_LOAD(dzdx,  generated_vars.dzdx);   // 1.31 fixed-point
423         CONTEXT_LOAD(ydzdy, iterators.ydzdy);       // 1.31 fixed-point
424         MLA(AL, 0, parts.z.reg, Rx, dzdx, ydzdy);
425 
426         // we're going to index zbase of parts.count
427         // zbase = base + (xl-count + stride*y)*2
428         int Rs = dzdx;
429         int zbase = scratches.obtain();
430         CONTEXT_LOAD(Rs, state.buffers.depth.stride);
431         CONTEXT_ADDR_LOAD(zbase, state.buffers.depth.data);
432         SMLABB(AL, Rs, Ry, Rs, Rx);
433         ADD(AL, 0, Rs, Rs, reg_imm(parts.count.reg, LSR, 16));
434         ADDR_ADD(AL, 0, zbase, zbase, reg_imm(Rs, LSL, 1));
435         CONTEXT_ADDR_STORE(zbase, generated_vars.zbase);
436     }
437 
438     // init texture coordinates
439     init_textures(parts.coords, reg_t(Rx), reg_t(Ry));
440     scratches.recycle(Ry);
441 
442     // iterated color
443     init_iterated_color(parts, reg_t(Rx));
444 
445     // init coverage factor application (anti-aliasing)
446     if (mAA) {
447         parts.covPtr.setTo(obtainReg(), 16);
448         CONTEXT_ADDR_LOAD(parts.covPtr.reg, state.buffers.coverage);
449         ADDR_ADD(AL, 0, parts.covPtr.reg, parts.covPtr.reg, reg_imm(Rx, LSL, 1));
450     }
451 }
452 
453 // ---------------------------------------------------------------------------
454 
build_component(pixel_t & pixel,const fragment_parts_t & parts,int component,Scratch & regs)455 void GGLAssembler::build_component( pixel_t& pixel,
456                                     const fragment_parts_t& parts,
457                                     int component,
458                                     Scratch& regs)
459 {
460     static char const * comments[] = {"alpha", "red", "green", "blue"};
461     comment(comments[component]);
462 
463     // local register file
464     Scratch scratches(registerFile());
465     const int dst_component_size = pixel.component_size(component);
466 
467     component_t temp(-1);
468     build_incoming_component( temp, dst_component_size,
469             parts, component, scratches, regs);
470 
471     if (mInfo[component].inDest) {
472 
473         // blending...
474         build_blending( temp, mDstPixel, component, scratches );
475 
476         // downshift component and rebuild pixel...
477         downshift(pixel, component, temp, parts.dither);
478     }
479 }
480 
build_incoming_component(component_t & temp,int dst_size,const fragment_parts_t & parts,int component,Scratch & scratches,Scratch & global_regs)481 void GGLAssembler::build_incoming_component(
482                                     component_t& temp,
483                                     int dst_size,
484                                     const fragment_parts_t& parts,
485                                     int component,
486                                     Scratch& scratches,
487                                     Scratch& global_regs)
488 {
489     const uint32_t component_mask = 1<<component;
490 
491     // Figure out what we need for the blending stage...
492     int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
493     int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
494     if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) {
495         fs = GGL_ONE;
496     }
497 
498     // Figure out what we need to extract and for what reason
499     const int blending = blending_codes(fs, fd);
500 
501     // Are we actually going to blend?
502     const int need_blending = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
503 
504     // expand the source if the destination has more bits
505     int need_expander = false;
506     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT-1 ; i++) {
507         texture_unit_t& tmu = mTextureMachine.tmu[i];
508         if ((tmu.format_idx) &&
509             (parts.texel[i].component_size(component) < dst_size)) {
510             need_expander = true;
511         }
512     }
513 
514     // do we need to extract this component?
515     const bool multiTexture = mTextureMachine.activeUnits > 1;
516     const int blend_needs_alpha_source = (component==GGLFormat::ALPHA) &&
517                                         (isAlphaSourceNeeded());
518     int need_extract = mInfo[component].needed;
519     if (mInfo[component].inDest)
520     {
521         need_extract |= ((need_blending ?
522                 (blending & (BLEND_SRC|FACTOR_SRC)) : need_expander));
523         need_extract |= (mTextureMachine.mask != mTextureMachine.replaced);
524         need_extract |= mInfo[component].smooth;
525         need_extract |= mInfo[component].fog;
526         need_extract |= mDithering;
527         need_extract |= multiTexture;
528     }
529 
530     if (need_extract) {
531         Scratch& regs = blend_needs_alpha_source ? global_regs : scratches;
532         component_t fragment;
533 
534         // iterated color
535         build_iterated_color(fragment, parts, component, regs);
536 
537         // texture environement (decal, modulate, replace)
538         build_texture_environment(fragment, parts, component, regs);
539 
540         // expand the source if the destination has more bits
541         if (need_expander && (fragment.size() < dst_size)) {
542             // we're here only if we fetched a texel
543             // (so we know for sure fragment is CORRUPTIBLE)
544             expand(fragment, fragment, dst_size);
545         }
546 
547         // We have a few specific things to do for the alpha-channel
548         if ((component==GGLFormat::ALPHA) &&
549             (mInfo[component].needed || fragment.size()<dst_size))
550         {
551             // convert to integer_t first and make sure
552             // we don't corrupt a needed register
553             if (fragment.l) {
554                 component_t incoming(fragment);
555                 modify(fragment, regs);
556                 MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSR, incoming.l));
557                 fragment.h -= fragment.l;
558                 fragment.l = 0;
559             }
560 
561             // coverage factor application
562             build_coverage_application(fragment, parts, regs);
563 
564             // alpha-test
565             build_alpha_test(fragment, parts);
566 
567             if (blend_needs_alpha_source) {
568                 // We keep only 8 bits for the blending stage
569                 const int shift = fragment.h <= 8 ? 0 : fragment.h-8;
570                 if (fragment.flags & CORRUPTIBLE) {
571                     fragment.flags &= ~CORRUPTIBLE;
572                     mAlphaSource.setTo(fragment.reg,
573                             fragment.size(), fragment.flags);
574                     if (shift) {
575                         MOV(AL, 0, mAlphaSource.reg,
576                             reg_imm(mAlphaSource.reg, LSR, shift));
577                     }
578                 } else {
579                     // XXX: it would better to do this in build_blend_factor()
580                     // so we can avoid the extra MOV below.
581                     mAlphaSource.setTo(regs.obtain(),
582                             fragment.size(), CORRUPTIBLE);
583                     if (shift) {
584                         MOV(AL, 0, mAlphaSource.reg,
585                             reg_imm(fragment.reg, LSR, shift));
586                     } else {
587                         MOV(AL, 0, mAlphaSource.reg, fragment.reg);
588                     }
589                 }
590                 mAlphaSource.s -= shift;
591             }
592         }
593 
594         // fog...
595         build_fog( fragment, component, regs );
596 
597         temp = fragment;
598     } else {
599         if (mInfo[component].inDest) {
600             // extraction not needed and replace
601             // we just select the right component
602             if ((mTextureMachine.replaced & component_mask) == 0) {
603                 // component wasn't replaced, so use it!
604                 temp = component_t(parts.iterated, component);
605             }
606             for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
607                 const texture_unit_t& tmu = mTextureMachine.tmu[i];
608                 if ((tmu.mask & component_mask) &&
609                     ((tmu.replaced & component_mask) == 0)) {
610                     temp = component_t(parts.texel[i], component);
611                 }
612             }
613         }
614     }
615 }
616 
isAlphaSourceNeeded() const617 bool GGLAssembler::isAlphaSourceNeeded() const
618 {
619     // XXX: also needed for alpha-test
620     const int bs = mBlendSrc;
621     const int bd = mBlendDst;
622     return  bs==GGL_SRC_ALPHA_SATURATE ||
623             bs==GGL_SRC_ALPHA || bs==GGL_ONE_MINUS_SRC_ALPHA ||
624             bd==GGL_SRC_ALPHA || bd==GGL_ONE_MINUS_SRC_ALPHA ;
625 }
626 
627 // ---------------------------------------------------------------------------
628 
build_smooth_shade(const fragment_parts_t & parts)629 void GGLAssembler::build_smooth_shade(const fragment_parts_t& parts)
630 {
631     if (mSmooth && !parts.iterated_packed) {
632         // update the iterated color in a pipelined way...
633         comment("update iterated color");
634         Scratch scratches(registerFile());
635 
636         const int reload = parts.reload;
637         for (int i=0 ; i<4 ; i++) {
638             if (!mInfo[i].iterated)
639                 continue;
640 
641             int c = parts.argb[i].reg;
642             int dx = parts.argb_dx[i].reg;
643 
644             if (reload & 1) {
645                 c = scratches.obtain();
646                 CONTEXT_LOAD(c, generated_vars.argb[i].c);
647             }
648             if (reload & 2) {
649                 dx = scratches.obtain();
650                 CONTEXT_LOAD(dx, generated_vars.argb[i].dx);
651             }
652 
653             if (mSmooth) {
654                 ADD(AL, 0, c, c, dx);
655             }
656 
657             if (reload & 1) {
658                 CONTEXT_STORE(c, generated_vars.argb[i].c);
659                 scratches.recycle(c);
660             }
661             if (reload & 2) {
662                 scratches.recycle(dx);
663             }
664         }
665     }
666 }
667 
668 // ---------------------------------------------------------------------------
669 
build_coverage_application(component_t & fragment,const fragment_parts_t & parts,Scratch & regs)670 void GGLAssembler::build_coverage_application(component_t& fragment,
671         const fragment_parts_t& parts, Scratch& regs)
672 {
673     // here fragment.l is guarenteed to be 0
674     if (mAA) {
675         // coverages are 1.15 fixed-point numbers
676         comment("coverage application");
677 
678         component_t incoming(fragment);
679         modify(fragment, regs);
680 
681         Scratch scratches(registerFile());
682         int cf = scratches.obtain();
683         LDRH(AL, cf, parts.covPtr.reg, immed8_post(2));
684         if (fragment.h > 31) {
685             fragment.h--;
686             SMULWB(AL, fragment.reg, incoming.reg, cf);
687         } else {
688             MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSL, 1));
689             SMULWB(AL, fragment.reg, fragment.reg, cf);
690         }
691     }
692 }
693 
694 // ---------------------------------------------------------------------------
695 
build_alpha_test(component_t & fragment,const fragment_parts_t &)696 void GGLAssembler::build_alpha_test(component_t& fragment,
697                                     const fragment_parts_t& /*parts*/)
698 {
699     if (mAlphaTest != GGL_ALWAYS) {
700         comment("Alpha Test");
701         Scratch scratches(registerFile());
702         int ref = scratches.obtain();
703         const int shift = GGL_COLOR_BITS-fragment.size();
704         CONTEXT_LOAD(ref, state.alpha_test.ref);
705         if (shift) CMP(AL, fragment.reg, reg_imm(ref, LSR, shift));
706         else       CMP(AL, fragment.reg, ref);
707         int cc = NV;
708         switch (mAlphaTest) {
709         case GGL_NEVER:     cc = NV;    break;
710         case GGL_LESS:      cc = LT;    break;
711         case GGL_EQUAL:     cc = EQ;    break;
712         case GGL_LEQUAL:    cc = LS;    break;
713         case GGL_GREATER:   cc = HI;    break;
714         case GGL_NOTEQUAL:  cc = NE;    break;
715         case GGL_GEQUAL:    cc = HS;    break;
716         }
717         B(cc^1, "discard_after_textures");
718     }
719 }
720 
721 // ---------------------------------------------------------------------------
722 
build_depth_test(const fragment_parts_t & parts,uint32_t mask)723 void GGLAssembler::build_depth_test(
724         const fragment_parts_t& parts, uint32_t mask)
725 {
726     mask &= Z_TEST|Z_WRITE;
727     const needs_t& needs = mBuilderContext.needs;
728     const int zmask = GGL_READ_NEEDS(P_MASK_Z, needs.p);
729     Scratch scratches(registerFile());
730 
731     if (mDepthTest != GGL_ALWAYS || zmask) {
732         int cc=AL, ic=AL;
733         switch (mDepthTest) {
734         case GGL_LESS:      ic = HI;    break;
735         case GGL_EQUAL:     ic = EQ;    break;
736         case GGL_LEQUAL:    ic = HS;    break;
737         case GGL_GREATER:   ic = LT;    break;
738         case GGL_NOTEQUAL:  ic = NE;    break;
739         case GGL_GEQUAL:    ic = LS;    break;
740         case GGL_NEVER:
741             // this never happens, because it's taken care of when
742             // computing the needs. but we keep it for completness.
743             comment("Depth Test (NEVER)");
744             B(AL, "discard_before_textures");
745             return;
746         case GGL_ALWAYS:
747             // we're here because zmask is enabled
748             mask &= ~Z_TEST;    // test always passes.
749             break;
750         }
751 
752         // inverse the condition
753         cc = ic^1;
754 
755         if ((mask & Z_WRITE) && !zmask) {
756             mask &= ~Z_WRITE;
757         }
758 
759         if (!mask)
760             return;
761 
762         comment("Depth Test");
763 
764         int zbase = scratches.obtain();
765         int depth = scratches.obtain();
766         int z = parts.z.reg;
767 
768         CONTEXT_ADDR_LOAD(zbase, generated_vars.zbase);  // stall
769         ADDR_SUB(AL, 0, zbase, zbase, reg_imm(parts.count.reg, LSR, 15));
770             // above does zbase = zbase + ((count >> 16) << 1)
771 
772         if (mask & Z_TEST) {
773             LDRH(AL, depth, zbase);  // stall
774             CMP(AL, depth, reg_imm(z, LSR, 16));
775             B(cc, "discard_before_textures");
776         }
777         if (mask & Z_WRITE) {
778             if (mask == Z_WRITE) {
779                 // only z-write asked, cc is meaningless
780                 ic = AL;
781             }
782             MOV(AL, 0, depth, reg_imm(z, LSR, 16));
783             STRH(ic, depth, zbase);
784         }
785     }
786 }
787 
build_iterate_z(const fragment_parts_t & parts)788 void GGLAssembler::build_iterate_z(const fragment_parts_t& parts)
789 {
790     const needs_t& needs = mBuilderContext.needs;
791     if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
792         Scratch scratches(registerFile());
793         int dzdx = scratches.obtain();
794         CONTEXT_LOAD(dzdx, generated_vars.dzdx);    // stall
795         ADD(AL, 0, parts.z.reg, parts.z.reg, dzdx);
796     }
797 }
798 
build_iterate_f(const fragment_parts_t &)799 void GGLAssembler::build_iterate_f(const fragment_parts_t& /*parts*/)
800 {
801     const needs_t& needs = mBuilderContext.needs;
802     if (GGL_READ_NEEDS(P_FOG, needs.p)) {
803         Scratch scratches(registerFile());
804         int dfdx = scratches.obtain();
805         int f = scratches.obtain();
806         CONTEXT_LOAD(f,     generated_vars.f);
807         CONTEXT_LOAD(dfdx,  generated_vars.dfdx);   // stall
808         ADD(AL, 0, f, f, dfdx);
809         CONTEXT_STORE(f,    generated_vars.f);
810     }
811 }
812 
813 // ---------------------------------------------------------------------------
814 
build_logic_op(pixel_t & pixel,Scratch & regs)815 void GGLAssembler::build_logic_op(pixel_t& pixel, Scratch& regs)
816 {
817     const needs_t& needs = mBuilderContext.needs;
818     const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
819     if (opcode == GGL_COPY)
820         return;
821 
822     comment("logic operation");
823 
824     pixel_t s(pixel);
825     if (!(pixel.flags & CORRUPTIBLE)) {
826         pixel.reg = regs.obtain();
827         pixel.flags |= CORRUPTIBLE;
828     }
829 
830     pixel_t d(mDstPixel);
831     switch(opcode) {
832     case GGL_CLEAR:         MOV(AL, 0, pixel.reg, imm(0));          break;
833     case GGL_AND:           AND(AL, 0, pixel.reg, s.reg, d.reg);    break;
834     case GGL_AND_REVERSE:   BIC(AL, 0, pixel.reg, s.reg, d.reg);    break;
835     case GGL_COPY:                                                  break;
836     case GGL_AND_INVERTED:  BIC(AL, 0, pixel.reg, d.reg, s.reg);    break;
837     case GGL_NOOP:          MOV(AL, 0, pixel.reg, d.reg);           break;
838     case GGL_XOR:           EOR(AL, 0, pixel.reg, s.reg, d.reg);    break;
839     case GGL_OR:            ORR(AL, 0, pixel.reg, s.reg, d.reg);    break;
840     case GGL_NOR:           ORR(AL, 0, pixel.reg, s.reg, d.reg);
841                             MVN(AL, 0, pixel.reg, pixel.reg);       break;
842     case GGL_EQUIV:         EOR(AL, 0, pixel.reg, s.reg, d.reg);
843                             MVN(AL, 0, pixel.reg, pixel.reg);       break;
844     case GGL_INVERT:        MVN(AL, 0, pixel.reg, d.reg);           break;
845     case GGL_OR_REVERSE:    // s | ~d == ~(~s & d)
846                             BIC(AL, 0, pixel.reg, d.reg, s.reg);
847                             MVN(AL, 0, pixel.reg, pixel.reg);       break;
848     case GGL_COPY_INVERTED: MVN(AL, 0, pixel.reg, s.reg);           break;
849     case GGL_OR_INVERTED:   // ~s | d == ~(s & ~d)
850                             BIC(AL, 0, pixel.reg, s.reg, d.reg);
851                             MVN(AL, 0, pixel.reg, pixel.reg);       break;
852     case GGL_NAND:          AND(AL, 0, pixel.reg, s.reg, d.reg);
853                             MVN(AL, 0, pixel.reg, pixel.reg);       break;
854     case GGL_SET:           MVN(AL, 0, pixel.reg, imm(0));          break;
855     };
856 }
857 
858 // ---------------------------------------------------------------------------
859 
// Returns the even bit position (0, 2, ..., 30) of the lowest 2-bit group
// of `val` that contains a set bit.
//
// Returns 32 when `val` is zero. Without the bound on `i` the loop would
// never terminate for zero and the shift count would exceed the width of
// the operand, which is undefined behavior.
static uint32_t find_bottom(uint32_t val)
{
    uint32_t i = 0;
    // The pair mask is unsigned so that shifting it up to bit 31 never
    // goes through a signed value.
    while (i < 32 && !(val & (uint32_t(3) << i)))
        i += 2;
    return i;
}
867 
// Rotates `val` right, two bits at a time, until at least one of its two
// lowest bits is set and its six highest bits are all clear.
//
// val: in/out, replaced by the rotated value.
// rot: out, the number of bits `val` was rotated right (always even), so
//      the input equals the output rotated left by `rot`.
//
// If no rotation below 32 satisfies both conditions (for instance when
// `val` is zero), `val` ends up back at its input value and `rot` is 0.
static void normalize(uint32_t& val, uint32_t& rot)
{
    rot = 0;
    for (;;) {
        const bool lowPairEmpty = (val & 3) == 0;
        const bool topBitsUsed = (val & 0xFC000000) != 0;
        if (!lowPairEmpty && !topBitsUsed) {
            return;
        }

        // Rotate right by two: the low pair re-enters at bits 31:30.
        val = (val >> 2) | ((val & 3) << 30);
        rot += 2;

        if (rot == 32) {
            // Went all the way around without finding a fit.
            rot = 0;
            return;
        }
    }
}
883 
build_and_immediate(int d,int s,uint32_t mask,int bits)884 void GGLAssembler::build_and_immediate(int d, int s, uint32_t mask, int bits)
885 {
886     uint32_t rot;
887     uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
888     mask &= size;
889 
890     if (mask == size) {
891         if (d != s)
892             MOV( AL, 0, d, s);
893         return;
894     }
895 
896     if (getCodegenArch() == CODEGEN_ARCH_MIPS) {
897         // MIPS can do 16-bit imm in 1 instr, 32-bit in 3 instr
898         // the below ' while (mask)' code is buggy on mips
899         // since mips returns true on isValidImmediate()
900         // then we get multiple AND instr (positive logic)
901         AND( AL, 0, d, s, imm(mask) );
902         return;
903     }
904     else if (getCodegenArch() == CODEGEN_ARCH_ARM64) {
905         AND( AL, 0, d, s, imm(mask) );
906         return;
907     }
908 
909     int negative_logic = !isValidImmediate(mask);
910     if (negative_logic) {
911         mask = ~mask & size;
912     }
913     normalize(mask, rot);
914 
915     if (mask) {
916         while (mask) {
917             uint32_t bitpos = find_bottom(mask);
918             int shift = rot + bitpos;
919             uint32_t m = mask & (0xff << bitpos);
920             mask &= ~m;
921             m >>= bitpos;
922             int32_t newMask =  (m<<shift) | (m>>(32-shift));
923             if (!negative_logic) {
924                 AND( AL, 0, d, s, imm(newMask) );
925             } else {
926                 BIC( AL, 0, d, s, imm(newMask) );
927             }
928             s = d;
929         }
930     } else {
931         MOV( AL, 0, d, imm(0));
932     }
933 }
934 
build_masking(pixel_t & pixel,Scratch & regs)935 void GGLAssembler::build_masking(pixel_t& pixel, Scratch& regs)
936 {
937     if (!mMasking || mAllMasked) {
938         return;
939     }
940 
941     comment("color mask");
942 
943     pixel_t fb(mDstPixel);
944     pixel_t s(pixel);
945     if (!(pixel.flags & CORRUPTIBLE)) {
946         pixel.reg = regs.obtain();
947         pixel.flags |= CORRUPTIBLE;
948     }
949 
950     int mask = 0;
951     for (int i=0 ; i<4 ; i++) {
952         const int component_mask = 1<<i;
953         const int h = fb.format.c[i].h;
954         const int l = fb.format.c[i].l;
955         if (h && (!(mMasking & component_mask))) {
956             mask |= ((1<<(h-l))-1) << l;
957         }
958     }
959 
960     // There is no need to clear the masked components of the source
961     // (unless we applied a logic op), because they're already zeroed
962     // by construction (masked components are not computed)
963 
964     if (mLogicOp) {
965         const needs_t& needs = mBuilderContext.needs;
966         const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
967         if (opcode != GGL_CLEAR) {
968             // clear masked component of source
969             build_and_immediate(pixel.reg, s.reg, mask, fb.size());
970             s = pixel;
971         }
972     }
973 
974     // clear non masked components of destination
975     build_and_immediate(fb.reg, fb.reg, ~mask, fb.size());
976 
977     // or back the channels that were masked
978     if (s.reg == fb.reg) {
979          // this is in fact a MOV
980         if (s.reg == pixel.reg) {
981             // ugh. this in in fact a nop
982         } else {
983             MOV(AL, 0, pixel.reg, fb.reg);
984         }
985     } else {
986         ORR(AL, 0, pixel.reg, s.reg, fb.reg);
987     }
988 }
989 
990 // ---------------------------------------------------------------------------
991 
base_offset(const pointer_t & d,const pointer_t & b,const reg_t & o)992 void GGLAssembler::base_offset(
993         const pointer_t& d, const pointer_t& b, const reg_t& o)
994 {
995     switch (b.size) {
996     case 32:
997         ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 2));
998         break;
999     case 24:
1000         if (d.reg == b.reg) {
1001             ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
1002             ADDR_ADD(AL, 0, d.reg, d.reg, o.reg);
1003         } else {
1004             ADDR_ADD(AL, 0, d.reg, o.reg, reg_imm(o.reg, LSL, 1));
1005             ADDR_ADD(AL, 0, d.reg, d.reg, b.reg);
1006         }
1007         break;
1008     case 16:
1009         ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
1010         break;
1011     case 8:
1012         ADDR_ADD(AL, 0, d.reg, b.reg, o.reg);
1013         break;
1014     }
1015 }
1016 
1017 // ----------------------------------------------------------------------------
1018 // cheezy register allocator...
1019 // ----------------------------------------------------------------------------
1020 
// Modified to support MIPS processors in a very simple way. We retain the
// (ARM) limit of 16 total registers, but shift the mapping of those registers
// from 0-15 to 2-17. (Register 0 on MIPS cannot be used as a general-purpose
// register, and register 1 has a traditional use as a temp.)
1025 
// Constructs the allocator; `arch` is passed unchanged to the constructor
// of the underlying register file (mRegs).
RegisterAllocator::RegisterAllocator(int arch) : mRegs(arch)
{
}
1029 
// Resets the underlying register file (see RegisterFile::reset()).
void RegisterAllocator::reset()
{
    mRegs.reset();
}
1034 
// Reserves the specific register `reg` in the register file and returns
// the value RegisterFile::reserve() reports for it.
int RegisterAllocator::reserveReg(int reg)
{
    return mRegs.reserve(reg);
}
1039 
// Allocates any free register from the register file and returns it
// (see RegisterFile::obtain() for what happens when none is free).
int RegisterAllocator::obtainReg()
{
    return mRegs.obtain();
}
1044 
// Returns register `reg` to the register file so it can be handed out again.
void RegisterAllocator::recycleReg(int reg)
{
    mRegs.recycle(reg);
}
1049 
// Gives direct, mutable access to the underlying register file.
RegisterAllocator::RegisterFile& RegisterAllocator::registerFile()
{
    return mRegs;
}
1054 
1055 // ----------------------------------------------------------------------------
1056 
RegisterFile(int codegen_arch)1057 RegisterAllocator::RegisterFile::RegisterFile(int codegen_arch)
1058     : mRegs(0), mTouched(0), mStatus(0), mArch(codegen_arch), mRegisterOffset(0)
1059 {
1060     if (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) {
1061         mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
1062     }
1063     reserve(ARMAssemblerInterface::SP);
1064     reserve(ARMAssemblerInterface::PC);
1065 }
1066 
RegisterFile(const RegisterFile & rhs,int codegen_arch)1067 RegisterAllocator::RegisterFile::RegisterFile(const RegisterFile& rhs, int codegen_arch)
1068     : mRegs(rhs.mRegs), mTouched(rhs.mTouched), mArch(codegen_arch), mRegisterOffset(0)
1069 {
1070     if (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) {
1071         mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
1072     }
1073 }
1074 
// Nothing to release: the destructor body is empty.
RegisterAllocator::RegisterFile::~RegisterFile()
{
}
1078 
// Two register files compare equal when their in-use masks (mRegs) match.
// The touched mask, status flags and architecture are not compared.
bool RegisterAllocator::RegisterFile::operator == (const RegisterFile& rhs) const
{
    return (mRegs == rhs.mRegs);
}
1083 
reset()1084 void RegisterAllocator::RegisterFile::reset()
1085 {
1086     mRegs = mTouched = mStatus = 0;
1087     reserve(ARMAssemblerInterface::SP);
1088     reserve(ARMAssemblerInterface::PC);
1089 }
1090 
1091 // RegisterFile::reserve() take a register parameter in the
1092 // range 0-15 (Arm compatible), but on a Mips processor, will
1093 // return the actual allocated register in the range 2-17.
reserve(int reg)1094 int RegisterAllocator::RegisterFile::reserve(int reg)
1095 {
1096     reg += mRegisterOffset;
1097     LOG_ALWAYS_FATAL_IF(isUsed(reg),
1098                         "reserving register %d, but already in use",
1099                         reg);
1100     mRegs |= (1<<reg);
1101     mTouched |= mRegs;
1102     return reg;
1103 }
1104 
// Marks every register whose bit is set in `regMask` as both in use and
// touched. Unlike reserve(), no check is made that the registers are free.
//
// This interface uses regMask in range 2-17 on MIPS, no translation: the
// register offset is not applied to the mask.
void RegisterAllocator::RegisterFile::reserveSeveral(uint32_t regMask)
{
    mRegs |= regMask;
    mTouched |= regMask;
}
1111 
// Returns non-zero when register `reg` is currently in use.
//
// `reg` is used directly as a bit index into mRegs, i.e. it must already
// include the register offset. The return value is the masked bit itself,
// not a normalized 0/1.
//
// Aborts through LOG_ALWAYS_FATAL_IF when `reg` is 16 + the register
// offset or higher. Only the upper bound is checked.
int RegisterAllocator::RegisterFile::isUsed(int reg) const
{
    LOG_ALWAYS_FATAL_IF(reg>=16+(int)mRegisterOffset, "invalid register %d", reg);
    return mRegs & (1<<reg);
}
1117 
obtain()1118 int RegisterAllocator::RegisterFile::obtain()
1119 {
1120     const char priorityList[14] = {  0,  1, 2, 3,
1121                                     12, 14, 4, 5,
1122                                      6,  7, 8, 9,
1123                                     10, 11 };
1124     const int nbreg = sizeof(priorityList);
1125     int i, r, reg;
1126     for (i=0 ; i<nbreg ; i++) {
1127         r = priorityList[i];
1128         if (!isUsed(r + mRegisterOffset)) {
1129             break;
1130         }
1131     }
1132     // this is not an error anymore because, we'll try again with
1133     // a lower optimization level.
1134     //ALOGE_IF(i >= nbreg, "pixelflinger ran out of registers\n");
1135     if (i >= nbreg) {
1136         mStatus |= OUT_OF_REGISTERS;
1137         // we return SP so we can more easily debug things
1138         // the code will never be run anyway.
1139         return ARMAssemblerInterface::SP;
1140     }
1141     reg = reserve(r);  // Param in Arm range 0-15, returns range 2-17 on Mips.
1142     return reg;
1143 }
1144 
hasFreeRegs() const1145 bool RegisterAllocator::RegisterFile::hasFreeRegs() const
1146 {
1147     uint32_t regs = mRegs >> mRegisterOffset;   // MIPS fix.
1148     return ((regs & 0xFFFF) == 0xFFFF) ? false : true;
1149 }
1150 
countFreeRegs() const1151 int RegisterAllocator::RegisterFile::countFreeRegs() const
1152 {
1153     uint32_t regs = mRegs >> mRegisterOffset;   // MIPS fix.
1154     int f = ~regs & 0xFFFF;
1155     // now count number of 1
1156    f = (f & 0x5555) + ((f>>1) & 0x5555);
1157    f = (f & 0x3333) + ((f>>2) & 0x3333);
1158    f = (f & 0x0F0F) + ((f>>4) & 0x0F0F);
1159    f = (f & 0x00FF) + ((f>>8) & 0x00FF);
1160    return f;
1161 }
1162 
// Marks register `reg` as free again by clearing its bit in mRegs.
//
// `reg` is used directly as a bit index: no register offset is added here.
// The touched mask is left unchanged.
void RegisterAllocator::RegisterFile::recycle(int reg)
{
    // The check below is commented out because the common failure of
    // running out of registers triggers this assertion. Since the code is
    // not executed in that case, it does not matter. No reason to FATAL err.
    // LOG_FATAL_IF(!isUsed(reg),
    //         "recycling unallocated register %d",
    //         reg);
    mRegs &= ~(1<<reg);
}
1173 
// Marks every register whose bit is set in `regMask` as free again.
// The mask is applied to mRegs as is; the touched mask is left unchanged.
void RegisterAllocator::RegisterFile::recycleSeveral(uint32_t regMask)
{
    // The check below is commented out because the common failure of
    // running out of registers triggers this assertion. Since the code is
    // not executed in that case, it does not matter. No reason to FATAL err.
    // LOG_FATAL_IF((mRegs & regMask)!=regMask,
    //         "recycling unallocated registers "
    //         "(recycle=%08x, allocated=%08x, unallocated=%08x)",
    //         regMask, mRegs, mRegs&regMask);
    mRegs &= ~regMask;
}
1185 
// Returns the touched mask: the bits accumulated by reserve() and
// reserveSeveral(). recycle() and recycleSeveral() do not clear them.
uint32_t RegisterAllocator::RegisterFile::touched() const
{
    return mTouched;
}
1190 
1191 // ----------------------------------------------------------------------------
1192 
1193 }; // namespace android
1194 
1195