1 /* libs/pixelflinger/scanline.cpp
2 **
3 ** Copyright 2006-2011, The Android Open Source Project
4 **
5 ** Licensed under the Apache License, Version 2.0 (the "License");
6 ** you may not use this file except in compliance with the License.
7 ** You may obtain a copy of the License at
8 **
9 **     http://www.apache.org/licenses/LICENSE-2.0
10 **
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 */
17 
18 
19 #define LOG_TAG "pixelflinger"
20 
21 #include <assert.h>
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <string.h>
25 
26 #include <cutils/memory.h>
27 #include <cutils/log.h>
28 
29 #ifdef __arm__
30 #include <machine/cpu-features.h>
31 #endif
32 
33 #include "buffer.h"
34 #include "scanline.h"
35 
36 #include "codeflinger/CodeCache.h"
37 #include "codeflinger/GGLAssembler.h"
38 #if defined(__arm__)
39 #include "codeflinger/ARMAssembler.h"
40 #elif defined(__aarch64__)
41 #include "codeflinger/Arm64Assembler.h"
42 #elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
43 #include "codeflinger/MIPSAssembler.h"
44 #elif defined(__mips__) && defined(__LP64__)
45 #include "codeflinger/MIPS64Assembler.h"
46 #endif
47 //#include "codeflinger/ARMAssemblerOptimizer.h"
48 
49 // ----------------------------------------------------------------------------
50 
51 #define ANDROID_CODEGEN_GENERIC     0   // force generic pixel pipeline
52 #define ANDROID_CODEGEN_C           1   // hand-written C, fallback generic
53 #define ANDROID_CODEGEN_ASM         2   // hand-written asm, fallback generic
54 #define ANDROID_CODEGEN_GENERATED   3   // hand-written asm, fallback codegen
55 
56 #ifdef NDEBUG
57 #   define ANDROID_RELEASE
58 #   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
59 #else
60 #   define ANDROID_DEBUG
61 #   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
62 #endif
63 
64 #if defined(__arm__) || (defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))) || defined(__aarch64__)
65 #   define ANDROID_ARM_CODEGEN  1
66 #else
67 #   define ANDROID_ARM_CODEGEN  0
68 #endif
69 
70 #define DEBUG__CODEGEN_ONLY     0
71 
72 /* Set to 1 to dump to the log the states that need a new
73  * code-generated scanline callback, i.e. those that don't
74  * have a corresponding shortcut function.
75  */
76 #define DEBUG_NEEDS  0
77 
78 #if defined( __mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))
79 #define ASSEMBLY_SCRATCH_SIZE   4096
80 #elif defined(__aarch64__)
81 #define ASSEMBLY_SCRATCH_SIZE   8192
82 #else
83 #define ASSEMBLY_SCRATCH_SIZE   2048
84 #endif
85 
86 // ----------------------------------------------------------------------------
87 namespace android {
88 // ----------------------------------------------------------------------------
89 
90 static void init_y(context_t*, int32_t);
91 static void init_y_noop(context_t*, int32_t);
92 static void init_y_packed(context_t*, int32_t);
93 static void init_y_error(context_t*, int32_t);
94 
95 static void step_y__generic(context_t* c);
96 static void step_y__nop(context_t*);
97 static void step_y__smooth(context_t* c);
98 static void step_y__tmu(context_t* c);
99 static void step_y__w(context_t* c);
100 
101 static void scanline(context_t* c);
102 static void scanline_perspective(context_t* c);
103 static void scanline_perspective_single(context_t* c);
104 static void scanline_t32cb16blend(context_t* c);
105 static void scanline_t32cb16blend_dither(context_t* c);
106 static void scanline_t32cb16blend_srca(context_t* c);
107 static void scanline_t32cb16blend_clamp(context_t* c);
108 static void scanline_t32cb16blend_clamp_dither(context_t* c);
109 static void scanline_t32cb16blend_clamp_mod(context_t* c);
110 static void scanline_x32cb16blend_clamp_mod(context_t* c);
111 static void scanline_t32cb16blend_clamp_mod_dither(context_t* c);
112 static void scanline_x32cb16blend_clamp_mod_dither(context_t* c);
113 static void scanline_t32cb16(context_t* c);
114 static void scanline_t32cb16_dither(context_t* c);
115 static void scanline_t32cb16_clamp(context_t* c);
116 static void scanline_t32cb16_clamp_dither(context_t* c);
117 static void scanline_col32cb16blend(context_t* c);
118 static void scanline_t16cb16_clamp(context_t* c);
119 static void scanline_t16cb16blend_clamp_mod(context_t* c);
120 static void scanline_memcpy(context_t* c);
121 static void scanline_memset8(context_t* c);
122 static void scanline_memset16(context_t* c);
123 static void scanline_memset32(context_t* c);
124 static void scanline_noop(context_t* c);
125 static void scanline_set(context_t* c);
126 static void scanline_clear(context_t* c);
127 
128 static void rect_generic(context_t* c, size_t yc);
129 static void rect_memcpy(context_t* c, size_t yc);
130 
131 #if defined( __arm__)
132 extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
133 extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
134 extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
135 extern "C" void scanline_col32cb16blend_arm(uint16_t *dst, uint32_t col, size_t ct);
136 #elif defined(__aarch64__)
137 extern "C" void scanline_t32cb16blend_arm64(uint16_t*, uint32_t*, size_t);
138 extern "C" void scanline_col32cb16blend_arm64(uint16_t *dst, uint32_t col, size_t ct);
139 #elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
140 extern "C" void scanline_t32cb16blend_mips(uint16_t*, uint32_t*, size_t);
141 #elif defined(__mips__) && defined(__LP64__)
142 extern "C" void scanline_t32cb16blend_mips64(uint16_t*, uint32_t*, size_t);
143 extern "C" void scanline_col32cb16blend_mips64(uint16_t *dst, uint32_t col, size_t ct);
144 #endif
145 
146 // ----------------------------------------------------------------------------
147 
/*
 * Pack a 32-bit pixel into a 16-bit 5-6-5 value.
 *
 * The top 5 bits of the lowest byte of 'pix' land in bits 11..15 of the
 * result, the top 6 bits of the second byte in bits 5..10 and the top 5 bits
 * of the third byte in bits 0..4. The highest byte of 'pix' (alpha in the
 * ABGR8888 naming) is discarded. No rounding or dithering is applied.
 */
static inline uint16_t convertAbgr8888ToRgb565(uint32_t pix)
{
    const uint32_t field_hi5  = (pix << 8)  & 0xf800;   // pix bits  3..7  -> 11..15
    const uint32_t field_mid6 = (pix >> 5)  & 0x07e0;   // pix bits 10..15 ->  5..10
    const uint32_t field_lo5  = (pix >> 19) & 0x001f;   // pix bits 19..23 ->  0..4
    return (uint16_t)(field_hi5 | field_mid6 | field_lo5);
}
154 
155 struct shortcut_t {
156     needs_filter_t  filter;
157     const char*     desc;
158     void            (*scanline)(context_t*);
159     void            (*init_y)(context_t*, int32_t);
160 };
161 
162 // Keep in sync with needs
163 
164 /* To understand the values here, have a look at:
165  *     system/core/include/private/pixelflinger/ggl_context.h
166  *
167  * Especially the lines defining and using GGL_RESERVE_NEEDS
168  *
169  * Quick reminders:
170  *   - the last nibble of the first value is the destination buffer format.
171  *   - the last nibble of the third value is the source texture format
172  *   - formats: 4=rgb565 1=abgr8888 2=xbgr8888
173  *
174  * In the descriptions below:
175  *
176  *   SRC      means we copy the source pixels to the destination
177  *
178  *   SRC_OVER means we blend the source pixels to the destination
179  *            with dstFactor = 1-srcA, srcFactor=1  (premultiplied source).
180  *            This mode is otherwise called 'blend'.
181  *
 *   SRCA_OVER means we blend the source pixels to the destination
 *             with dstFactor=1-srcA srcFactor=srcA (non-premul source).
 *             This mode is otherwise called 'blend_srca'
185  *
186  *   clamp    means we fetch source pixels from a texture with u/v clamping
187  *
188  *   mod      means the source pixels are modulated (multiplied) by the
189  *            a/r/g/b of the current context's color. Typically used for
190  *            fade-in / fade-out.
191  *
192  *   dither   means we dither 32 bit values to 16 bits
193  */
194 static shortcut_t shortcuts[] = {
195     { { { 0x03515104, 0x00000077, { 0x00000A01, 0x00000000 } },
196         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
197         "565 fb, 8888 tx, blend SRC_OVER", scanline_t32cb16blend, init_y_noop },
198     { { { 0x03010104, 0x00000077, { 0x00000A01, 0x00000000 } },
199         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
200         "565 fb, 8888 tx, SRC", scanline_t32cb16, init_y_noop  },
201     /* same as first entry, but with dithering */
202     { { { 0x03515104, 0x00000177, { 0x00000A01, 0x00000000 } },
203         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
204         "565 fb, 8888 tx, blend SRC_OVER dither", scanline_t32cb16blend_dither, init_y_noop },
205     /* same as second entry, but with dithering */
206     { { { 0x03010104, 0x00000177, { 0x00000A01, 0x00000000 } },
207         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
208         "565 fb, 8888 tx, SRC dither", scanline_t32cb16_dither, init_y_noop  },
209     /* this is used during the boot animation - CHEAT: ignore dithering */
210     { { { 0x03545404, 0x00000077, { 0x00000A01, 0x00000000 } },
211         { 0xFFFFFFFF, 0xFFFFFEFF, { 0xFFFFFFFF, 0x0000003F } } },
212         "565 fb, 8888 tx, blend dst:ONE_MINUS_SRCA src:SRCA", scanline_t32cb16blend_srca, init_y_noop },
213     /* special case for arbitrary texture coordinates (think scaling) */
214     { { { 0x03515104, 0x00000077, { 0x00000001, 0x00000000 } },
215         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
216         "565 fb, 8888 tx, SRC_OVER clamp", scanline_t32cb16blend_clamp, init_y },
217     { { { 0x03515104, 0x00000177, { 0x00000001, 0x00000000 } },
218         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
219         "565 fb, 8888 tx, SRC_OVER clamp dither", scanline_t32cb16blend_clamp_dither, init_y },
220     /* another case used during emulation */
221     { { { 0x03515104, 0x00000077, { 0x00001001, 0x00000000 } },
222         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
223         "565 fb, 8888 tx, SRC_OVER clamp modulate", scanline_t32cb16blend_clamp_mod, init_y },
224     /* and this */
225     { { { 0x03515104, 0x00000077, { 0x00001002, 0x00000000 } },
226         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
227         "565 fb, x888 tx, SRC_OVER clamp modulate", scanline_x32cb16blend_clamp_mod, init_y },
228     { { { 0x03515104, 0x00000177, { 0x00001001, 0x00000000 } },
229         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
230         "565 fb, 8888 tx, SRC_OVER clamp modulate dither", scanline_t32cb16blend_clamp_mod_dither, init_y },
231     { { { 0x03515104, 0x00000177, { 0x00001002, 0x00000000 } },
232         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
233         "565 fb, x888 tx, SRC_OVER clamp modulate dither", scanline_x32cb16blend_clamp_mod_dither, init_y },
234     { { { 0x03010104, 0x00000077, { 0x00000001, 0x00000000 } },
235         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
236         "565 fb, 8888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
237     { { { 0x03010104, 0x00000077, { 0x00000002, 0x00000000 } },
238         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
239         "565 fb, x888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
240     { { { 0x03010104, 0x00000177, { 0x00000001, 0x00000000 } },
241         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
242         "565 fb, 8888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
243     { { { 0x03010104, 0x00000177, { 0x00000002, 0x00000000 } },
244         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
245         "565 fb, x888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
246     { { { 0x03010104, 0x00000077, { 0x00000004, 0x00000000 } },
247         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
248         "565 fb, 565 tx, SRC clamp", scanline_t16cb16_clamp, init_y  },
249     { { { 0x03515104, 0x00000077, { 0x00001004, 0x00000000 } },
250         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
251         "565 fb, 565 tx, SRC_OVER clamp", scanline_t16cb16blend_clamp_mod, init_y  },
252     { { { 0x03515104, 0x00000077, { 0x00000000, 0x00000000 } },
253         { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0xFFFFFFFF } } },
254         "565 fb, 8888 fixed color", scanline_col32cb16blend, init_y_packed  },
255     { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
256         { 0x00000000, 0x00000007, { 0x00000000, 0x00000000 } } },
257         "(nop) alpha test", scanline_noop, init_y_noop },
258     { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
259         { 0x00000000, 0x00000070, { 0x00000000, 0x00000000 } } },
260         "(nop) depth test", scanline_noop, init_y_noop },
261     { { { 0x05000000, 0x00000000, { 0x00000000, 0x00000000 } },
262         { 0x0F000000, 0x00000080, { 0x00000000, 0x00000000 } } },
263         "(nop) logic_op", scanline_noop, init_y_noop },
264     { { { 0xF0000000, 0x00000000, { 0x00000000, 0x00000000 } },
265         { 0xF0000000, 0x00000080, { 0x00000000, 0x00000000 } } },
266         "(nop) color mask", scanline_noop, init_y_noop },
267     { { { 0x0F000000, 0x00000077, { 0x00000000, 0x00000000 } },
268         { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
269         "(set) logic_op", scanline_set, init_y_noop },
270     { { { 0x00000000, 0x00000077, { 0x00000000, 0x00000000 } },
271         { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
272         "(clear) logic_op", scanline_clear, init_y_noop },
273     { { { 0x03000000, 0x00000077, { 0x00000000, 0x00000000 } },
274         { 0xFFFFFF00, 0x000000F7, { 0x00000000, 0x00000000 } } },
275         "(clear) blending 0/0", scanline_clear, init_y_noop },
276     { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
277         { 0x0000003F, 0x00000000, { 0x00000000, 0x00000000 } } },
278         "(error) invalid color-buffer format", scanline_noop, init_y_error },
279 };
280 static const needs_filter_t noblend1to1 = {
281         // (disregard dithering, see below)
282         { 0x03010100, 0x00000077, { 0x00000A00, 0x00000000 } },
283         { 0xFFFFFFC0, 0xFFFFFEFF, { 0xFFFFFFC0, 0x0000003F } }
284 };
285 static  const needs_filter_t fill16noblend = {
286         { 0x03010100, 0x00000077, { 0x00000000, 0x00000000 } },
287         { 0xFFFFFFC0, 0xFFFFFFFF, { 0x0000003F, 0x0000003F } }
288 };
289 
290 // ----------------------------------------------------------------------------
291 
292 #if ANDROID_ARM_CODEGEN
293 
294 #if defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))
295 static CodeCache gCodeCache(32 * 1024);
296 #elif defined(__aarch64__)
297 static CodeCache gCodeCache(48 * 1024);
298 #else
299 static CodeCache gCodeCache(12 * 1024);
300 #endif
301 
302 class ScanlineAssembly : public Assembly {
303     AssemblyKey<needs_t> mKey;
304 public:
ScanlineAssembly(needs_t needs,size_t size)305     ScanlineAssembly(needs_t needs, size_t size)
306         : Assembly(size), mKey(needs) { }
key() const307     const AssemblyKey<needs_t>& key() const { return mKey; }
308 };
309 #endif
310 
311 // ----------------------------------------------------------------------------
312 
ggl_init_scanline(context_t * c)313 void ggl_init_scanline(context_t* c)
314 {
315     c->init_y = init_y;
316     c->step_y = step_y__generic;
317     c->scanline = scanline;
318 }
319 
ggl_uninit_scanline(context_t * c)320 void ggl_uninit_scanline(context_t* c)
321 {
322     if (c->state.buffers.coverage)
323         free(c->state.buffers.coverage);
324 #if ANDROID_ARM_CODEGEN
325     if (c->scanline_as)
326         c->scanline_as->decStrong(c);
327 #endif
328 }
329 
330 // ----------------------------------------------------------------------------
331 
pick_scanline(context_t * c)332 static void pick_scanline(context_t* c)
333 {
334 #if (!defined(DEBUG__CODEGEN_ONLY) || (DEBUG__CODEGEN_ONLY == 0))
335 
336 #if ANDROID_CODEGEN == ANDROID_CODEGEN_GENERIC
337     c->init_y = init_y;
338     c->step_y = step_y__generic;
339     c->scanline = scanline;
340     return;
341 #endif
342 
343     //printf("*** needs [%08lx:%08lx:%08lx:%08lx]\n",
344     //    c->state.needs.n, c->state.needs.p,
345     //    c->state.needs.t[0], c->state.needs.t[1]);
346 
347     // first handle the special case that we cannot test with a filter
348     const uint32_t cb_format = GGL_READ_NEEDS(CB_FORMAT, c->state.needs.n);
349     if (GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0]) == cb_format) {
350         if (c->state.needs.match(noblend1to1)) {
351             // this will match regardless of dithering state, since both
352             // src and dest have the same format anyway, there is no dithering
353             // to be done.
354             const GGLFormat* f =
355                 &(c->formats[GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0])]);
356             if ((f->components == GGL_RGB) ||
357                 (f->components == GGL_RGBA) ||
358                 (f->components == GGL_LUMINANCE) ||
359                 (f->components == GGL_LUMINANCE_ALPHA))
360             {
361                 // format must have all of RGB components
362                 // (so the current color doesn't show through)
363                 c->scanline = scanline_memcpy;
364                 c->init_y = init_y_noop;
365                 return;
366             }
367         }
368     }
369 
370     if (c->state.needs.match(fill16noblend)) {
371         c->init_y = init_y_packed;
372         switch (c->formats[cb_format].size) {
373         case 1: c->scanline = scanline_memset8;  return;
374         case 2: c->scanline = scanline_memset16; return;
375         case 4: c->scanline = scanline_memset32; return;
376         }
377     }
378 
379     const int numFilters = sizeof(shortcuts)/sizeof(shortcut_t);
380     for (int i=0 ; i<numFilters ; i++) {
381         if (c->state.needs.match(shortcuts[i].filter)) {
382             c->scanline = shortcuts[i].scanline;
383             c->init_y = shortcuts[i].init_y;
384             return;
385         }
386     }
387 
388 #if DEBUG_NEEDS
389     ALOGI("Needs: n=0x%08x p=0x%08x t0=0x%08x t1=0x%08x",
390          c->state.needs.n, c->state.needs.p,
391          c->state.needs.t[0], c->state.needs.t[1]);
392 #endif
393 
394 #endif // DEBUG__CODEGEN_ONLY
395 
396     c->init_y = init_y;
397     c->step_y = step_y__generic;
398 
399 #if ANDROID_ARM_CODEGEN
400     // we're going to have to generate some code...
401     // here, generate code for our pixel pipeline
402     const AssemblyKey<needs_t> key(c->state.needs);
403     sp<Assembly> assembly = gCodeCache.lookup(key);
404     if (assembly == 0) {
405         // create a new assembly region
406         sp<ScanlineAssembly> a = new ScanlineAssembly(c->state.needs,
407                 ASSEMBLY_SCRATCH_SIZE);
408         // initialize our assembler
409 #if defined(__arm__)
410         GGLAssembler assembler( new ARMAssembler(a) );
411         //GGLAssembler assembler(
412         //        new ARMAssemblerOptimizer(new ARMAssembler(a)) );
413 #endif
414 #if defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
415         GGLAssembler assembler( new ArmToMipsAssembler(a) );
416 #elif defined(__mips__) && defined(__LP64__)
417         GGLAssembler assembler( new ArmToMips64Assembler(a) );
418 #elif defined(__aarch64__)
419         GGLAssembler assembler( new ArmToArm64Assembler(a) );
420 #endif
421         // generate the scanline code for the given needs
422         bool err = assembler.scanline(c->state.needs, c) != 0;
423         if (ggl_likely(!err)) {
424             // finally, cache this assembly
425             err = gCodeCache.cache(a->key(), a) < 0;
426         }
427         if (ggl_unlikely(err)) {
428             ALOGE("error generating or caching assembly. Reverting to NOP.");
429             c->scanline = scanline_noop;
430             c->init_y = init_y_noop;
431             c->step_y = step_y__nop;
432             return;
433         }
434         assembly = a;
435     }
436 
437     // release the previous assembly
438     if (c->scanline_as) {
439         c->scanline_as->decStrong(c);
440     }
441 
442     //ALOGI("using generated pixel-pipeline");
443     c->scanline_as = assembly.get();
444     c->scanline_as->incStrong(c); //  hold on to assembly
445     c->scanline = (void(*)(context_t* c))assembly->base();
446 #else
447 //    ALOGW("using generic (slow) pixel-pipeline");
448     c->scanline = scanline;
449 #endif
450 }
451 
ggl_pick_scanline(context_t * c)452 void ggl_pick_scanline(context_t* c)
453 {
454     pick_scanline(c);
455     if ((c->state.enables & GGL_ENABLE_W) &&
456         (c->state.enables & GGL_ENABLE_TMUS))
457     {
458         c->span = c->scanline;
459         c->scanline = scanline_perspective;
460         if (!(c->state.enabled_tmu & (c->state.enabled_tmu - 1))) {
461             // only one TMU enabled
462             c->scanline = scanline_perspective_single;
463         }
464     }
465 }
466 
467 // ----------------------------------------------------------------------------
468 
469 static void blending(context_t* c, pixel_t* fragment, pixel_t* fb);
470 static void blend_factor(context_t* c, pixel_t* r, uint32_t factor,
471         const pixel_t* src, const pixel_t* dst);
472 static void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv);
473 
474 #if ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
475 
476 // no need to compile the generic-pipeline, it can't be reached
scanline(context_t *)477 void scanline(context_t*)
478 {
479 }
480 
481 #else
482 
rescale(uint32_t & u,uint8_t & su,uint32_t & v,uint8_t & sv)483 void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv)
484 {
485     if (su && sv) {
486         if (su > sv) {
487             v = ggl_expand(v, sv, su);
488             sv = su;
489         } else if (su < sv) {
490             u = ggl_expand(u, su, sv);
491             su = sv;
492         }
493     }
494 }
495 
blending(context_t * c,pixel_t * fragment,pixel_t * fb)496 void blending(context_t* c, pixel_t* fragment, pixel_t* fb)
497 {
498     rescale(fragment->c[0], fragment->s[0], fb->c[0], fb->s[0]);
499     rescale(fragment->c[1], fragment->s[1], fb->c[1], fb->s[1]);
500     rescale(fragment->c[2], fragment->s[2], fb->c[2], fb->s[2]);
501     rescale(fragment->c[3], fragment->s[3], fb->c[3], fb->s[3]);
502 
503     pixel_t sf, df;
504     blend_factor(c, &sf, c->state.blend.src, fragment, fb);
505     blend_factor(c, &df, c->state.blend.dst, fragment, fb);
506 
507     fragment->c[1] =
508             gglMulAddx(fragment->c[1], sf.c[1], gglMulx(fb->c[1], df.c[1]));
509     fragment->c[2] =
510             gglMulAddx(fragment->c[2], sf.c[2], gglMulx(fb->c[2], df.c[2]));
511     fragment->c[3] =
512             gglMulAddx(fragment->c[3], sf.c[3], gglMulx(fb->c[3], df.c[3]));
513 
514     if (c->state.blend.alpha_separate) {
515         blend_factor(c, &sf, c->state.blend.src_alpha, fragment, fb);
516         blend_factor(c, &df, c->state.blend.dst_alpha, fragment, fb);
517     }
518 
519     fragment->c[0] =
520             gglMulAddx(fragment->c[0], sf.c[0], gglMulx(fb->c[0], df.c[0]));
521 
522     // clamp to 1.0
523     if (fragment->c[0] >= (1LU<<fragment->s[0]))
524         fragment->c[0] = (1<<fragment->s[0])-1;
525     if (fragment->c[1] >= (1LU<<fragment->s[1]))
526         fragment->c[1] = (1<<fragment->s[1])-1;
527     if (fragment->c[2] >= (1LU<<fragment->s[2]))
528         fragment->c[2] = (1<<fragment->s[2])-1;
529     if (fragment->c[3] >= (1LU<<fragment->s[3]))
530         fragment->c[3] = (1<<fragment->s[3])-1;
531 }
532 
blendfactor(uint32_t x,uint32_t size,uint32_t def=0)533 static inline int blendfactor(uint32_t x, uint32_t size, uint32_t def = 0)
534 {
535     if (!size)
536         return def;
537 
538     // scale to 16 bits
539     if (size > 16) {
540         x >>= (size - 16);
541     } else if (size < 16) {
542         x = ggl_expand(x, size, 16);
543     }
544     x += x >> 15;
545     return x;
546 }
547 
blend_factor(context_t *,pixel_t * r,uint32_t factor,const pixel_t * src,const pixel_t * dst)548 void blend_factor(context_t* /*c*/, pixel_t* r,
549         uint32_t factor, const pixel_t* src, const pixel_t* dst)
550 {
551     switch (factor) {
552         case GGL_ZERO:
553             r->c[1] =
554             r->c[2] =
555             r->c[3] =
556             r->c[0] = 0;
557             break;
558         case GGL_ONE:
559             r->c[1] =
560             r->c[2] =
561             r->c[3] =
562             r->c[0] = FIXED_ONE;
563             break;
564         case GGL_DST_COLOR:
565             r->c[1] = blendfactor(dst->c[1], dst->s[1]);
566             r->c[2] = blendfactor(dst->c[2], dst->s[2]);
567             r->c[3] = blendfactor(dst->c[3], dst->s[3]);
568             r->c[0] = blendfactor(dst->c[0], dst->s[0]);
569             break;
570         case GGL_SRC_COLOR:
571             r->c[1] = blendfactor(src->c[1], src->s[1]);
572             r->c[2] = blendfactor(src->c[2], src->s[2]);
573             r->c[3] = blendfactor(src->c[3], src->s[3]);
574             r->c[0] = blendfactor(src->c[0], src->s[0]);
575             break;
576         case GGL_ONE_MINUS_DST_COLOR:
577             r->c[1] = FIXED_ONE - blendfactor(dst->c[1], dst->s[1]);
578             r->c[2] = FIXED_ONE - blendfactor(dst->c[2], dst->s[2]);
579             r->c[3] = FIXED_ONE - blendfactor(dst->c[3], dst->s[3]);
580             r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0]);
581             break;
582         case GGL_ONE_MINUS_SRC_COLOR:
583             r->c[1] = FIXED_ONE - blendfactor(src->c[1], src->s[1]);
584             r->c[2] = FIXED_ONE - blendfactor(src->c[2], src->s[2]);
585             r->c[3] = FIXED_ONE - blendfactor(src->c[3], src->s[3]);
586             r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0]);
587             break;
588         case GGL_SRC_ALPHA:
589             r->c[1] =
590             r->c[2] =
591             r->c[3] =
592             r->c[0] = blendfactor(src->c[0], src->s[0], FIXED_ONE);
593             break;
594         case GGL_ONE_MINUS_SRC_ALPHA:
595             r->c[1] =
596             r->c[2] =
597             r->c[3] =
598             r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0], FIXED_ONE);
599             break;
600         case GGL_DST_ALPHA:
601             r->c[1] =
602             r->c[2] =
603             r->c[3] =
604             r->c[0] = blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
605             break;
606         case GGL_ONE_MINUS_DST_ALPHA:
607             r->c[1] =
608             r->c[2] =
609             r->c[3] =
610             r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
611             break;
612         case GGL_SRC_ALPHA_SATURATE:
613             // XXX: GGL_SRC_ALPHA_SATURATE
614             break;
615     }
616 }
617 
wrapping(int32_t coord,uint32_t size,int tx_wrap)618 static GGLfixed wrapping(int32_t coord, uint32_t size, int tx_wrap)
619 {
620     GGLfixed d;
621     if (tx_wrap == GGL_REPEAT) {
622         d = (uint32_t(coord)>>16) * size;
623     } else if (tx_wrap == GGL_CLAMP) { // CLAMP_TO_EDGE semantics
624         const GGLfixed clamp_min = FIXED_HALF;
625         const GGLfixed clamp_max = (size << 16) - FIXED_HALF;
626         if (coord < clamp_min)     coord = clamp_min;
627         if (coord > clamp_max)     coord = clamp_max;
628         d = coord;
629     } else { // 1:1
630         const GGLfixed clamp_min = 0;
631         const GGLfixed clamp_max = (size << 16);
632         if (coord < clamp_min)     coord = clamp_min;
633         if (coord > clamp_max)     coord = clamp_max;
634         d = coord;
635     }
636     return d;
637 }
638 
639 static inline
ADJUST_COLOR_ITERATOR(GGLcolor v,GGLcolor dvdx,int len)640 GGLcolor ADJUST_COLOR_ITERATOR(GGLcolor v, GGLcolor dvdx, int len)
641 {
642     const int32_t end = dvdx * (len-1) + v;
643     if (end < 0)
644         v -= end;
645     v &= ~(v>>31);
646     return v;
647 }
648 
scanline(context_t * c)649 void scanline(context_t* c)
650 {
651     const uint32_t enables = c->state.enables;
652     const int xs = c->iterators.xl;
653     const int x1 = c->iterators.xr;
654 	int xc = x1 - xs;
655     const int16_t* covPtr = c->state.buffers.coverage + xs;
656 
657     // All iterated values are sampled at the pixel center
658 
659     // reset iterators for that scanline...
660     GGLcolor r, g, b, a;
661     iterators_t& ci = c->iterators;
662     if (enables & GGL_ENABLE_SMOOTH) {
663         r = (xs * c->shade.drdx) + ci.ydrdy;
664         g = (xs * c->shade.dgdx) + ci.ydgdy;
665         b = (xs * c->shade.dbdx) + ci.ydbdy;
666         a = (xs * c->shade.dadx) + ci.ydady;
667         r = ADJUST_COLOR_ITERATOR(r, c->shade.drdx, xc);
668         g = ADJUST_COLOR_ITERATOR(g, c->shade.dgdx, xc);
669         b = ADJUST_COLOR_ITERATOR(b, c->shade.dbdx, xc);
670         a = ADJUST_COLOR_ITERATOR(a, c->shade.dadx, xc);
671     } else {
672         r = ci.ydrdy;
673         g = ci.ydgdy;
674         b = ci.ydbdy;
675         a = ci.ydady;
676     }
677 
678     // z iterators are 1.31
679     GGLfixed z = (xs * c->shade.dzdx) + ci.ydzdy;
680     GGLfixed f = (xs * c->shade.dfdx) + ci.ydfdy;
681 
682     struct {
683         GGLfixed s, t;
684     } tc[GGL_TEXTURE_UNIT_COUNT];
685     if (enables & GGL_ENABLE_TMUS) {
686         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
687             if (c->state.texture[i].enable) {
688                 texture_iterators_t& ti = c->state.texture[i].iterators;
689                 if (enables & GGL_ENABLE_W) {
690                     tc[i].s = ti.ydsdy;
691                     tc[i].t = ti.ydtdy;
692                 } else {
693                     tc[i].s = (xs * ti.dsdx) + ti.ydsdy;
694                     tc[i].t = (xs * ti.dtdx) + ti.ydtdy;
695                 }
696             }
697         }
698     }
699 
700     pixel_t fragment;
701     pixel_t texel;
702     pixel_t fb;
703 
704 	uint32_t x = xs;
705 	uint32_t y = c->iterators.y;
706 
707 	while (xc--) {
708 
709         { // just a scope
710 
711 		// read color (convert to 8 bits by keeping only the integer part)
712         fragment.s[1] = fragment.s[2] =
713         fragment.s[3] = fragment.s[0] = 8;
714         fragment.c[1] = r >> (GGL_COLOR_BITS-8);
715         fragment.c[2] = g >> (GGL_COLOR_BITS-8);
716         fragment.c[3] = b >> (GGL_COLOR_BITS-8);
717         fragment.c[0] = a >> (GGL_COLOR_BITS-8);
718 
719 		// texturing
720         if (enables & GGL_ENABLE_TMUS) {
721             for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
722                 texture_t& tx = c->state.texture[i];
723                 if (!tx.enable)
724                     continue;
725                 texture_iterators_t& ti = tx.iterators;
726                 int32_t u, v;
727 
728                 // s-coordinate
729                 if (tx.s_coord != GGL_ONE_TO_ONE) {
730                     const int w = tx.surface.width;
731                     u = wrapping(tc[i].s, w, tx.s_wrap);
732                     tc[i].s += ti.dsdx;
733                 } else {
734                     u = (((tx.shade.is0>>16) + x)<<16) + FIXED_HALF;
735                 }
736 
737                 // t-coordinate
738                 if (tx.t_coord != GGL_ONE_TO_ONE) {
739                     const int h = tx.surface.height;
740                     v = wrapping(tc[i].t, h, tx.t_wrap);
741                     tc[i].t += ti.dtdx;
742                 } else {
743                     v = (((tx.shade.it0>>16) + y)<<16) + FIXED_HALF;
744                 }
745 
746                 // read texture
747                 if (tx.mag_filter == GGL_NEAREST &&
748                     tx.min_filter == GGL_NEAREST)
749                 {
750                     u >>= 16;
751                     v >>= 16;
752                     tx.surface.read(&tx.surface, c, u, v, &texel);
753                 } else {
754                     const int w = tx.surface.width;
755                     const int h = tx.surface.height;
756                     u -= FIXED_HALF;
757                     v -= FIXED_HALF;
758                     int u0 = u >> 16;
759                     int v0 = v >> 16;
760                     int u1 = u0 + 1;
761                     int v1 = v0 + 1;
762                     if (tx.s_wrap == GGL_REPEAT) {
763                         if (u0<0)  u0 += w;
764                         if (u1<0)  u1 += w;
765                         if (u0>=w) u0 -= w;
766                         if (u1>=w) u1 -= w;
767                     } else {
768                         if (u0<0)  u0 = 0;
769                         if (u1<0)  u1 = 0;
770                         if (u0>=w) u0 = w-1;
771                         if (u1>=w) u1 = w-1;
772                     }
773                     if (tx.t_wrap == GGL_REPEAT) {
774                         if (v0<0)  v0 += h;
775                         if (v1<0)  v1 += h;
776                         if (v0>=h) v0 -= h;
777                         if (v1>=h) v1 -= h;
778                     } else {
779                         if (v0<0)  v0 = 0;
780                         if (v1<0)  v1 = 0;
781                         if (v0>=h) v0 = h-1;
782                         if (v1>=h) v1 = h-1;
783                     }
784                     pixel_t texels[4];
785                     uint32_t mm[4];
786                     tx.surface.read(&tx.surface, c, u0, v0, &texels[0]);
787                     tx.surface.read(&tx.surface, c, u0, v1, &texels[1]);
788                     tx.surface.read(&tx.surface, c, u1, v0, &texels[2]);
789                     tx.surface.read(&tx.surface, c, u1, v1, &texels[3]);
790                     u = (u >> 12) & 0xF;
791                     v = (v >> 12) & 0xF;
792                     u += u>>3;
793                     v += v>>3;
794                     mm[0] = (0x10 - u) * (0x10 - v);
795                     mm[1] = (0x10 - u) * v;
796                     mm[2] = u * (0x10 - v);
797                     mm[3] = 0x100 - (mm[0] + mm[1] + mm[2]);
798                     for (int j=0 ; j<4 ; j++) {
799                         texel.s[j] = texels[0].s[j];
800                         if (!texel.s[j]) continue;
801                         texel.s[j] += 8;
802                         texel.c[j] =    texels[0].c[j]*mm[0] +
803                                         texels[1].c[j]*mm[1] +
804                                         texels[2].c[j]*mm[2] +
805                                         texels[3].c[j]*mm[3] ;
806                     }
807                 }
808 
809                 // Texture environnement...
810                 for (int j=0 ; j<4 ; j++) {
811                     uint32_t& Cf = fragment.c[j];
812                     uint32_t& Ct = texel.c[j];
813                     uint8_t& sf  = fragment.s[j];
814                     uint8_t& st  = texel.s[j];
815                     uint32_t At = texel.c[0];
816                     uint8_t sat = texel.s[0];
817                     switch (tx.env) {
818                     case GGL_REPLACE:
819                         if (st) {
820                             Cf = Ct;
821                             sf = st;
822                         }
823                         break;
824                     case GGL_MODULATE:
825                         if (st) {
826                             uint32_t factor = Ct + (Ct>>(st-1));
827                             Cf = (Cf * factor) >> st;
828                         }
829                         break;
830                     case GGL_DECAL:
831                         if (sat) {
832                             rescale(Cf, sf, Ct, st);
833                             Cf += ((Ct - Cf) * (At + (At>>(sat-1)))) >> sat;
834                         }
835                         break;
836                     case GGL_BLEND:
837                         if (st) {
838                             uint32_t Cc = tx.env_color[i];
839                             if (sf>8)       Cc = (Cc * ((1<<sf)-1))>>8;
840                             else if (sf<8)  Cc = (Cc - (Cc>>(8-sf)))>>(8-sf);
841                             uint32_t factor = Ct + (Ct>>(st-1));
842                             Cf = ((((1<<st) - factor) * Cf) + Ct*Cc)>>st;
843                         }
844                         break;
845                     case GGL_ADD:
846                         if (st) {
847                             rescale(Cf, sf, Ct, st);
848                             Cf += Ct;
849                         }
850                         break;
851                     }
852                 }
853             }
854 		}
855 
856         // coverage application
857         if (enables & GGL_ENABLE_AA) {
858             int16_t cf = *covPtr++;
859             fragment.c[0] = (int64_t(fragment.c[0]) * cf) >> 15;
860         }
861 
862         // alpha-test
863         if (enables & GGL_ENABLE_ALPHA_TEST) {
864             GGLcolor ref = c->state.alpha_test.ref;
865             GGLcolor alpha = (uint64_t(fragment.c[0]) *
866                     ((1<<GGL_COLOR_BITS)-1)) / ((1<<fragment.s[0])-1);
867             switch (c->state.alpha_test.func) {
868             case GGL_NEVER:     goto discard;
869             case GGL_LESS:      if (alpha<ref)  break; goto discard;
870             case GGL_EQUAL:     if (alpha==ref) break; goto discard;
871             case GGL_LEQUAL:    if (alpha<=ref) break; goto discard;
872             case GGL_GREATER:   if (alpha>ref)  break; goto discard;
873             case GGL_NOTEQUAL:  if (alpha!=ref) break; goto discard;
874             case GGL_GEQUAL:    if (alpha>=ref) break; goto discard;
875             }
876         }
877 
878         // depth test
879         if (c->state.buffers.depth.format) {
880             if (enables & GGL_ENABLE_DEPTH_TEST) {
881                 surface_t* cb = &(c->state.buffers.depth);
882                 uint16_t* p = (uint16_t*)(cb->data)+(x+(cb->stride*y));
883                 uint16_t zz = uint32_t(z)>>(16);
884                 uint16_t depth = *p;
885                 switch (c->state.depth_test.func) {
886                 case GGL_NEVER:     goto discard;
887                 case GGL_LESS:      if (zz<depth)    break; goto discard;
888                 case GGL_EQUAL:     if (zz==depth)   break; goto discard;
889                 case GGL_LEQUAL:    if (zz<=depth)   break; goto discard;
890                 case GGL_GREATER:   if (zz>depth)    break; goto discard;
891                 case GGL_NOTEQUAL:  if (zz!=depth)   break; goto discard;
892                 case GGL_GEQUAL:    if (zz>=depth)   break; goto discard;
893                 }
894                 // depth buffer is not enabled, if depth-test is not enabled
895 /*
896         fragment.s[1] = fragment.s[2] =
897         fragment.s[3] = fragment.s[0] = 8;
898         fragment.c[1] =
899         fragment.c[2] =
900         fragment.c[3] =
901         fragment.c[0] = 255 - (zz>>8);
902 */
903                 if (c->state.mask.depth) {
904                     *p = zz;
905                 }
906             }
907         }
908 
909         // fog
910         if (enables & GGL_ENABLE_FOG) {
911             for (int i=1 ; i<=3 ; i++) {
912                 GGLfixed fc = (c->state.fog.color[i] * 0x10000) / 0xFF;
913                 uint32_t& c = fragment.c[i];
914                 uint8_t& s  = fragment.s[i];
915                 c = (c * 0x10000) / ((1<<s)-1);
916                 c = gglMulAddx(c, f, gglMulx(fc, 0x10000 - f));
917                 s = 16;
918             }
919         }
920 
921         // blending
922         if (enables & GGL_ENABLE_BLENDING) {
923             fb.c[1] = fb.c[2] = fb.c[3] = fb.c[0] = 0; // placate valgrind
924             fb.s[1] = fb.s[2] = fb.s[3] = fb.s[0] = 0;
925             c->state.buffers.color.read(
926                     &(c->state.buffers.color), c, x, y, &fb);
927             blending( c, &fragment, &fb );
928         }
929 
930 		// write
931         c->state.buffers.color.write(
932                 &(c->state.buffers.color), c, x, y, &fragment);
933         }
934 
935 discard:
936 		// iterate...
937         x += 1;
938         if (enables & GGL_ENABLE_SMOOTH) {
939             r += c->shade.drdx;
940             g += c->shade.dgdx;
941             b += c->shade.dbdx;
942             a += c->shade.dadx;
943         }
944         z += c->shade.dzdx;
945         f += c->shade.dfdx;
946 	}
947 }
948 
949 #endif // ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
950 
951 // ----------------------------------------------------------------------------
952 #if 0
953 #pragma mark -
954 #pragma mark Scanline
955 #endif
956 
957 /* Used to parse a 32-bit source texture linearly. Usage is:
958  *
959  * horz_iterator32  hi(context);
960  * while (...) {
961  *    uint32_t  src_pixel = hi.get_pixel32();
962  *    ...
963  * }
964  *
965  * Use only for one-to-one texture mapping.
966  */
967 struct horz_iterator32 {
horz_iterator32android::horz_iterator32968     horz_iterator32(context_t* c) {
969         const int x = c->iterators.xl;
970         const int y = c->iterators.y;
971         texture_t& tx = c->state.texture[0];
972         const int32_t u = (tx.shade.is0>>16) + x;
973         const int32_t v = (tx.shade.it0>>16) + y;
974         m_src = reinterpret_cast<uint32_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
975     }
get_pixel32android::horz_iterator32976     uint32_t  get_pixel32() {
977         return *m_src++;
978     }
979 protected:
980     uint32_t* m_src;
981 };
982 
983 /* A variant for 16-bit source textures. */
984 struct horz_iterator16 {
horz_iterator16android::horz_iterator16985     horz_iterator16(context_t* c) {
986         const int x = c->iterators.xl;
987         const int y = c->iterators.y;
988         texture_t& tx = c->state.texture[0];
989         const int32_t u = (tx.shade.is0>>16) + x;
990         const int32_t v = (tx.shade.it0>>16) + y;
991         m_src = reinterpret_cast<uint16_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
992     }
get_pixel16android::horz_iterator16993     uint16_t  get_pixel16() {
994         return *m_src++;
995     }
996 protected:
997     uint16_t* m_src;
998 };
999 
1000 /* A clamp iterator is used to iterate inside a texture with GGL_CLAMP.
1001  * After initialization, call get_src16() or get_src32() to get the current
1002  * texture pixel value.
1003  */
1004 struct clamp_iterator {
clamp_iteratorandroid::clamp_iterator1005     clamp_iterator(context_t* c) {
1006         const int xs = c->iterators.xl;
1007         texture_t& tx = c->state.texture[0];
1008         texture_iterators_t& ti = tx.iterators;
1009         m_s = (xs * ti.dsdx) + ti.ydsdy;
1010         m_t = (xs * ti.dtdx) + ti.ydtdy;
1011         m_ds = ti.dsdx;
1012         m_dt = ti.dtdx;
1013         m_width_m1 = tx.surface.width - 1;
1014         m_height_m1 = tx.surface.height - 1;
1015         m_data = tx.surface.data;
1016         m_stride = tx.surface.stride;
1017     }
get_pixel16android::clamp_iterator1018     uint16_t get_pixel16() {
1019         int  u, v;
1020         get_uv(u, v);
1021         uint16_t* src = reinterpret_cast<uint16_t*>(m_data) + (u + (m_stride*v));
1022         return src[0];
1023     }
get_pixel32android::clamp_iterator1024     uint32_t get_pixel32() {
1025         int  u, v;
1026         get_uv(u, v);
1027         uint32_t* src = reinterpret_cast<uint32_t*>(m_data) + (u + (m_stride*v));
1028         return src[0];
1029     }
1030 private:
get_uvandroid::clamp_iterator1031     void   get_uv(int& u, int& v) {
1032         int  uu = m_s >> 16;
1033         int  vv = m_t >> 16;
1034         if (uu < 0)
1035             uu = 0;
1036         if (uu > m_width_m1)
1037             uu = m_width_m1;
1038         if (vv < 0)
1039             vv = 0;
1040         if (vv > m_height_m1)
1041             vv = m_height_m1;
1042         u = uu;
1043         v = vv;
1044         m_s += m_ds;
1045         m_t += m_dt;
1046     }
1047 
1048     GGLfixed  m_s, m_t;
1049     GGLfixed  m_ds, m_dt;
1050     int       m_width_m1, m_height_m1;
1051     uint8_t*  m_data;
1052     int       m_stride;
1053 };
1054 
/*
 * The 'horizontal clamp iterator' variant corresponds to the case where
 * the 'v' coordinate doesn't change. This is useful to avoid one multiply
 * and extra adds / checks per pixel, if the blending/processing operation
 * after this is very fast.
 */
is_context_horizontal(const context_t * c)1061 static int is_context_horizontal(const context_t* c) {
1062     return (c->state.texture[0].iterators.dtdx == 0);
1063 }
1064 
1065 struct horz_clamp_iterator {
get_pixel16android::horz_clamp_iterator1066     uint16_t  get_pixel16() {
1067         int  u = m_s >> 16;
1068         m_s += m_ds;
1069         if (u < 0)
1070             u = 0;
1071         if (u > m_width_m1)
1072             u = m_width_m1;
1073         const uint16_t* src = reinterpret_cast<const uint16_t*>(m_data);
1074         return src[u];
1075     }
get_pixel32android::horz_clamp_iterator1076     uint32_t  get_pixel32() {
1077         int  u = m_s >> 16;
1078         m_s += m_ds;
1079         if (u < 0)
1080             u = 0;
1081         if (u > m_width_m1)
1082             u = m_width_m1;
1083         const uint32_t* src = reinterpret_cast<const uint32_t*>(m_data);
1084         return src[u];
1085     }
1086 protected:
1087     void init(const context_t* c, int shift);
1088     GGLfixed       m_s;
1089     GGLfixed       m_ds;
1090     int            m_width_m1;
1091     const uint8_t* m_data;
1092 };
1093 
init(const context_t * c,int shift)1094 void horz_clamp_iterator::init(const context_t* c, int shift)
1095 {
1096     const int xs = c->iterators.xl;
1097     const texture_t& tx = c->state.texture[0];
1098     const texture_iterators_t& ti = tx.iterators;
1099     m_s = (xs * ti.dsdx) + ti.ydsdy;
1100     m_ds = ti.dsdx;
1101     m_width_m1 = tx.surface.width-1;
1102     m_data = tx.surface.data;
1103 
1104     GGLfixed t = (xs * ti.dtdx) + ti.ydtdy;
1105     int      v = t >> 16;
1106     if (v < 0)
1107         v = 0;
1108     else if (v >= (int)tx.surface.height)
1109         v = (int)tx.surface.height-1;
1110 
1111     m_data += (tx.surface.stride*v) << shift;
1112 }
1113 
1114 struct horz_clamp_iterator16 : horz_clamp_iterator {
horz_clamp_iterator16android::horz_clamp_iterator161115     horz_clamp_iterator16(const context_t* c) {
1116         init(c,1);
1117     };
1118 };
1119 
1120 struct horz_clamp_iterator32 : horz_clamp_iterator {
horz_clamp_iterator32android::horz_clamp_iterator321121     horz_clamp_iterator32(context_t* c) {
1122         init(c,2);
1123     };
1124 };
1125 
1126 /* This is used to perform dithering operations.
1127  */
1128 struct ditherer {
dithererandroid::ditherer1129     ditherer(const context_t* c) {
1130         const int x = c->iterators.xl;
1131         const int y = c->iterators.y;
1132         m_line = &c->ditherMatrix[ ((y & GGL_DITHER_MASK)<<GGL_DITHER_ORDER_SHIFT) ];
1133         m_index = x & GGL_DITHER_MASK;
1134     }
stepandroid::ditherer1135     void step(void) {
1136         m_index++;
1137     }
get_valueandroid::ditherer1138     int  get_value(void) {
1139         int ret = m_line[m_index & GGL_DITHER_MASK];
1140         m_index++;
1141         return ret;
1142     }
abgr8888ToRgb565android::ditherer1143     uint16_t abgr8888ToRgb565(uint32_t s) {
1144         uint32_t r = s & 0xff;
1145         uint32_t g = (s >> 8) & 0xff;
1146         uint32_t b = (s >> 16) & 0xff;
1147         return rgb888ToRgb565(r,g,b);
1148     }
1149     /* The following assumes that r/g/b are in the 0..255 range each */
rgb888ToRgb565android::ditherer1150     uint16_t rgb888ToRgb565(uint32_t& r, uint32_t& g, uint32_t &b) {
1151         int threshold = get_value();
1152         /* dither in on GGL_DITHER_BITS, and each of r, g, b is on 8 bits */
1153         r += (threshold >> (GGL_DITHER_BITS-8 +5));
1154         g += (threshold >> (GGL_DITHER_BITS-8 +6));
1155         b += (threshold >> (GGL_DITHER_BITS-8 +5));
1156         if (r > 0xff)
1157             r = 0xff;
1158         if (g > 0xff)
1159             g = 0xff;
1160         if (b > 0xff)
1161             b = 0xff;
1162         return uint16_t(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
1163     }
1164 protected:
1165     const uint8_t* m_line;
1166     int            m_index;
1167 };
1168 
1169 /* This structure is used to blend (SRC_OVER) 32-bit source pixels
1170  * onto 16-bit destination ones. Usage is simply:
1171  *
1172  *   blender.blend(<32-bit-src-pixel-value>,<ptr-to-16-bit-dest-pixel>)
1173  */
1174 struct blender_32to16 {
blender_32to16android::blender_32to161175     blender_32to16(context_t* /*c*/) { }
writeandroid::blender_32to161176     void write(uint32_t s, uint16_t* dst) {
1177         if (s == 0)
1178             return;
1179         s = GGL_RGBA_TO_HOST(s);
1180         int sA = (s>>24);
1181         if (sA == 0xff) {
1182             *dst = convertAbgr8888ToRgb565(s);
1183         } else {
1184             int f = 0x100 - (sA + (sA>>7));
1185             int sR = (s >> (   3))&0x1F;
1186             int sG = (s >> ( 8+2))&0x3F;
1187             int sB = (s >> (16+3))&0x1F;
1188             uint16_t d = *dst;
1189             int dR = (d>>11)&0x1f;
1190             int dG = (d>>5)&0x3f;
1191             int dB = (d)&0x1f;
1192             sR += (f*dR)>>8;
1193             sG += (f*dG)>>8;
1194             sB += (f*dB)>>8;
1195             *dst = uint16_t((sR<<11)|(sG<<5)|sB);
1196         }
1197     }
writeandroid::blender_32to161198     void write(uint32_t s, uint16_t* dst, ditherer& di) {
1199         if (s == 0) {
1200             di.step();
1201             return;
1202         }
1203         s = GGL_RGBA_TO_HOST(s);
1204         int sA = (s>>24);
1205         if (sA == 0xff) {
1206             *dst = di.abgr8888ToRgb565(s);
1207         } else {
1208             int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
1209             int f = 0x100 - (sA + (sA>>7));
1210             int sR = (s >> (   3))&0x1F;
1211             int sG = (s >> ( 8+2))&0x3F;
1212             int sB = (s >> (16+3))&0x1F;
1213             uint16_t d = *dst;
1214             int dR = (d>>11)&0x1f;
1215             int dG = (d>>5)&0x3f;
1216             int dB = (d)&0x1f;
1217             sR = ((sR << 8) + f*dR + threshold)>>8;
1218             sG = ((sG << 8) + f*dG + threshold)>>8;
1219             sB = ((sB << 8) + f*dB + threshold)>>8;
1220             if (sR > 0x1f) sR = 0x1f;
1221             if (sG > 0x3f) sG = 0x3f;
1222             if (sB > 0x1f) sB = 0x1f;
1223             *dst = uint16_t((sR<<11)|(sG<<5)|sB);
1224         }
1225     }
1226 };
1227 
1228 /* This blender does the same for the 'blend_srca' operation.
1229  * where dstFactor=srcA*(1-srcA) srcFactor=srcA
1230  */
1231 struct blender_32to16_srcA {
blender_32to16_srcAandroid::blender_32to16_srcA1232     blender_32to16_srcA(const context_t* /*c*/) { }
writeandroid::blender_32to16_srcA1233     void write(uint32_t s, uint16_t* dst) {
1234         if (!s) {
1235             return;
1236         }
1237         uint16_t d = *dst;
1238         s = GGL_RGBA_TO_HOST(s);
1239         int sR = (s >> (   3))&0x1F;
1240         int sG = (s >> ( 8+2))&0x3F;
1241         int sB = (s >> (16+3))&0x1F;
1242         int sA = (s>>24);
1243         int f1 = (sA + (sA>>7));
1244         int f2 = 0x100-f1;
1245         int dR = (d>>11)&0x1f;
1246         int dG = (d>>5)&0x3f;
1247         int dB = (d)&0x1f;
1248         sR = (f1*sR + f2*dR)>>8;
1249         sG = (f1*sG + f2*dG)>>8;
1250         sB = (f1*sB + f2*dB)>>8;
1251         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
1252     }
1253 };
1254 
1255 /* Common init code the modulating blenders */
1256 struct blender_modulate {
initandroid::blender_modulate1257     void init(const context_t* c) {
1258         const int r = c->iterators.ydrdy >> (GGL_COLOR_BITS-8);
1259         const int g = c->iterators.ydgdy >> (GGL_COLOR_BITS-8);
1260         const int b = c->iterators.ydbdy >> (GGL_COLOR_BITS-8);
1261         const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
1262         m_r = r + (r >> 7);
1263         m_g = g + (g >> 7);
1264         m_b = b + (b >> 7);
1265         m_a = a + (a >> 7);
1266     }
1267 protected:
1268     int m_r, m_g, m_b, m_a;
1269 };
1270 
1271 /* This blender does a normal blend after modulation.
1272  */
1273 struct blender_32to16_modulate : blender_modulate {
blender_32to16_modulateandroid::blender_32to16_modulate1274     blender_32to16_modulate(const context_t* c) {
1275         init(c);
1276     }
writeandroid::blender_32to16_modulate1277     void write(uint32_t s, uint16_t* dst) {
1278         // blend source and destination
1279         if (!s) {
1280             return;
1281         }
1282         s = GGL_RGBA_TO_HOST(s);
1283 
1284         /* We need to modulate s */
1285         uint32_t  sA = (s >> 24);
1286         uint32_t  sB = (s >> 16) & 0xff;
1287         uint32_t  sG = (s >> 8) & 0xff;
1288         uint32_t  sR = s & 0xff;
1289 
1290         sA = (sA*m_a) >> 8;
1291         /* Keep R/G/B scaled to 5.8 or 6.8 fixed float format */
1292         sR = (sR*m_r) >> (8 - 5);
1293         sG = (sG*m_g) >> (8 - 6);
1294         sB = (sB*m_b) >> (8 - 5);
1295 
1296         /* Now do a normal blend */
1297         int f = 0x100 - (sA + (sA>>7));
1298         uint16_t d = *dst;
1299         int dR = (d>>11)&0x1f;
1300         int dG = (d>>5)&0x3f;
1301         int dB = (d)&0x1f;
1302         sR = (sR + f*dR)>>8;
1303         sG = (sG + f*dG)>>8;
1304         sB = (sB + f*dB)>>8;
1305         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
1306     }
writeandroid::blender_32to16_modulate1307     void write(uint32_t s, uint16_t* dst, ditherer& di) {
1308         // blend source and destination
1309         if (!s) {
1310             di.step();
1311             return;
1312         }
1313         s = GGL_RGBA_TO_HOST(s);
1314 
1315         /* We need to modulate s */
1316         uint32_t  sA = (s >> 24);
1317         uint32_t  sB = (s >> 16) & 0xff;
1318         uint32_t  sG = (s >> 8) & 0xff;
1319         uint32_t  sR = s & 0xff;
1320 
1321         sA = (sA*m_a) >> 8;
1322         /* keep R/G/B scaled to 5.8 or 6.8 fixed float format */
1323         sR = (sR*m_r) >> (8 - 5);
1324         sG = (sG*m_g) >> (8 - 6);
1325         sB = (sB*m_b) >> (8 - 5);
1326 
1327         /* Scale threshold to 0.8 fixed float format */
1328         int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
1329         int f = 0x100 - (sA + (sA>>7));
1330         uint16_t d = *dst;
1331         int dR = (d>>11)&0x1f;
1332         int dG = (d>>5)&0x3f;
1333         int dB = (d)&0x1f;
1334         sR = (sR + f*dR + threshold)>>8;
1335         sG = (sG + f*dG + threshold)>>8;
1336         sB = (sB + f*dB + threshold)>>8;
1337         if (sR > 0x1f) sR = 0x1f;
1338         if (sG > 0x3f) sG = 0x3f;
1339         if (sB > 0x1f) sB = 0x1f;
1340         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
1341     }
1342 };
1343 
1344 /* same as 32to16_modulate, except that the input is xRGB, instead of ARGB */
1345 struct blender_x32to16_modulate : blender_modulate {
blender_x32to16_modulateandroid::blender_x32to16_modulate1346     blender_x32to16_modulate(const context_t* c) {
1347         init(c);
1348     }
writeandroid::blender_x32to16_modulate1349     void write(uint32_t s, uint16_t* dst) {
1350         s = GGL_RGBA_TO_HOST(s);
1351 
1352         uint32_t  sB = (s >> 16) & 0xff;
1353         uint32_t  sG = (s >> 8) & 0xff;
1354         uint32_t  sR = s & 0xff;
1355 
1356         /* Keep R/G/B in 5.8 or 6.8 format */
1357         sR = (sR*m_r) >> (8 - 5);
1358         sG = (sG*m_g) >> (8 - 6);
1359         sB = (sB*m_b) >> (8 - 5);
1360 
1361         int f = 0x100 - m_a;
1362         uint16_t d = *dst;
1363         int dR = (d>>11)&0x1f;
1364         int dG = (d>>5)&0x3f;
1365         int dB = (d)&0x1f;
1366         sR = (sR + f*dR)>>8;
1367         sG = (sG + f*dG)>>8;
1368         sB = (sB + f*dB)>>8;
1369         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
1370     }
writeandroid::blender_x32to16_modulate1371     void write(uint32_t s, uint16_t* dst, ditherer& di) {
1372         s = GGL_RGBA_TO_HOST(s);
1373 
1374         uint32_t  sB = (s >> 16) & 0xff;
1375         uint32_t  sG = (s >> 8) & 0xff;
1376         uint32_t  sR = s & 0xff;
1377 
1378         sR = (sR*m_r) >> (8 - 5);
1379         sG = (sG*m_g) >> (8 - 6);
1380         sB = (sB*m_b) >> (8 - 5);
1381 
1382         /* Now do a normal blend */
1383         int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
1384         int f = 0x100 - m_a;
1385         uint16_t d = *dst;
1386         int dR = (d>>11)&0x1f;
1387         int dG = (d>>5)&0x3f;
1388         int dB = (d)&0x1f;
1389         sR = (sR + f*dR + threshold)>>8;
1390         sG = (sG + f*dG + threshold)>>8;
1391         sB = (sB + f*dB + threshold)>>8;
1392         if (sR > 0x1f) sR = 0x1f;
1393         if (sG > 0x3f) sG = 0x3f;
1394         if (sB > 0x1f) sB = 0x1f;
1395         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
1396     }
1397 };
1398 
1399 /* Same as above, but source is 16bit rgb565 */
1400 struct blender_16to16_modulate : blender_modulate {
blender_16to16_modulateandroid::blender_16to16_modulate1401     blender_16to16_modulate(const context_t* c) {
1402         init(c);
1403     }
writeandroid::blender_16to16_modulate1404     void write(uint16_t s16, uint16_t* dst) {
1405         uint32_t  s = s16;
1406 
1407         uint32_t  sR = s >> 11;
1408         uint32_t  sG = (s >> 5) & 0x3f;
1409         uint32_t  sB = s & 0x1f;
1410 
1411         sR = (sR*m_r);
1412         sG = (sG*m_g);
1413         sB = (sB*m_b);
1414 
1415         int f = 0x100 - m_a;
1416         uint16_t d = *dst;
1417         int dR = (d>>11)&0x1f;
1418         int dG = (d>>5)&0x3f;
1419         int dB = (d)&0x1f;
1420         sR = (sR + f*dR)>>8;
1421         sG = (sG + f*dG)>>8;
1422         sB = (sB + f*dB)>>8;
1423         *dst = uint16_t((sR<<11)|(sG<<5)|sB);
1424     }
1425 };
1426 
1427 /* This is used to iterate over a 16-bit destination color buffer.
1428  * Usage is:
1429  *
1430  *   dst_iterator16  di(context);
1431  *   while (di.count--) {
1432  *       <do stuff with dest pixel at di.dst>
1433  *       di.dst++;
1434  *   }
1435  */
1436 struct dst_iterator16 {
dst_iterator16android::dst_iterator161437     dst_iterator16(const context_t* c) {
1438         const int x = c->iterators.xl;
1439         const int width = c->iterators.xr - x;
1440         const int32_t y = c->iterators.y;
1441         const surface_t* cb = &(c->state.buffers.color);
1442         count = width;
1443         dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
1444     }
1445     int        count;
1446     uint16_t*  dst;
1447 };
1448 
1449 
scanline_t32cb16_clamp(context_t * c)1450 static void scanline_t32cb16_clamp(context_t* c)
1451 {
1452     dst_iterator16  di(c);
1453 
1454     if (is_context_horizontal(c)) {
1455         /* Special case for simple horizontal scaling */
1456         horz_clamp_iterator32 ci(c);
1457         while (di.count--) {
1458             uint32_t s = ci.get_pixel32();
1459             *di.dst++ = convertAbgr8888ToRgb565(s);
1460         }
1461     } else {
1462         /* General case */
1463         clamp_iterator ci(c);
1464         while (di.count--) {
1465             uint32_t s = ci.get_pixel32();
1466             *di.dst++ = convertAbgr8888ToRgb565(s);
1467         }
1468     }
1469 }
1470 
scanline_t32cb16_dither(context_t * c)1471 static void scanline_t32cb16_dither(context_t* c)
1472 {
1473     horz_iterator32 si(c);
1474     dst_iterator16  di(c);
1475     ditherer        dither(c);
1476 
1477     while (di.count--) {
1478         uint32_t s = si.get_pixel32();
1479         *di.dst++ = dither.abgr8888ToRgb565(s);
1480     }
1481 }
1482 
scanline_t32cb16_clamp_dither(context_t * c)1483 static void scanline_t32cb16_clamp_dither(context_t* c)
1484 {
1485     dst_iterator16  di(c);
1486     ditherer        dither(c);
1487 
1488     if (is_context_horizontal(c)) {
1489         /* Special case for simple horizontal scaling */
1490         horz_clamp_iterator32 ci(c);
1491         while (di.count--) {
1492             uint32_t s = ci.get_pixel32();
1493             *di.dst++ = dither.abgr8888ToRgb565(s);
1494         }
1495     } else {
1496         /* General case */
1497         clamp_iterator ci(c);
1498         while (di.count--) {
1499             uint32_t s = ci.get_pixel32();
1500             *di.dst++ = dither.abgr8888ToRgb565(s);
1501         }
1502     }
1503 }
1504 
scanline_t32cb16blend_dither(context_t * c)1505 static void scanline_t32cb16blend_dither(context_t* c)
1506 {
1507     dst_iterator16 di(c);
1508     ditherer       dither(c);
1509     blender_32to16 bl(c);
1510     horz_iterator32  hi(c);
1511     while (di.count--) {
1512         uint32_t s = hi.get_pixel32();
1513         bl.write(s, di.dst, dither);
1514         di.dst++;
1515     }
1516 }
1517 
scanline_t32cb16blend_clamp(context_t * c)1518 static void scanline_t32cb16blend_clamp(context_t* c)
1519 {
1520     dst_iterator16  di(c);
1521     blender_32to16  bl(c);
1522 
1523     if (is_context_horizontal(c)) {
1524         horz_clamp_iterator32 ci(c);
1525         while (di.count--) {
1526             uint32_t s = ci.get_pixel32();
1527             bl.write(s, di.dst);
1528             di.dst++;
1529         }
1530     } else {
1531         clamp_iterator ci(c);
1532         while (di.count--) {
1533             uint32_t s = ci.get_pixel32();
1534             bl.write(s, di.dst);
1535             di.dst++;
1536         }
1537     }
1538 }
1539 
scanline_t32cb16blend_clamp_dither(context_t * c)1540 static void scanline_t32cb16blend_clamp_dither(context_t* c)
1541 {
1542     dst_iterator16 di(c);
1543     ditherer       dither(c);
1544     blender_32to16 bl(c);
1545 
1546     clamp_iterator ci(c);
1547     while (di.count--) {
1548         uint32_t s = ci.get_pixel32();
1549         bl.write(s, di.dst, dither);
1550         di.dst++;
1551     }
1552 }
1553 
scanline_t32cb16blend_clamp_mod(context_t * c)1554 void scanline_t32cb16blend_clamp_mod(context_t* c)
1555 {
1556     dst_iterator16 di(c);
1557     blender_32to16_modulate bl(c);
1558 
1559     clamp_iterator ci(c);
1560     while (di.count--) {
1561         uint32_t s = ci.get_pixel32();
1562         bl.write(s, di.dst);
1563         di.dst++;
1564     }
1565 }
1566 
scanline_t32cb16blend_clamp_mod_dither(context_t * c)1567 void scanline_t32cb16blend_clamp_mod_dither(context_t* c)
1568 {
1569     dst_iterator16 di(c);
1570     blender_32to16_modulate bl(c);
1571     ditherer dither(c);
1572 
1573     clamp_iterator ci(c);
1574     while (di.count--) {
1575         uint32_t s = ci.get_pixel32();
1576         bl.write(s, di.dst, dither);
1577         di.dst++;
1578     }
1579 }
1580 
1581 /* Variant of scanline_t32cb16blend_clamp_mod with a xRGB texture */
scanline_x32cb16blend_clamp_mod(context_t * c)1582 void scanline_x32cb16blend_clamp_mod(context_t* c)
1583 {
1584     dst_iterator16 di(c);
1585     blender_x32to16_modulate  bl(c);
1586 
1587     clamp_iterator ci(c);
1588     while (di.count--) {
1589         uint32_t s = ci.get_pixel32();
1590         bl.write(s, di.dst);
1591         di.dst++;
1592     }
1593 }
1594 
scanline_x32cb16blend_clamp_mod_dither(context_t * c)1595 void scanline_x32cb16blend_clamp_mod_dither(context_t* c)
1596 {
1597     dst_iterator16 di(c);
1598     blender_x32to16_modulate  bl(c);
1599     ditherer dither(c);
1600 
1601     clamp_iterator ci(c);
1602     while (di.count--) {
1603         uint32_t s = ci.get_pixel32();
1604         bl.write(s, di.dst, dither);
1605         di.dst++;
1606     }
1607 }
1608 
scanline_t16cb16_clamp(context_t * c)1609 void scanline_t16cb16_clamp(context_t* c)
1610 {
1611     dst_iterator16  di(c);
1612 
1613     /* Special case for simple horizontal scaling */
1614     if (is_context_horizontal(c)) {
1615         horz_clamp_iterator16 ci(c);
1616         while (di.count--) {
1617             *di.dst++ = ci.get_pixel16();
1618         }
1619     } else {
1620         clamp_iterator ci(c);
1621         while (di.count--) {
1622             *di.dst++ = ci.get_pixel16();
1623         }
1624     }
1625 }
1626 
1627 
1628 
/*
 * Evaluates a linearly interpolated quantity at the center of the pixel in
 * column 0 of row y:
 *
 *     v = v0 + (y + 0.5) * dvdy + (0.5 * dvdx)
 *
 * The two half-steps are folded into a single ((dvdx + dvdy) >> 1) term.
 */
template <typename T, typename U>
static inline __attribute__((const))
T interpolate(int y, T v0, U dvdx, U dvdy) {
    return (v0 + ((dvdx + dvdy) >> 1)) + (dvdy * y);
}
1636 
1637 // ----------------------------------------------------------------------------
1638 #if 0
1639 #pragma mark -
1640 #endif
1641 
init_y(context_t * c,int32_t ys)1642 void init_y(context_t* c, int32_t ys)
1643 {
1644     const uint32_t enables = c->state.enables;
1645 
1646     // compute iterators...
1647     iterators_t& ci = c->iterators;
1648 
1649     // sample in the center
1650     ci.y = ys;
1651 
1652     if (enables & (GGL_ENABLE_DEPTH_TEST|GGL_ENABLE_W|GGL_ENABLE_FOG)) {
1653         ci.ydzdy = interpolate(ys, c->shade.z0, c->shade.dzdx, c->shade.dzdy);
1654         ci.ydwdy = interpolate(ys, c->shade.w0, c->shade.dwdx, c->shade.dwdy);
1655         ci.ydfdy = interpolate(ys, c->shade.f0, c->shade.dfdx, c->shade.dfdy);
1656     }
1657 
1658     if (ggl_unlikely(enables & GGL_ENABLE_SMOOTH)) {
1659         ci.ydrdy = interpolate(ys, c->shade.r0, c->shade.drdx, c->shade.drdy);
1660         ci.ydgdy = interpolate(ys, c->shade.g0, c->shade.dgdx, c->shade.dgdy);
1661         ci.ydbdy = interpolate(ys, c->shade.b0, c->shade.dbdx, c->shade.dbdy);
1662         ci.ydady = interpolate(ys, c->shade.a0, c->shade.dadx, c->shade.dady);
1663         c->step_y = step_y__smooth;
1664     } else {
1665         ci.ydrdy = c->shade.r0;
1666         ci.ydgdy = c->shade.g0;
1667         ci.ydbdy = c->shade.b0;
1668         ci.ydady = c->shade.a0;
1669         // XXX: do only if needed, or make sure this is fast
1670         c->packed = ggl_pack_color(c, c->state.buffers.color.format,
1671                 ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
1672         c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
1673                 ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
1674     }
1675 
1676     // initialize the variables we need in the shader
1677     generated_vars_t& gen = c->generated_vars;
1678     gen.argb[GGLFormat::ALPHA].c  = ci.ydady;
1679     gen.argb[GGLFormat::ALPHA].dx = c->shade.dadx;
1680     gen.argb[GGLFormat::RED  ].c  = ci.ydrdy;
1681     gen.argb[GGLFormat::RED  ].dx = c->shade.drdx;
1682     gen.argb[GGLFormat::GREEN].c  = ci.ydgdy;
1683     gen.argb[GGLFormat::GREEN].dx = c->shade.dgdx;
1684     gen.argb[GGLFormat::BLUE ].c  = ci.ydbdy;
1685     gen.argb[GGLFormat::BLUE ].dx = c->shade.dbdx;
1686     gen.dzdx = c->shade.dzdx;
1687     gen.f    = ci.ydfdy;
1688     gen.dfdx = c->shade.dfdx;
1689 
1690     if (enables & GGL_ENABLE_TMUS) {
1691         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
1692             texture_t& t = c->state.texture[i];
1693             if (!t.enable) continue;
1694 
1695             texture_iterators_t& ti = t.iterators;
1696             if (t.s_coord == GGL_ONE_TO_ONE && t.t_coord == GGL_ONE_TO_ONE) {
1697                 // we need to set all of these to 0 because in some cases
1698                 // step_y__generic() or step_y__tmu() will be used and
1699                 // therefore will update dtdy, however, in 1:1 mode
1700                 // this is always done by the scanline rasterizer.
1701                 ti.dsdx = ti.dsdy = ti.dtdx = ti.dtdy = 0;
1702                 ti.ydsdy = t.shade.is0;
1703                 ti.ydtdy = t.shade.it0;
1704             } else {
1705                 const int adjustSWrap = ((t.s_wrap==GGL_CLAMP)?0:16);
1706                 const int adjustTWrap = ((t.t_wrap==GGL_CLAMP)?0:16);
1707                 ti.sscale = t.shade.sscale + adjustSWrap;
1708                 ti.tscale = t.shade.tscale + adjustTWrap;
1709                 if (!(enables & GGL_ENABLE_W)) {
1710                     // S coordinate
1711                     const int32_t sscale = ti.sscale;
1712                     const int32_t sy = interpolate(ys,
1713                             t.shade.is0, t.shade.idsdx, t.shade.idsdy);
1714                     if (sscale>=0) {
1715                         ti.ydsdy= sy            << sscale;
1716                         ti.dsdx = t.shade.idsdx << sscale;
1717                         ti.dsdy = t.shade.idsdy << sscale;
1718                     } else {
1719                         ti.ydsdy= sy            >> -sscale;
1720                         ti.dsdx = t.shade.idsdx >> -sscale;
1721                         ti.dsdy = t.shade.idsdy >> -sscale;
1722                     }
1723                     // T coordinate
1724                     const int32_t tscale = ti.tscale;
1725                     const int32_t ty = interpolate(ys,
1726                             t.shade.it0, t.shade.idtdx, t.shade.idtdy);
1727                     if (tscale>=0) {
1728                         ti.ydtdy= ty            << tscale;
1729                         ti.dtdx = t.shade.idtdx << tscale;
1730                         ti.dtdy = t.shade.idtdy << tscale;
1731                     } else {
1732                         ti.ydtdy= ty            >> -tscale;
1733                         ti.dtdx = t.shade.idtdx >> -tscale;
1734                         ti.dtdy = t.shade.idtdy >> -tscale;
1735                     }
1736                 }
1737             }
1738             // mirror for generated code...
1739             generated_tex_vars_t& gen = c->generated_vars.texture[i];
1740             gen.width   = t.surface.width;
1741             gen.height  = t.surface.height;
1742             gen.stride  = t.surface.stride;
1743             gen.data    = uintptr_t(t.surface.data);
1744             gen.dsdx = ti.dsdx;
1745             gen.dtdx = ti.dtdx;
1746         }
1747     }
1748 
1749     // choose the y-stepper
1750     c->step_y = step_y__nop;
1751     if (enables & GGL_ENABLE_FOG) {
1752         c->step_y = step_y__generic;
1753     } else if (enables & GGL_ENABLE_TMUS) {
1754         if (enables & GGL_ENABLE_SMOOTH) {
1755             c->step_y = step_y__generic;
1756         } else if (enables & GGL_ENABLE_W) {
1757             c->step_y = step_y__w;
1758         } else {
1759             c->step_y = step_y__tmu;
1760         }
1761     } else {
1762         if (enables & GGL_ENABLE_SMOOTH) {
1763             c->step_y = step_y__smooth;
1764         }
1765     }
1766 
1767     // choose the rectangle blitter
1768     c->rect = rect_generic;
1769     if ((c->step_y == step_y__nop) &&
1770         (c->scanline == scanline_memcpy))
1771     {
1772         c->rect = rect_memcpy;
1773     }
1774 }
1775 
init_y_packed(context_t * c,int32_t y0)1776 void init_y_packed(context_t* c, int32_t y0)
1777 {
1778     uint8_t f = c->state.buffers.color.format;
1779     c->packed = ggl_pack_color(c, f,
1780             c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
1781     c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
1782             c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
1783     c->iterators.y = y0;
1784     c->step_y = step_y__nop;
1785     // choose the rectangle blitter
1786     c->rect = rect_generic;
1787     if (c->scanline == scanline_memcpy) {
1788         c->rect = rect_memcpy;
1789     }
1790 }
1791 
init_y_noop(context_t * c,int32_t y0)1792 void init_y_noop(context_t* c, int32_t y0)
1793 {
1794     c->iterators.y = y0;
1795     c->step_y = step_y__nop;
1796     // choose the rectangle blitter
1797     c->rect = rect_generic;
1798     if (c->scanline == scanline_memcpy) {
1799         c->rect = rect_memcpy;
1800     }
1801 }
1802 
init_y_error(context_t * c,int32_t y0)1803 void init_y_error(context_t* c, int32_t y0)
1804 {
1805     // woooops, shoud never happen,
1806     // fail gracefully (don't display anything)
1807     init_y_noop(c, y0);
1808     ALOGE("color-buffer has an invalid format!");
1809 }
1810 
1811 // ----------------------------------------------------------------------------
1812 #if 0
1813 #pragma mark -
1814 #endif
1815 
step_y__generic(context_t * c)1816 void step_y__generic(context_t* c)
1817 {
1818     const uint32_t enables = c->state.enables;
1819 
1820     // iterate...
1821     iterators_t& ci = c->iterators;
1822     ci.y += 1;
1823 
1824     if (enables & GGL_ENABLE_SMOOTH) {
1825         ci.ydrdy += c->shade.drdy;
1826         ci.ydgdy += c->shade.dgdy;
1827         ci.ydbdy += c->shade.dbdy;
1828         ci.ydady += c->shade.dady;
1829     }
1830 
1831     const uint32_t mask =
1832             GGL_ENABLE_DEPTH_TEST |
1833             GGL_ENABLE_W |
1834             GGL_ENABLE_FOG;
1835     if (enables & mask) {
1836         ci.ydzdy += c->shade.dzdy;
1837         ci.ydwdy += c->shade.dwdy;
1838         ci.ydfdy += c->shade.dfdy;
1839     }
1840 
1841     if ((enables & GGL_ENABLE_TMUS) && (!(enables & GGL_ENABLE_W))) {
1842         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
1843             if (c->state.texture[i].enable) {
1844                 texture_iterators_t& ti = c->state.texture[i].iterators;
1845                 ti.ydsdy += ti.dsdy;
1846                 ti.ydtdy += ti.dtdy;
1847             }
1848         }
1849     }
1850 }
1851 
step_y__nop(context_t * c)1852 void step_y__nop(context_t* c)
1853 {
1854     c->iterators.y += 1;
1855     c->iterators.ydzdy += c->shade.dzdy;
1856 }
1857 
step_y__smooth(context_t * c)1858 void step_y__smooth(context_t* c)
1859 {
1860     iterators_t& ci = c->iterators;
1861     ci.y += 1;
1862     ci.ydrdy += c->shade.drdy;
1863     ci.ydgdy += c->shade.dgdy;
1864     ci.ydbdy += c->shade.dbdy;
1865     ci.ydady += c->shade.dady;
1866     ci.ydzdy += c->shade.dzdy;
1867 }
1868 
step_y__w(context_t * c)1869 void step_y__w(context_t* c)
1870 {
1871     iterators_t& ci = c->iterators;
1872     ci.y += 1;
1873     ci.ydzdy += c->shade.dzdy;
1874     ci.ydwdy += c->shade.dwdy;
1875 }
1876 
step_y__tmu(context_t * c)1877 void step_y__tmu(context_t* c)
1878 {
1879     iterators_t& ci = c->iterators;
1880     ci.y += 1;
1881     ci.ydzdy += c->shade.dzdy;
1882     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
1883         if (c->state.texture[i].enable) {
1884             texture_iterators_t& ti = c->state.texture[i].iterators;
1885             ti.ydsdy += ti.dsdy;
1886             ti.ydtdy += ti.dtdy;
1887         }
1888     }
1889 }
1890 
1891 // ----------------------------------------------------------------------------
1892 #if 0
1893 #pragma mark -
1894 #endif
1895 
scanline_perspective(context_t * c)1896 void scanline_perspective(context_t* c)
1897 {
1898     struct {
1899         union {
1900             struct {
1901                 int32_t s, sq;
1902                 int32_t t, tq;
1903             } sqtq;
1904             struct {
1905                 int32_t v, q;
1906             } st[2];
1907         };
1908     } tc[GGL_TEXTURE_UNIT_COUNT] __attribute__((aligned(16)));
1909 
1910     // XXX: we should have a special case when dwdx = 0
1911 
1912     // 32 pixels spans works okay. 16 is a lot better,
1913     // but hey, it's a software renderer...
1914     const uint32_t SPAN_BITS = 5;
1915     const uint32_t ys = c->iterators.y;
1916     const uint32_t xs = c->iterators.xl;
1917     const uint32_t x1 = c->iterators.xr;
1918 	const uint32_t xc = x1 - xs;
1919     uint32_t remainder = xc & ((1<<SPAN_BITS)-1);
1920     uint32_t numSpans = xc >> SPAN_BITS;
1921 
1922     const iterators_t& ci = c->iterators;
1923     int32_t w0 = (xs * c->shade.dwdx) + ci.ydwdy;
1924     int32_t q0 = gglRecipQ(w0, 30);
1925     const int iwscale = 32 - gglClz(q0);
1926 
1927     const int32_t dwdx = c->shade.dwdx << SPAN_BITS;
1928     int32_t xl = c->iterators.xl;
1929 
1930     // We process s & t with a loop to reduce the code size
1931     // (and i-cache pressure).
1932 
1933     for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
1934         const texture_t& tmu = c->state.texture[i];
1935         if (!tmu.enable) continue;
1936         int32_t s =   tmu.shade.is0 +
1937                      (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
1938                      ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
1939         int32_t t =   tmu.shade.it0 +
1940                      (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
1941                      ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
1942         tc[i].sqtq.s  = s;
1943         tc[i].sqtq.t  = t;
1944         tc[i].sqtq.sq = gglMulx(s, q0, iwscale);
1945         tc[i].sqtq.tq = gglMulx(t, q0, iwscale);
1946     }
1947 
1948     int32_t span = 0;
1949     do {
1950         int32_t w1;
1951         if (ggl_likely(numSpans)) {
1952             w1 = w0 + dwdx;
1953         } else {
1954             if (remainder) {
1955                 // finish off the scanline...
1956                 span = remainder;
1957                 w1 = (c->shade.dwdx * span) + w0;
1958             } else {
1959                 break;
1960             }
1961         }
1962         int32_t q1 = gglRecipQ(w1, 30);
1963         for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
1964             texture_t& tmu = c->state.texture[i];
1965             if (!tmu.enable) continue;
1966             texture_iterators_t& ti = tmu.iterators;
1967 
1968             for (int j=0 ; j<2 ; j++) {
1969                 int32_t v = tc[i].st[j].v;
1970                 if (span)   v += (tmu.shade.st[j].dx)*span;
1971                 else        v += (tmu.shade.st[j].dx)<<SPAN_BITS;
1972                 const int32_t v0 = tc[i].st[j].q;
1973                 const int32_t v1 = gglMulx(v, q1, iwscale);
1974                 int32_t dvdx = v1 - v0;
1975                 if (span)   dvdx /= span;
1976                 else        dvdx >>= SPAN_BITS;
1977                 tc[i].st[j].v = v;
1978                 tc[i].st[j].q = v1;
1979 
1980                 const int scale = ti.st[j].scale + (iwscale - 30);
1981                 if (scale >= 0) {
1982                     ti.st[j].ydvdy = v0   << scale;
1983                     ti.st[j].dvdx  = dvdx << scale;
1984                 } else {
1985                     ti.st[j].ydvdy = v0   >> -scale;
1986                     ti.st[j].dvdx  = dvdx >> -scale;
1987                 }
1988             }
1989             generated_tex_vars_t& gen = c->generated_vars.texture[i];
1990             gen.dsdx = ti.st[0].dvdx;
1991             gen.dtdx = ti.st[1].dvdx;
1992         }
1993         c->iterators.xl = xl;
1994         c->iterators.xr = xl = xl + (span ? span : (1<<SPAN_BITS));
1995         w0 = w1;
1996         q0 = q1;
1997         c->span(c);
1998     } while(numSpans--);
1999 }
2000 
scanline_perspective_single(context_t * c)2001 void scanline_perspective_single(context_t* c)
2002 {
2003     // 32 pixels spans works okay. 16 is a lot better,
2004     // but hey, it's a software renderer...
2005     const uint32_t SPAN_BITS = 5;
2006     const uint32_t ys = c->iterators.y;
2007     const uint32_t xs = c->iterators.xl;
2008     const uint32_t x1 = c->iterators.xr;
2009 	const uint32_t xc = x1 - xs;
2010 
2011     const iterators_t& ci = c->iterators;
2012     int32_t w = (xs * c->shade.dwdx) + ci.ydwdy;
2013     int32_t iw = gglRecipQ(w, 30);
2014     const int iwscale = 32 - gglClz(iw);
2015 
2016     const int i = 31 - gglClz(c->state.enabled_tmu);
2017     generated_tex_vars_t& gen = c->generated_vars.texture[i];
2018     texture_t& tmu = c->state.texture[i];
2019     texture_iterators_t& ti = tmu.iterators;
2020     const int sscale = ti.sscale + (iwscale - 30);
2021     const int tscale = ti.tscale + (iwscale - 30);
2022     int32_t s =   tmu.shade.is0 +
2023                  (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
2024                  ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
2025     int32_t t =   tmu.shade.it0 +
2026                  (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
2027                  ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
2028     int32_t s0 = gglMulx(s, iw, iwscale);
2029     int32_t t0 = gglMulx(t, iw, iwscale);
2030     int32_t xl = c->iterators.xl;
2031 
2032     int32_t sq, tq, dsdx, dtdx;
2033     int32_t premainder = xc & ((1<<SPAN_BITS)-1);
2034     uint32_t numSpans = xc >> SPAN_BITS;
2035     if (c->shade.dwdx == 0) {
2036         // XXX: we could choose to do this if the error is small enough
2037         numSpans = 0;
2038         premainder = xc;
2039         goto no_perspective;
2040     }
2041 
2042     if (premainder) {
2043         w += c->shade.dwdx   * premainder;
2044         iw = gglRecipQ(w, 30);
2045 no_perspective:
2046         s += tmu.shade.idsdx * premainder;
2047         t += tmu.shade.idtdx * premainder;
2048         sq = gglMulx(s, iw, iwscale);
2049         tq = gglMulx(t, iw, iwscale);
2050         dsdx = (sq - s0) / premainder;
2051         dtdx = (tq - t0) / premainder;
2052         c->iterators.xl = xl;
2053         c->iterators.xr = xl = xl + premainder;
2054         goto finish;
2055     }
2056 
2057     while (numSpans--) {
2058         w += c->shade.dwdx   << SPAN_BITS;
2059         s += tmu.shade.idsdx << SPAN_BITS;
2060         t += tmu.shade.idtdx << SPAN_BITS;
2061         iw = gglRecipQ(w, 30);
2062         sq = gglMulx(s, iw, iwscale);
2063         tq = gglMulx(t, iw, iwscale);
2064         dsdx = (sq - s0) >> SPAN_BITS;
2065         dtdx = (tq - t0) >> SPAN_BITS;
2066         c->iterators.xl = xl;
2067         c->iterators.xr = xl = xl + (1<<SPAN_BITS);
2068 finish:
2069         if (sscale >= 0) {
2070             ti.ydsdy = s0   << sscale;
2071             ti.dsdx  = dsdx << sscale;
2072         } else {
2073             ti.ydsdy = s0   >>-sscale;
2074             ti.dsdx  = dsdx >>-sscale;
2075         }
2076         if (tscale >= 0) {
2077             ti.ydtdy = t0   << tscale;
2078             ti.dtdx  = dtdx << tscale;
2079         } else {
2080             ti.ydtdy = t0   >>-tscale;
2081             ti.dtdx  = dtdx >>-tscale;
2082         }
2083         s0 = sq;
2084         t0 = tq;
2085         gen.dsdx = ti.dsdx;
2086         gen.dtdx = ti.dtdx;
2087         c->span(c);
2088     }
2089 }
2090 
2091 // ----------------------------------------------------------------------------
2092 
scanline_col32cb16blend(context_t * c)2093 void scanline_col32cb16blend(context_t* c)
2094 {
2095     int32_t x = c->iterators.xl;
2096     size_t ct = c->iterators.xr - x;
2097     int32_t y = c->iterators.y;
2098     surface_t* cb = &(c->state.buffers.color);
2099     union {
2100         uint16_t* dst;
2101         uint32_t* dst32;
2102     };
2103     dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
2104 
2105 #if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__))
2106 #if defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
2107     scanline_col32cb16blend_neon(dst, &(c->packed8888), ct);
2108 #else  // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
2109     scanline_col32cb16blend_arm(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
2110 #endif // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
2111 #elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__aarch64__))
2112     scanline_col32cb16blend_arm64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
2113 #elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__mips__) && defined(__LP64__)))
2114     scanline_col32cb16blend_mips64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
2115 #else
2116     uint32_t s = GGL_RGBA_TO_HOST(c->packed8888);
2117     int sA = (s>>24);
2118     int f = 0x100 - (sA + (sA>>7));
2119     while (ct--) {
2120         uint16_t d = *dst;
2121         int dR = (d>>11)&0x1f;
2122         int dG = (d>>5)&0x3f;
2123         int dB = (d)&0x1f;
2124         int sR = (s >> (   3))&0x1F;
2125         int sG = (s >> ( 8+2))&0x3F;
2126         int sB = (s >> (16+3))&0x1F;
2127         sR += (f*dR)>>8;
2128         sG += (f*dG)>>8;
2129         sB += (f*dB)>>8;
2130         *dst++ = uint16_t((sR<<11)|(sG<<5)|sB);
2131     }
2132 #endif
2133 
2134 }
2135 
scanline_t32cb16(context_t * c)2136 void scanline_t32cb16(context_t* c)
2137 {
2138     int32_t x = c->iterators.xl;
2139     size_t ct = c->iterators.xr - x;
2140     int32_t y = c->iterators.y;
2141     surface_t* cb = &(c->state.buffers.color);
2142     union {
2143         uint16_t* dst;
2144         uint32_t* dst32;
2145     };
2146     dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
2147 
2148     surface_t* tex = &(c->state.texture[0].surface);
2149     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
2150     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
2151     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
2152     int sR, sG, sB;
2153     uint32_t s, d;
2154 
2155     if (ct==1 || uintptr_t(dst)&2) {
2156 last_one:
2157         s = GGL_RGBA_TO_HOST( *src++ );
2158         *dst++ = convertAbgr8888ToRgb565(s);
2159         ct--;
2160     }
2161 
2162     while (ct >= 2) {
2163 #if BYTE_ORDER == BIG_ENDIAN
2164         s = GGL_RGBA_TO_HOST( *src++ );
2165         d = convertAbgr8888ToRgb565_hi16(s);
2166 
2167         s = GGL_RGBA_TO_HOST( *src++ );
2168         d |= convertAbgr8888ToRgb565(s);
2169 #else
2170         s = GGL_RGBA_TO_HOST( *src++ );
2171         d = convertAbgr8888ToRgb565(s);
2172 
2173         s = GGL_RGBA_TO_HOST( *src++ );
2174         d |= convertAbgr8888ToRgb565(s) << 16;
2175 #endif
2176         *dst32++ = d;
2177         ct -= 2;
2178     }
2179 
2180     if (ct > 0) {
2181         goto last_one;
2182     }
2183 }
2184 
scanline_t32cb16blend(context_t * c)2185 void scanline_t32cb16blend(context_t* c)
2186 {
2187 #if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__arm__) || defined(__aarch64__) || \
2188     (defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__)))))
2189     int32_t x = c->iterators.xl;
2190     size_t ct = c->iterators.xr - x;
2191     int32_t y = c->iterators.y;
2192     surface_t* cb = &(c->state.buffers.color);
2193     uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
2194 
2195     surface_t* tex = &(c->state.texture[0].surface);
2196     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
2197     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
2198     uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
2199 
2200 #ifdef __arm__
2201     scanline_t32cb16blend_arm(dst, src, ct);
2202 #elif defined(__aarch64__)
2203     scanline_t32cb16blend_arm64(dst, src, ct);
2204 #elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
2205     scanline_t32cb16blend_mips(dst, src, ct);
2206 #elif defined(__mips__) && defined(__LP64__)
2207     scanline_t32cb16blend_mips64(dst, src, ct);
2208 #endif
2209 #else
2210     dst_iterator16  di(c);
2211     horz_iterator32  hi(c);
2212     blender_32to16  bl(c);
2213     while (di.count--) {
2214         uint32_t s = hi.get_pixel32();
2215         bl.write(s, di.dst);
2216         di.dst++;
2217     }
2218 #endif
2219 }
2220 
scanline_t32cb16blend_srca(context_t * c)2221 void scanline_t32cb16blend_srca(context_t* c)
2222 {
2223     dst_iterator16  di(c);
2224     horz_iterator32  hi(c);
2225     blender_32to16_srcA  blender(c);
2226 
2227     while (di.count--) {
2228         uint32_t s = hi.get_pixel32();
2229         blender.write(s,di.dst);
2230         di.dst++;
2231     }
2232 }
2233 
scanline_t16cb16blend_clamp_mod(context_t * c)2234 void scanline_t16cb16blend_clamp_mod(context_t* c)
2235 {
2236     const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
2237     if (a == 0) {
2238         return;
2239     }
2240 
2241     if (a == 255) {
2242         scanline_t16cb16_clamp(c);
2243         return;
2244     }
2245 
2246     dst_iterator16  di(c);
2247     blender_16to16_modulate  blender(c);
2248     clamp_iterator  ci(c);
2249 
2250     while (di.count--) {
2251         uint16_t s = ci.get_pixel16();
2252         blender.write(s, di.dst);
2253         di.dst++;
2254     }
2255 }
2256 
scanline_memcpy(context_t * c)2257 void scanline_memcpy(context_t* c)
2258 {
2259     int32_t x = c->iterators.xl;
2260     size_t ct = c->iterators.xr - x;
2261     int32_t y = c->iterators.y;
2262     surface_t* cb = &(c->state.buffers.color);
2263     const GGLFormat* fp = &(c->formats[cb->format]);
2264     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
2265                             (x + (cb->stride * y)) * fp->size;
2266 
2267     surface_t* tex = &(c->state.texture[0].surface);
2268     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
2269     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
2270     uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
2271                             (u + (tex->stride * v)) * fp->size;
2272 
2273     const size_t size = ct * fp->size;
2274     memcpy(dst, src, size);
2275 }
2276 
scanline_memset8(context_t * c)2277 void scanline_memset8(context_t* c)
2278 {
2279     int32_t x = c->iterators.xl;
2280     size_t ct = c->iterators.xr - x;
2281     int32_t y = c->iterators.y;
2282     surface_t* cb = &(c->state.buffers.color);
2283     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) + (x+(cb->stride*y));
2284     uint32_t packed = c->packed;
2285     memset(dst, packed, ct);
2286 }
2287 
scanline_memset16(context_t * c)2288 void scanline_memset16(context_t* c)
2289 {
2290     int32_t x = c->iterators.xl;
2291     size_t ct = c->iterators.xr - x;
2292     int32_t y = c->iterators.y;
2293     surface_t* cb = &(c->state.buffers.color);
2294     uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
2295     uint32_t packed = c->packed;
2296     android_memset16(dst, packed, ct*2);
2297 }
2298 
scanline_memset32(context_t * c)2299 void scanline_memset32(context_t* c)
2300 {
2301     int32_t x = c->iterators.xl;
2302     size_t ct = c->iterators.xr - x;
2303     int32_t y = c->iterators.y;
2304     surface_t* cb = &(c->state.buffers.color);
2305     uint32_t* dst = reinterpret_cast<uint32_t*>(cb->data) + (x+(cb->stride*y));
2306     uint32_t packed = GGL_HOST_TO_RGBA(c->packed);
2307     android_memset32(dst, packed, ct*4);
2308 }
2309 
scanline_clear(context_t * c)2310 void scanline_clear(context_t* c)
2311 {
2312     int32_t x = c->iterators.xl;
2313     size_t ct = c->iterators.xr - x;
2314     int32_t y = c->iterators.y;
2315     surface_t* cb = &(c->state.buffers.color);
2316     const GGLFormat* fp = &(c->formats[cb->format]);
2317     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
2318                             (x + (cb->stride * y)) * fp->size;
2319     const size_t size = ct * fp->size;
2320     memset(dst, 0, size);
2321 }
2322 
scanline_set(context_t * c)2323 void scanline_set(context_t* c)
2324 {
2325     int32_t x = c->iterators.xl;
2326     size_t ct = c->iterators.xr - x;
2327     int32_t y = c->iterators.y;
2328     surface_t* cb = &(c->state.buffers.color);
2329     const GGLFormat* fp = &(c->formats[cb->format]);
2330     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
2331                             (x + (cb->stride * y)) * fp->size;
2332     const size_t size = ct * fp->size;
2333     memset(dst, 0xFF, size);
2334 }
2335 
scanline_noop(context_t *)2336 void scanline_noop(context_t* /*c*/)
2337 {
2338 }
2339 
rect_generic(context_t * c,size_t yc)2340 void rect_generic(context_t* c, size_t yc)
2341 {
2342     do {
2343         c->scanline(c);
2344         c->step_y(c);
2345     } while (--yc);
2346 }
2347 
rect_memcpy(context_t * c,size_t yc)2348 void rect_memcpy(context_t* c, size_t yc)
2349 {
2350     int32_t x = c->iterators.xl;
2351     size_t ct = c->iterators.xr - x;
2352     int32_t y = c->iterators.y;
2353     surface_t* cb = &(c->state.buffers.color);
2354     const GGLFormat* fp = &(c->formats[cb->format]);
2355     uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
2356                             (x + (cb->stride * y)) * fp->size;
2357 
2358     surface_t* tex = &(c->state.texture[0].surface);
2359     const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
2360     const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
2361     uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
2362                             (u + (tex->stride * v)) * fp->size;
2363 
2364     if (cb->stride == tex->stride && ct == size_t(cb->stride)) {
2365         memcpy(dst, src, ct * fp->size * yc);
2366     } else {
2367         const size_t size = ct * fp->size;
2368         const size_t dbpr = cb->stride  * fp->size;
2369         const size_t sbpr = tex->stride * fp->size;
2370         do {
2371             memcpy(dst, src, size);
2372             dst += dbpr;
2373             src += sbpr;
2374         } while (--yc);
2375     }
2376 }
2377 // ----------------------------------------------------------------------------
2378 }; // namespace android
2379 
2380