1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <sys/mman.h>
18 #include <unistd.h>
19 
20 #include "rsCpuIntrinsic.h"
21 #include "rsCpuIntrinsicInlines.h"
22 #include "linkloader/include/MemChunk.h"
23 #include "linkloader/utils/flush_cpu_cache.h"
24 
25 #include <sys/mman.h>
26 #include <stddef.h>
27 #include <stdint.h>
28 #include <stdlib.h>
29 //#include <utils/StopWatch.h>
30 
31 
32 /*  uint kernel
33  *  Q0  D0:  Load slot for R
34  *      D1:  Load slot for G
35  *  Q1  D2:  Load slot for B
36  *      D3:  Load slot for A
37  *  Q2  D4:  Matrix
38  *      D5:  =
39  *  Q3  D6:  =
40  *      D7:  =
41  *  Q4  D8:  Add R
42  *      D9:
43  *  Q5  D10: Add G
44  *      D11:
45  *  Q6  D12: Add B
46  *      D13:
47  *  Q7  D14: Add A
48  *      D15:
49  *  Q8  D16:  I32: R Sum
50  *      D17:
51  *  Q9  D18:  I32: G Sum
52  *      D19:
53  *  Q10 D20:  I32: B Sum
54  *      D21:
55  *  Q11 D22:  I32: A Sum
56  *      D23:
57  *  Q12 D24:  U16: expanded R
58  *      D25:
59  *  Q13 D26:  U16: expanded G
60  *      D27:
61  *  Q14 D28:  U16: expanded B
62  *      D29:
63  *  Q15 D30:  U16: expanded A
64  *      D31:
65  *
66  */
67 
68 /*  float kernel
69  *  Q0  D0:  Load slot for R
70  *      D1:  =
71  *  Q1  D2:  Load slot for G
72  *      D3:  =
73  *  Q2  D4:  Load slot for B
74  *      D5:  =
75  *  Q3  D6:  Load slot for A
76  *      D7:  =
77  *  Q4  D8:  Matrix
78  *      D9:  =
79  *  Q5  D10: =
80  *      D11: =
81  *  Q6  D12: =
82  *      D13: =
83  *  Q7  D14: =
84  *      D15: =
85  *  Q8  D16: Add R
86  *      D17: =
87  *  Q9  D18: Add G
88  *      D19: =
89  *  Q10 D20: Add B
90  *      D21: =
91  *  Q11 D22: Add A
92  *      D23: =
93  *  Q12 D24: Sum R
94  *      D25: =
95  *  Q13 D26: Sum G
96  *      D27: =
97  *  Q14 D28: Sum B
98  *      D29: =
99  *  Q15 D30: Sum A
100  *      D31: =
101  *
102  */
103 
104 
105 
106 using namespace android;
107 using namespace android::renderscript;
108 
109 namespace android {
110 namespace renderscript {
111 
// Identifies one specialisation of the color-matrix kernel.
//
// `key` overlays the bit-fields in `u`, so two configurations can be compared
// (and the whole key cleared) through a single 64-bit value.
typedef union {
    uint64_t key;
    struct {
        // Bits 0-1: input vector size code (computeKey stores size - 1).
        uint32_t inVecSize : 2;
        // Bits 2-3: output vector size code (computeKey stores size - 1).
        uint32_t outVecSize : 2;
        // Bits 4-7: RS_TYPE_FLOAT_32 when the input is float, otherwise 0.
        uint32_t inType : 4;
        // Bits 8-11: RS_TYPE_FLOAT_32 when the output is float, otherwise 0.
        uint32_t outType : 4;
        // Bit 12: the r, g and b matrix columns are identical (dot product).
        uint32_t dot : 1;
        // Bit 13: padding.
        uint32_t _unused1 : 1;
        // Bit 14: alpha passes through unchanged.
        uint32_t copyAlpha : 1;
        // Bit 15: padding.
        uint32_t _unused2 : 1;
        // Bits 16-31: one bit per non-zero matrix coefficient.
        uint32_t coeffMask : 16;
        // Bits 32-35: one bit per non-zero add term.
        uint32_t addMask : 4;
    } u;
} Key_t;
127 
128 //Re-enable when intrinsic is fixed
129 #if defined(ARCH_ARM64_USE_INTRINSICS)
// Table of code entry points handed to the assembly kernels declared below.
// The field order is part of the contract with that assembly; keep it as is.
typedef struct {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;

// Integer kernel: takes `count` elements from `in` to `out`, using the
// fragments in `fns` with 16-bit multipliers and 32-bit add terms.
extern "C" void rsdIntrinsicColorMatrix_int_K(
        void *out, const void *in, size_t count,
        const FunctionTab_t *fns,
        const int16_t *mult, const int32_t *add);

// Float kernel: same shape as the integer kernel but with float multipliers
// and add terms.
extern "C" void rsdIntrinsicColorMatrix_float_K(
        void *out, const void *in, size_t count,
        const FunctionTab_t *fns,
        const float *mult, const float *add);

/* The two setup routines populate a FunctionTab_t for the kernels above.
 * They also short-circuit empty functions so that a jump never lands on
 * another jump.  Set-up is not performance critical; it lives in assembly
 * only because that is simpler than exposing the same symbols to C.
 */
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
        FunctionTab_t *fns,
        uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
        FunctionTab_t *fns,
        uint32_t mask, int dt, int st);
161 #endif
162 
163 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
164 public:
165     virtual void populateScript(Script *);
166 
167     virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
168 
169     virtual ~RsdCpuScriptIntrinsicColorMatrix();
170     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
171 
172     virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
173                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);
174     virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
175                             const void * usr, uint32_t usrLen, const RsScriptCall *sc);
176 
177 protected:
178     float fp[16];
179     float fpa[4];
180 
181     // The following four fields are read as constants
182     // by the SIMD assembly code.
183     short ip[16];
184     int ipa[4];
185     float tmpFp[16];
186     float tmpFpa[4];
187 #if defined(ARCH_ARM64_USE_INTRINSICS)
188     FunctionTab_t mFnTab;
189 #endif
190 
191     static void kernel(const RsForEachStubParamStruct *p,
192                        uint32_t xstart, uint32_t xend,
193                        uint32_t instep, uint32_t outstep);
194     void updateCoeffCache(float fpMul, float addMul);
195 
196     Key_t mLastKey;
197     unsigned char *mBuf;
198     size_t mBufSize;
199 
200     Key_t computeKey(const Element *ein, const Element *eout);
201 
202     bool build(Key_t key);
203 
204     void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
205 
206 };
207 
208 }
209 }
210 
211 
computeKey(const Element * ein,const Element * eout)212 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
213         const Element *ein, const Element *eout) {
214 
215     Key_t key;
216     key.key = 0;
217 
218     // Compute a unique code key for this operation
219 
220     // Add to the key the input and output types
221     bool hasFloat = false;
222     if (ein->getType() == RS_TYPE_FLOAT_32) {
223         hasFloat = true;
224         key.u.inType = RS_TYPE_FLOAT_32;
225         rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
226     }
227     if (eout->getType() == RS_TYPE_FLOAT_32) {
228         hasFloat = true;
229         key.u.outType = RS_TYPE_FLOAT_32;
230         rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
231     }
232 
233     // Mask in the bits indicating which coefficients in the
234     // color matrix are needed.
235     if (hasFloat) {
236         for (uint32_t i=0; i < 16; i++) {
237             if (fabs(fp[i]) != 0.f) {
238                 key.u.coeffMask |= 1 << i;
239             }
240         }
241         if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
242         if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
243         if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
244         if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
245 
246     } else {
247         for (uint32_t i=0; i < 16; i++) {
248             if (ip[i] != 0) {
249                 key.u.coeffMask |= 1 << i;
250             }
251         }
252         if (ipa[0] != 0) key.u.addMask |= 0x1;
253         if (ipa[1] != 0) key.u.addMask |= 0x2;
254         if (ipa[2] != 0) key.u.addMask |= 0x4;
255         if (ipa[3] != 0) key.u.addMask |= 0x8;
256     }
257 
258     // Look for a dot product where the r,g,b colums are the same
259     if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
260         (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
261         (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
262         (ip[12] == ip[13]) && (ip[12] == ip[14])) {
263 
264         if (!key.u.addMask) key.u.dot = 1;
265     }
266 
267     // Is alpha a simple copy
268     if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
269         key.u.copyAlpha = !(key.u.inType || key.u.outType);
270     }
271 
272     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
273 
274     switch (ein->getVectorSize()) {
275     case 4:
276         key.u.inVecSize = 3;
277         break;
278     case 3:
279         key.u.inVecSize = 2;
280         key.u.coeffMask &= ~0xF000;
281         break;
282     case 2:
283         key.u.inVecSize = 1;
284         key.u.coeffMask &= ~0xFF00;
285         break;
286     default:
287         key.u.coeffMask &= ~0xFFF0;
288         break;
289     }
290 
291     switch (eout->getVectorSize()) {
292     case 4:
293         key.u.outVecSize = 3;
294         break;
295     case 3:
296         key.u.outVecSize = 2;
297         key.u.coeffMask &= ~0x8888;
298         key.u.addMask &= 7;
299         break;
300     case 2:
301         key.u.outVecSize = 1;
302         key.u.coeffMask &= ~0xCCCC;
303         key.u.addMask &= 3;
304         break;
305     default:
306         key.u.coeffMask &= ~0xEEEE;
307         key.u.addMask &= 1;
308         break;
309     }
310 
311     if (key.u.inType && !key.u.outType) {
312         key.u.addMask |= 1;
313         if (key.u.outVecSize > 0) key.u.addMask |= 2;
314         if (key.u.outVecSize > 1) key.u.addMask |= 4;
315         if (key.u.outVecSize > 2) key.u.addMask |= 8;
316     }
317 
318     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
319     return key;
320 }
321 
322 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
323 
// Declares the three external symbols that describe assembly chunk `x`:
// the chunk itself, a companion `_end` symbol (presumably marking where the
// chunk stops) and a `_len` word that ADD_CHUNK reads as the byte count.
#define DEF_SYM(x)                                      \
    extern "C" {                                        \
        extern uint32_t _N_ColorMatrix_##x;             \
        extern uint32_t _N_ColorMatrix_##x##_end;       \
        extern uint32_t _N_ColorMatrix_##x##_len;       \
    }

// Function prologue (integer / float) and the two epilogue pieces.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads: u8, u8 widened to float, and f32, for 4/3/2/1 channels.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores: u8, f32, and float narrowed to u8.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-path unpack / pack helpers, the dot-product tail and add chunks.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
371 
// Copies assembly chunk `x` to the cursor `buf` (a uint8_t * local of the
// caller) and advances the cursor by the chunk's length in bytes.
// The body is wrapped in do { } while (0) so the macro expands to a single
// statement; without it, the cursor update would escape an unbraced
// if/else and an `else` after the macro would not compile.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
375 
376 
377 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
378     size_t off = (target - buf - 8) >> 2;
379     rsAssert(((off & 0xff000000) == 0) ||
380            ((off & 0xff000000) == 0xff000000));
381 
382     uint32_t op = (condition << 28);
383     op |= 0xa << 24;  // branch
384     op |= 0xffffff & off;
385     ((uint32_t *)buf)[0] = op;
386     return buf + 4;
387 }
388 
encodeSIMDRegs(uint32_t vd,uint32_t vn,uint32_t vm)389 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
390     rsAssert(vd < 32);
391     rsAssert(vm < 32);
392     rsAssert(vn < 32);
393 
394     uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
395     op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
396     op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
397     return op;
398 }
399 
addVMLAL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)400 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
401     //vmlal.s16 Q#1, D#1, D#2[#]
402     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
403     ((uint32_t *)buf)[0] = op;
404     return buf + 4;
405 }
406 
addVMULL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)407 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
408     //vmull.s16 Q#1, D#1, D#2[#]
409     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
410     ((uint32_t *)buf)[0] = op;
411     return buf + 4;
412 }
413 
addVQADD_S32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)414 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
415     //vqadd.s32 Q#1, Q#1, Q#2
416     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
417     ((uint32_t *)buf)[0] = op;
418     return buf + 4;
419 }
420 
addVMLAL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)421 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
422     //vmlal.f32 Q#1, D#1, D#2[#]
423     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
424     ((uint32_t *)buf)[0] = op;
425     return buf + 4;
426 }
427 
addVMULL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)428 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
429     //vmull.f32 Q#1, D#1, D#2[#]
430     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
431     ((uint32_t *)buf)[0] = op;
432     return buf + 4;
433 }
434 
addVORR_32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)435 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
436     //vadd.f32 Q#1, D#1, D#2
437     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
438     ((uint32_t *)buf)[0] = op;
439     return buf + 4;
440 }
441 
addVMOV_32(uint8_t * buf,uint32_t dest_q,uint32_t imm)442 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
443     //vmov.32 Q#1, #imm
444     rsAssert(imm == 0);
445     uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
446     ((uint32_t *)buf)[0] = op;
447     return buf + 4;
448 }
449 
addVADD_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)450 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
451     //vadd.f32 Q#1, D#1, D#2
452     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
453     ((uint32_t *)buf)[0] = op;
454     return buf + 4;
455 }
456 #endif
457 
458 #if defined(ARCH_X86_HAVE_SSSE3)
// Externally implemented SSSE3 kernels, one per matrix shape.  All three
// share the signature of mOptKernel: destination, source, the 16-bit
// coefficient table and a count.
extern "C" void rsdIntrinsicColorMatrixDot_K(
        void *dst, const void *src, const short *coef, uint32_t count);
extern "C" void rsdIntrinsicColorMatrix3x3_K(
        void *dst, const void *src, const short *coef, uint32_t count);
extern "C" void rsdIntrinsicColorMatrix4x4_K(
        void *dst, const void *src, const short *coef, uint32_t count);
465 
selectKernel(Key_t key)466 void * selectKernel(Key_t key)
467 {
468     void * kernel = NULL;
469 
470     // inType, outType float if nonzero
471     if (!(key.u.inType || key.u.outType)) {
472         if (key.u.dot)
473             kernel = (void *)rsdIntrinsicColorMatrixDot_K;
474         else if (key.u.copyAlpha)
475             kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
476         else
477             kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
478     }
479 
480     return kernel;
481 }
482 #endif
483 
build(Key_t key)484 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
485 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
486     mBufSize = 4096;
487     //StopWatch build_time("rs cm: build time");
488     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
489                                   MAP_PRIVATE | MAP_ANON, -1, 0);
490     if (mBuf == MAP_FAILED) {
491         mBuf = NULL;
492         return false;
493     }
494 
495     uint8_t *buf = mBuf;
496     uint8_t *buf2 = NULL;
497 
498     int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
499     int opInit[4] = {0, 0, 0, 0};
500 
501     memset(ops, 0, sizeof(ops));
502     for (int i=0; i < 4; i++) {
503         if (key.u.coeffMask & (1 << (i*4))) {
504             ops[i][0] = 0x2 | opInit[0];
505             opInit[0] = 1;
506         }
507         if (!key.u.dot) {
508             if (key.u.coeffMask & (1 << (1 + i*4))) {
509                 ops[i][1] = 0x2 | opInit[1];
510                 opInit[1] = 1;
511             }
512             if (key.u.coeffMask & (1 << (2 + i*4))) {
513                 ops[i][2] = 0x2 | opInit[2];
514                 opInit[2] = 1;
515             }
516         }
517         if (!key.u.copyAlpha) {
518             if (key.u.coeffMask & (1 << (3 + i*4))) {
519                 ops[i][3] = 0x2 | opInit[3];
520                 opInit[3] = 1;
521             }
522         }
523     }
524 
525     if (key.u.inType || key.u.outType) {
526         key.u.copyAlpha = 0;
527         ADD_CHUNK(prefix_f);
528         buf2 = buf;
529 
530         // Load the incoming r,g,b,a as needed
531         if (key.u.inType) {
532             switch(key.u.inVecSize) {
533             case 3:
534                 ADD_CHUNK(load_f32_4);
535                 break;
536             case 2:
537                 ADD_CHUNK(load_f32_3);
538                 break;
539             case 1:
540                 ADD_CHUNK(load_f32_2);
541                 break;
542             case 0:
543                 ADD_CHUNK(load_f32_1);
544                 break;
545             }
546         } else {
547             switch(key.u.inVecSize) {
548             case 3:
549                 ADD_CHUNK(load_u8f_4);
550                 break;
551             case 2:
552                 ADD_CHUNK(load_u8f_3);
553                 break;
554             case 1:
555                 ADD_CHUNK(load_u8f_2);
556                 break;
557             case 0:
558                 ADD_CHUNK(load_u8f_1);
559                 break;
560             }
561         }
562 
563         for (int i=0; i < 4; i++) {
564             for (int j=0; j < 4; j++) {
565                 switch(ops[i][j]) {
566                 case 0:
567                     break;
568                 case 2:
569                     buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
570                     break;
571                 case 3:
572                     buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
573                     break;
574                 }
575             }
576         }
577         for (int j=0; j < 4; j++) {
578             if (opInit[j]) {
579                 if (key.u.addMask & (1 << j)) {
580                     buf = addVADD_F32(buf, j, 12+j, 8+j);
581                 } else {
582                     buf = addVORR_32(buf, j, 12+j, 12+j);
583                 }
584             } else {
585                 if (key.u.addMask & (1 << j)) {
586                     buf = addVORR_32(buf, j, 8+j, 8+j);
587                 } else {
588                     buf = addVMOV_32(buf, j, 0);
589                 }
590             }
591         }
592 
593         if (key.u.outType) {
594             switch(key.u.outVecSize) {
595             case 3:
596                 ADD_CHUNK(store_f32_4);
597                 break;
598             case 2:
599                 ADD_CHUNK(store_f32_3);
600                 break;
601             case 1:
602                 ADD_CHUNK(store_f32_2);
603                 break;
604             case 0:
605                 ADD_CHUNK(store_f32_1);
606                 break;
607             }
608         } else {
609             switch(key.u.outVecSize) {
610             case 3:
611             case 2:
612                 ADD_CHUNK(store_f32u_4);
613                 break;
614             case 1:
615                 ADD_CHUNK(store_f32u_2);
616                 break;
617             case 0:
618                 ADD_CHUNK(store_f32u_1);
619                 break;
620             }
621         }
622 
623 
624     } else {
625         // Add the function prefix
626         // Store the address for the loop return
627         ADD_CHUNK(prefix_i);
628         buf2 = buf;
629 
630         // Load the incoming r,g,b,a as needed
631         switch(key.u.inVecSize) {
632         case 3:
633             ADD_CHUNK(load_u8_4);
634             if (key.u.copyAlpha) {
635                 ADD_CHUNK(unpack_u8_3);
636             } else {
637                 ADD_CHUNK(unpack_u8_4);
638             }
639             break;
640         case 2:
641             ADD_CHUNK(load_u8_3);
642             ADD_CHUNK(unpack_u8_3);
643             break;
644         case 1:
645             ADD_CHUNK(load_u8_2);
646             ADD_CHUNK(unpack_u8_2);
647             break;
648         case 0:
649             ADD_CHUNK(load_u8_1);
650             ADD_CHUNK(unpack_u8_1);
651             break;
652         }
653 
654         // Add multiply and accumulate
655         // use MULL to init the output register,
656         // use MLAL from there
657         for (int i=0; i < 4; i++) {
658             for (int j=0; j < 4; j++) {
659                 switch(ops[i][j]) {
660                 case 0:
661                     break;
662                 case 2:
663                     buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
664                     break;
665                 case 3:
666                     buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
667                     break;
668                 }
669             }
670         }
671         for (int j=0; j < 4; j++) {
672             if (opInit[j]) {
673                 if (key.u.addMask & (1 << j)) {
674                     buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
675                 }
676             } else {
677                 if (key.u.addMask & (1 << j)) {
678                     buf = addVORR_32(buf, 8+j, 4+j, 4+j);
679                 }
680             }
681         }
682 
683         // If we have a dot product, perform the special pack.
684         if (key.u.dot) {
685             ADD_CHUNK(pack_u8_1);
686             ADD_CHUNK(dot);
687         } else {
688             switch(key.u.outVecSize) {
689             case 3:
690                 if (key.u.copyAlpha) {
691                     ADD_CHUNK(pack_u8_3);
692                 } else {
693                     ADD_CHUNK(pack_u8_4);
694                 }
695                 break;
696             case 2:
697                 ADD_CHUNK(pack_u8_3);
698                 break;
699             case 1:
700                 ADD_CHUNK(pack_u8_2);
701                 break;
702             case 0:
703                 ADD_CHUNK(pack_u8_1);
704                 break;
705             }
706         }
707 
708         // Write out result
709         switch(key.u.outVecSize) {
710         case 3:
711         case 2:
712             ADD_CHUNK(store_u8_4);
713             break;
714         case 1:
715             ADD_CHUNK(store_u8_2);
716             break;
717         case 0:
718             ADD_CHUNK(store_u8_1);
719             break;
720         }
721     }
722 
723     if (key.u.inType != key.u.outType) {
724         key.u.copyAlpha = 0;
725         key.u.dot = 0;
726     }
727 
728     // Loop, branch, and cleanup
729     ADD_CHUNK(postfix1);
730     buf = addBranch(buf, buf2, 0x01);
731     ADD_CHUNK(postfix2);
732 
733     int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
734     if (ret == -1) {
735         ALOGE("mprotect error %i", ret);
736         return false;
737     }
738 
739     FLUSH_CPU_CACHE(mBuf, (char*) mBuf + mBufSize);
740     return true;
741 #else
742     return false;
743 #endif
744 }
745 
updateCoeffCache(float fpMul,float addMul)746 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
747     for(int ct=0; ct < 16; ct++) {
748         ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
749         tmpFp[ct] = fp[ct] * fpMul;
750         //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
751     }
752 
753     float add = 0.f;
754     if (fpMul > 254.f) add = 0.5f;
755     for(int ct=0; ct < 4; ct++) {
756         tmpFpa[ct] = fpa[ct] * addMul + add;
757         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
758     }
759 
760     for(int ct=0; ct < 4; ct++) {
761         ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
762     }
763 }
764 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)765 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
766                                                     size_t dataLength) {
767     switch(slot) {
768     case 0:
769         memcpy (fp, data, sizeof(fp));
770         break;
771     case 1:
772         memcpy (fpa, data, sizeof(fpa));
773         break;
774     default:
775         rsAssert(0);
776         break;
777     }
778     mRootPtr = &kernel;
779 }
780 
781 
One(const RsForEachStubParamStruct * p,void * out,const void * py,const float * coeff,const float * add,uint32_t vsin,uint32_t vsout,bool fin,bool fout)782 static void One(const RsForEachStubParamStruct *p, void *out,
783                 const void *py, const float* coeff, const float *add,
784                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
785 
786     float4 f = 0.f;
787     if (fin) {
788         switch(vsin) {
789         case 3:
790             f = ((const float4 *)py)[0];
791             break;
792         case 2:
793             f = ((const float4 *)py)[0];
794             f.w = 0.f;
795             break;
796         case 1:
797             f.xy = ((const float2 *)py)[0];
798             break;
799         case 0:
800             f.x = ((const float *)py)[0];
801             break;
802         }
803     } else {
804         switch(vsin) {
805         case 3:
806             f = convert_float4(((const uchar4 *)py)[0]);
807             break;
808         case 2:
809             f = convert_float4(((const uchar4 *)py)[0]);
810             f.w = 0.f;
811             break;
812         case 1:
813             f.xy = convert_float2(((const uchar2 *)py)[0]);
814             break;
815         case 0:
816             f.x = (float)(((const uchar *)py)[0]);
817             break;
818         }
819     }
820     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
821 
822     float4 sum;
823     sum.x = f.x * coeff[0] +
824             f.y * coeff[4] +
825             f.z * coeff[8] +
826             f.w * coeff[12];
827     sum.y = f.x * coeff[1] +
828             f.y * coeff[5] +
829             f.z * coeff[9] +
830             f.w * coeff[13];
831     sum.z = f.x * coeff[2] +
832             f.y * coeff[6] +
833             f.z * coeff[10] +
834             f.w * coeff[14];
835     sum.w = f.x * coeff[3] +
836             f.y * coeff[7] +
837             f.z * coeff[11] +
838             f.w * coeff[15];
839     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
840 
841     sum.x += add[0];
842     sum.y += add[1];
843     sum.z += add[2];
844     sum.w += add[3];
845 
846 
847     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
848     if (fout) {
849         switch(vsout) {
850         case 3:
851         case 2:
852             ((float4 *)out)[0] = sum;
853             break;
854         case 1:
855             ((float2 *)out)[0] = sum.xy;
856             break;
857         case 0:
858             ((float *)out)[0] = sum.x;
859             break;
860         }
861     } else {
862         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
863         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
864         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
865         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
866 
867         switch(vsout) {
868         case 3:
869         case 2:
870             ((uchar4 *)out)[0] = convert_uchar4(sum);
871             break;
872         case 1:
873             ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
874             break;
875         case 0:
876             ((uchar *)out)[0] = sum.x;
877             break;
878         }
879     }
880     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
881 }
882 
kernel(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)883 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
884                                               uint32_t xstart, uint32_t xend,
885                                               uint32_t instep, uint32_t outstep) {
886     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
887     uchar *out = (uchar *)p->out;
888     uchar *in = (uchar *)p->in;
889     uint32_t x1 = xstart;
890     uint32_t x2 = xend;
891 
892     uint32_t vsin = cp->mLastKey.u.inVecSize;
893     uint32_t vsout = cp->mLastKey.u.outVecSize;
894     bool floatIn = !!cp->mLastKey.u.inType;
895     bool floatOut = !!cp->mLastKey.u.outType;
896 
897     //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
898 
899     if(x2 > x1) {
900         int32_t len = x2 - x1;
901         if (gArchUseSIMD) {
902             if((cp->mOptKernel != NULL) && (len >= 4)) {
903                 // The optimized kernel processes 4 pixels at once
904                 // and requires a minimum of 1 chunk of 4
905                 cp->mOptKernel(out, in, cp->ip, len >> 2);
906                 // Update the len and pointers so the generic code can
907                 // finish any leftover pixels
908                 len &= ~3;
909                 x1 += len;
910                 out += outstep * len;
911                 in += instep * len;
912             }
913 #if defined(ARCH_ARM64_USE_INTRINSICS)
914             else {
915                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
916                     // Currently this generates off by one errors.
917                     //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
918                     //x1 += len;
919                     //out += outstep * len;
920                     //in += instep * len;
921                 } else {
922                     rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
923                     x1 += len;
924                     out += outstep * len;
925                     in += instep * len;
926                 }
927             }
928 #endif
929         }
930 
931         while(x1 != x2) {
932             One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
933             out += outstep;
934             in += instep;
935             x1++;
936         }
937     }
938 }
939 
preLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)940 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
941         uint32_t slot, const Allocation * ain, Allocation * aout,
942         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
943 
944     const Element *ein = ain->mHal.state.type->getElement();
945     const Element *eout = aout->mHal.state.type->getElement();
946 
947     if (ein->getType() == eout->getType()) {
948         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
949             updateCoeffCache(1.f, 255.f);
950         } else {
951             updateCoeffCache(1.f, 1.f);
952         }
953     } else {
954         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
955             updateCoeffCache(255.f, 255.f);
956         } else {
957             updateCoeffCache(1.f / 255.f, 1.f);
958         }
959     }
960 
961     Key_t key = computeKey(ain->mHal.state.type->getElement(),
962                            aout->mHal.state.type->getElement());
963 #if defined(ARCH_X86_HAVE_SSSE3)
964     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
965         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
966         // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
967         mLastKey = key;
968     }
969 
970 #else //if !defined(ARCH_X86_HAVE_SSSE3)
971     if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
972         if (mBuf) munmap(mBuf, mBufSize);
973         mBuf = NULL;
974         mOptKernel = NULL;
975         if (build(key)) {
976             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
977         }
978 #if defined(ARCH_ARM64_USE_INTRINSICS)
979         else {
980             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
981             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
982             uint32_t mm = 0;
983             int i;
984             for (i = 0; i < 4; i++)
985             {
986                 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
987                 m = ((m * 0x249) >> 9) & 15;
988                 m |= ((key.u.addMask >> i) & 1) << 4;
989                 mm |= m << (i * 5);
990             }
991 
992             if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
993                 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
994             } else {
995                 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
996             }
997         }
998 #endif
999         mLastKey = key;
1000     }
1001 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
1002 }
1003 
postLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)1004 void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
1005         uint32_t slot, const Allocation * ain, Allocation * aout,
1006         const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
1007 
1008 }
1009 
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1010 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1011             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1012             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1013 
1014     mLastKey.key = 0;
1015     mBuf = NULL;
1016     mBufSize = 0;
1017     mOptKernel = NULL;
1018     const static float defaultMatrix[] = {
1019         1.f, 0.f, 0.f, 0.f,
1020         0.f, 1.f, 0.f, 0.f,
1021         0.f, 0.f, 1.f, 0.f,
1022         0.f, 0.f, 0.f, 1.f
1023     };
1024     const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1025     setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
1026     setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1027 }
1028 
~RsdCpuScriptIntrinsicColorMatrix()1029 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
1030     if (mBuf) munmap(mBuf, mBufSize);
1031     mBuf = NULL;
1032     mOptKernel = NULL;
1033 }
1034 
populateScript(Script * s)1035 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
1036     s->mHal.info.exportedVariableCount = 2;
1037 }
1038 
rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1039 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1040                                             const Script *s, const Element *e) {
1041 
1042     return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1043 }
1044