1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"

//#include <utils/StopWatch.h>
28 
29 
30 /*  uint kernel
31  *  Q0  D0:  Load slot for R
32  *      D1:  Load slot for G
33  *  Q1  D2:  Load slot for B
34  *      D3:  Load slot for A
35  *  Q2  D4:  Matrix
36  *      D5:  =
37  *  Q3  D6:  =
38  *      D7:  =
39  *  Q4  D8:  Add R
40  *      D9:
41  *  Q5  D10: Add G
42  *      D11:
43  *  Q6  D12: Add B
44  *      D13:
45  *  Q7  D14: Add A
46  *      D15:
47  *  Q8  D16:  I32: R Sum
48  *      D17:
49  *  Q9  D18:  I32: G Sum
50  *      D19:
51  *  Q10 D20:  I32: B Sum
52  *      D21:
53  *  Q11 D22:  I32: A Sum
54  *      D23:
55  *  Q12 D24:  U16: expanded R
56  *      D25:
57  *  Q13 D26:  U16: expanded G
58  *      D27:
59  *  Q14 D28:  U16: expanded B
60  *      D29:
61  *  Q15 D30:  U16: expanded A
62  *      D31:
63  *
64  */
65 
66 /*  float kernel
67  *  Q0  D0:  Load slot for R
68  *      D1:  =
69  *  Q1  D2:  Load slot for G
70  *      D3:  =
71  *  Q2  D4:  Load slot for B
72  *      D5:  =
73  *  Q3  D6:  Load slot for A
74  *      D7:  =
75  *  Q4  D8:  Matrix
76  *      D9:  =
77  *  Q5  D10: =
78  *      D11: =
79  *  Q6  D12: =
80  *      D13: =
81  *  Q7  D14: =
82  *      D15: =
83  *  Q8  D16: Add R
84  *      D17: =
85  *  Q9  D18: Add G
86  *      D19: =
87  *  Q10 D20: Add B
88  *      D21: =
89  *  Q11 D22: Add A
90  *      D23: =
91  *  Q12 D24: Sum R
92  *      D25: =
93  *  Q13 D26: Sum G
94  *      D27: =
95  *  Q14 D28: Sum B
96  *      D29: =
97  *  Q15 D30: Sum A
98  *      D31: =
99  *
100  */
101 
102 
103 
104 using namespace android;
105 using namespace android::renderscript;
106 
107 namespace android {
108 namespace renderscript {
109 
/**
 * Compact description of one color-matrix configuration.
 *
 * All bit-fields in `u` overlay the single 64-bit `key`, so two
 * configurations can be compared with one integer comparison
 * (see the `mLastKey.key != key.key` checks in preLaunch).
 */
union Key_t {
    uint64_t key;
    struct {
        // Input vector size code as written by computeKey():
        // 3 = vec4, 2 = vec3, 1 = vec2, 0 = anything else.
        uint32_t inVecSize          :2;  // [0 - 1]
        // Output vector size code, same encoding as inVecSize.
        uint32_t outVecSize         :2;  // [2 - 3]
        // RS_TYPE_FLOAT_32 when the input element is float, otherwise 0.
        uint32_t inType             :4;  // [4 - 7]
        // RS_TYPE_FLOAT_32 when the output element is float, otherwise 0.
        uint32_t outType            :4;  // [8 - 11]
        // Set when the r,g,b columns are identical and nothing is added.
        uint32_t dot                :1;  // [12]
        uint32_t _unused1           :1;  // [13]
        // Set when alpha passes through unchanged (integer path only).
        uint32_t copyAlpha          :1;  // [14]
        uint32_t _unused2           :1;  // [15]
        // Bit i is set when matrix coefficient i is in use.
        uint32_t coeffMask          :16; // [16-31]
        // Bit j is set when add term j is in use.
        uint32_t addMask            :4;  // [32-35]
    } u;
};
125 
126 //Re-enable when intrinsic is fixed
127 #if defined(ARCH_ARM64_USE_INTRINSICS)
/**
 * Table of code fragment entry points handed to the arm64 assembly kernels.
 * It is filled in by the rsdIntrinsicColorMatrixSetup_*_K routines and then
 * passed to rsdIntrinsicColorMatrix_*_K; the field order is part of the
 * contract with that assembly, so it must not be rearranged.
 */
struct FunctionTab_t {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
};
135 
136 extern "C" void rsdIntrinsicColorMatrix_int_K(
137              void *out, void const *in, size_t count,
138              FunctionTab_t const *fns,
139              int16_t const *mult, int32_t const *add);
140 
141 extern "C" void rsdIntrinsicColorMatrix_float_K(
142              void *out, void const *in, size_t count,
143              FunctionTab_t const *fns,
144              float const *mult, float const *add);
145 
146 /* The setup functions fill in function tables to be used by above functions;
147  * this code also eliminates jump-to-another-jump cases by short-circuiting
148  * empty functions.  While it's not performance critical, it works out easier
149  * to write the set-up code in assembly than to try to expose the same symbols
150  * and write the code in C.
151  */
152 extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
153              FunctionTab_t *fns,
154              uint32_t mask, int dt, int st);
155 
156 extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
157              FunctionTab_t *fns,
158              uint32_t mask, int dt, int st);
159 #endif
160 
161 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
162 public:
163     void populateScript(Script *) override;
164 
165     void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
166 
167     ~RsdCpuScriptIntrinsicColorMatrix() override;
168     RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
169 
170     void preLaunch(uint32_t slot, const Allocation ** ains,
171                    uint32_t inLen, Allocation * aout, const void * usr,
172                    uint32_t usrLen, const RsScriptCall *sc) override;
173 
174 protected:
175     float fp[16];
176     float fpa[4];
177 
178     // The following four fields are read as constants
179     // by the SIMD assembly code.
180     short ip[16];
181     int ipa[4];
182     float tmpFp[16];
183     float tmpFpa[4];
184 #if defined(ARCH_ARM64_USE_INTRINSICS)
185     FunctionTab_t mFnTab;
186 #endif
187 
188     static void kernel(const RsExpandKernelDriverInfo *info,
189                        uint32_t xstart, uint32_t xend,
190                        uint32_t outstep);
191     void updateCoeffCache(float fpMul, float addMul);
192 
193     Key_t mLastKey;
194     unsigned char *mBuf;
195     size_t mBufSize;
196 
197     Key_t computeKey(const Element *ein, const Element *eout);
198 
199     bool build(Key_t key);
200 
201     void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
202 
203 };
204 
205 }
206 }
207 
208 
computeKey(const Element * ein,const Element * eout)209 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
210         const Element *ein, const Element *eout) {
211 
212     Key_t key;
213     key.key = 0;
214 
215     // Compute a unique code key for this operation
216 
217     // Add to the key the input and output types
218     bool hasFloat = false;
219     if (ein->getType() == RS_TYPE_FLOAT_32) {
220         hasFloat = true;
221         key.u.inType = RS_TYPE_FLOAT_32;
222         rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
223     }
224     if (eout->getType() == RS_TYPE_FLOAT_32) {
225         hasFloat = true;
226         key.u.outType = RS_TYPE_FLOAT_32;
227         rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
228     }
229 
230     // Mask in the bits indicating which coefficients in the
231     // color matrix are needed.
232     if (hasFloat) {
233         for (uint32_t i=0; i < 16; i++) {
234             if (fabs(fp[i]) != 0.f) {
235                 key.u.coeffMask |= 1 << i;
236             }
237         }
238         if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
239         if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
240         if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
241         if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
242 
243     } else {
244         for (uint32_t i=0; i < 16; i++) {
245             if (ip[i] != 0) {
246                 key.u.coeffMask |= 1 << i;
247             }
248         }
249         if (ipa[0] != 0) key.u.addMask |= 0x1;
250         if (ipa[1] != 0) key.u.addMask |= 0x2;
251         if (ipa[2] != 0) key.u.addMask |= 0x4;
252         if (ipa[3] != 0) key.u.addMask |= 0x8;
253     }
254 
255     // Look for a dot product where the r,g,b colums are the same
256     if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
257         (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
258         (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
259         (ip[12] == ip[13]) && (ip[12] == ip[14])) {
260 
261         if (!key.u.addMask) key.u.dot = 1;
262     }
263 
264     // Is alpha a simple copy
265     if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
266         key.u.copyAlpha = !(key.u.inType || key.u.outType);
267     }
268 
269     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
270 
271     switch (ein->getVectorSize()) {
272     case 4:
273         key.u.inVecSize = 3;
274         break;
275     case 3:
276         key.u.inVecSize = 2;
277         key.u.coeffMask &= ~0xF000;
278         break;
279     case 2:
280         key.u.inVecSize = 1;
281         key.u.coeffMask &= ~0xFF00;
282         break;
283     default:
284         key.u.coeffMask &= ~0xFFF0;
285         break;
286     }
287 
288     switch (eout->getVectorSize()) {
289     case 4:
290         key.u.outVecSize = 3;
291         break;
292     case 3:
293         key.u.outVecSize = 2;
294         key.u.coeffMask &= ~0x8888;
295         key.u.addMask &= 7;
296         break;
297     case 2:
298         key.u.outVecSize = 1;
299         key.u.coeffMask &= ~0xCCCC;
300         key.u.addMask &= 3;
301         break;
302     default:
303         key.u.coeffMask &= ~0xEEEE;
304         key.u.addMask &= 1;
305         break;
306     }
307 
308     if (key.u.inType && !key.u.outType) {
309         key.u.addMask |= 1;
310         if (key.u.outVecSize > 0) key.u.addMask |= 2;
311         if (key.u.outVecSize > 1) key.u.addMask |= 4;
312         if (key.u.outVecSize > 2) key.u.addMask |= 8;
313     }
314 
315     //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
316     return key;
317 }
318 
319 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
320 
// Declares the three external symbols that describe one pre-assembled code
// chunk named `x`: the chunk itself, its `_end` marker and its `_len` word.
// ADD_CHUNK copies `_len` bytes starting at the chunk symbol.
// NOTE(review): the symbols are presumably defined in a companion assembly
// file - confirm against the build.
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologues (integer / float) and the two-part epilogue.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads: u8 for the integer path, u8f for u8 input on the float path,
// f32 for float input.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores: u8 for the integer path, f32 for float output, f32u for u8
// output on the float path.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer path widening / narrowing, the dot-product pack, and add chunks.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
368 
// Appends the pre-assembled chunk `x` to the code being generated: copies
// _N_ColorMatrix_<x>_len bytes from _N_ColorMatrix_<x> to the local cursor
// `buf` and advances `buf` past them.  Expects a `uint8_t *buf` in scope.
// Wrapped in do/while(0) so the macro behaves as a single statement, e.g.
// under an unbraced if/else.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
372 
373 
374 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
375     size_t off = (target - buf - 8) >> 2;
376     rsAssert(((off & 0xff000000) == 0) ||
377            ((off & 0xff000000) == 0xff000000));
378 
379     uint32_t op = (condition << 28);
380     op |= 0xa << 24;  // branch
381     op |= 0xffffff & off;
382     ((uint32_t *)buf)[0] = op;
383     return buf + 4;
384 }
385 
encodeSIMDRegs(uint32_t vd,uint32_t vn,uint32_t vm)386 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
387     rsAssert(vd < 32);
388     rsAssert(vm < 32);
389     rsAssert(vn < 32);
390 
391     uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
392     op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
393     op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
394     return op;
395 }
396 
addVMLAL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)397 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
398     //vmlal.s16 Q#1, D#1, D#2[#]
399     uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
400     ((uint32_t *)buf)[0] = op;
401     return buf + 4;
402 }
403 
addVMULL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)404 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
405     //vmull.s16 Q#1, D#1, D#2[#]
406     uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
407     ((uint32_t *)buf)[0] = op;
408     return buf + 4;
409 }
410 
addVQADD_S32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)411 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
412     //vqadd.s32 Q#1, Q#1, Q#2
413     uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
414     ((uint32_t *)buf)[0] = op;
415     return buf + 4;
416 }
417 
addVMLAL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)418 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
419     //vmlal.f32 Q#1, D#1, D#2[#]
420     uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
421     ((uint32_t *)buf)[0] = op;
422     return buf + 4;
423 }
424 
addVMULL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)425 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
426     //vmull.f32 Q#1, D#1, D#2[#]
427     uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
428     ((uint32_t *)buf)[0] = op;
429     return buf + 4;
430 }
431 
addVORR_32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)432 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
433     //vadd.f32 Q#1, D#1, D#2
434     uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
435     ((uint32_t *)buf)[0] = op;
436     return buf + 4;
437 }
438 
addVMOV_32(uint8_t * buf,uint32_t dest_q,uint32_t imm)439 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
440     //vmov.32 Q#1, #imm
441     rsAssert(imm == 0);
442     uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
443     ((uint32_t *)buf)[0] = op;
444     return buf + 4;
445 }
446 
addVADD_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)447 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
448     //vadd.f32 Q#1, D#1, D#2
449     uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
450     ((uint32_t *)buf)[0] = op;
451     return buf + 4;
452 }
453 #endif
454 
455 #if defined(ARCH_X86_HAVE_SSSE3)
456 extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
457                                   const short *coef, uint32_t count);
458 extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
459                                   const short *coef, uint32_t count);
460 extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
461                                   const short *coef, uint32_t count);
462 
selectKernel(Key_t key)463 void * selectKernel(Key_t key)
464 {
465     void * kernel = nullptr;
466 
467     // inType, outType float if nonzero
468     if (!(key.u.inType || key.u.outType)) {
469         if (key.u.dot)
470             kernel = (void *)rsdIntrinsicColorMatrixDot_K;
471         else if (key.u.copyAlpha)
472             kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
473         else
474             kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
475     }
476 
477     return kernel;
478 }
479 #endif
480 
build(Key_t key)481 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
482 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
483     mBufSize = 4096;
484     //StopWatch build_time("rs cm: build time");
485     mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
486                                   MAP_PRIVATE | MAP_ANON, -1, 0);
487     if (mBuf == MAP_FAILED) {
488         mBuf = NULL;
489         return false;
490     }
491 
492     uint8_t *buf = mBuf;
493     uint8_t *buf2 = nullptr;
494 
495     int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
496     int opInit[4] = {0, 0, 0, 0};
497 
498     memset(ops, 0, sizeof(ops));
499     for (int i=0; i < 4; i++) {
500         if (key.u.coeffMask & (1 << (i*4))) {
501             ops[i][0] = 0x2 | opInit[0];
502             opInit[0] = 1;
503         }
504         if (!key.u.dot) {
505             if (key.u.coeffMask & (1 << (1 + i*4))) {
506                 ops[i][1] = 0x2 | opInit[1];
507                 opInit[1] = 1;
508             }
509             if (key.u.coeffMask & (1 << (2 + i*4))) {
510                 ops[i][2] = 0x2 | opInit[2];
511                 opInit[2] = 1;
512             }
513         }
514         if (!key.u.copyAlpha) {
515             if (key.u.coeffMask & (1 << (3 + i*4))) {
516                 ops[i][3] = 0x2 | opInit[3];
517                 opInit[3] = 1;
518             }
519         }
520     }
521 
522     if (key.u.inType || key.u.outType) {
523         key.u.copyAlpha = 0;
524         ADD_CHUNK(prefix_f);
525         buf2 = buf;
526 
527         // Load the incoming r,g,b,a as needed
528         if (key.u.inType) {
529             switch(key.u.inVecSize) {
530             case 3:
531                 ADD_CHUNK(load_f32_4);
532                 break;
533             case 2:
534                 ADD_CHUNK(load_f32_3);
535                 break;
536             case 1:
537                 ADD_CHUNK(load_f32_2);
538                 break;
539             case 0:
540                 ADD_CHUNK(load_f32_1);
541                 break;
542             }
543         } else {
544             switch(key.u.inVecSize) {
545             case 3:
546                 ADD_CHUNK(load_u8f_4);
547                 break;
548             case 2:
549                 ADD_CHUNK(load_u8f_3);
550                 break;
551             case 1:
552                 ADD_CHUNK(load_u8f_2);
553                 break;
554             case 0:
555                 ADD_CHUNK(load_u8f_1);
556                 break;
557             }
558         }
559 
560         for (int i=0; i < 4; i++) {
561             for (int j=0; j < 4; j++) {
562                 switch(ops[i][j]) {
563                 case 0:
564                     break;
565                 case 2:
566                     buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
567                     break;
568                 case 3:
569                     buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
570                     break;
571                 }
572             }
573         }
574         for (int j=0; j < 4; j++) {
575             if (opInit[j]) {
576                 if (key.u.addMask & (1 << j)) {
577                     buf = addVADD_F32(buf, j, 12+j, 8+j);
578                 } else {
579                     buf = addVORR_32(buf, j, 12+j, 12+j);
580                 }
581             } else {
582                 if (key.u.addMask & (1 << j)) {
583                     buf = addVORR_32(buf, j, 8+j, 8+j);
584                 } else {
585                     buf = addVMOV_32(buf, j, 0);
586                 }
587             }
588         }
589 
590         if (key.u.outType) {
591             switch(key.u.outVecSize) {
592             case 3:
593                 ADD_CHUNK(store_f32_4);
594                 break;
595             case 2:
596                 ADD_CHUNK(store_f32_3);
597                 break;
598             case 1:
599                 ADD_CHUNK(store_f32_2);
600                 break;
601             case 0:
602                 ADD_CHUNK(store_f32_1);
603                 break;
604             }
605         } else {
606             switch(key.u.outVecSize) {
607             case 3:
608             case 2:
609                 ADD_CHUNK(store_f32u_4);
610                 break;
611             case 1:
612                 ADD_CHUNK(store_f32u_2);
613                 break;
614             case 0:
615                 ADD_CHUNK(store_f32u_1);
616                 break;
617             }
618         }
619 
620 
621     } else {
622         // Add the function prefix
623         // Store the address for the loop return
624         ADD_CHUNK(prefix_i);
625         buf2 = buf;
626 
627         // Load the incoming r,g,b,a as needed
628         switch(key.u.inVecSize) {
629         case 3:
630             ADD_CHUNK(load_u8_4);
631             if (key.u.copyAlpha) {
632                 ADD_CHUNK(unpack_u8_3);
633             } else {
634                 ADD_CHUNK(unpack_u8_4);
635             }
636             break;
637         case 2:
638             ADD_CHUNK(load_u8_3);
639             ADD_CHUNK(unpack_u8_3);
640             break;
641         case 1:
642             ADD_CHUNK(load_u8_2);
643             ADD_CHUNK(unpack_u8_2);
644             break;
645         case 0:
646             ADD_CHUNK(load_u8_1);
647             ADD_CHUNK(unpack_u8_1);
648             break;
649         }
650 
651         // Add multiply and accumulate
652         // use MULL to init the output register,
653         // use MLAL from there
654         for (int i=0; i < 4; i++) {
655             for (int j=0; j < 4; j++) {
656                 switch(ops[i][j]) {
657                 case 0:
658                     break;
659                 case 2:
660                     buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
661                     break;
662                 case 3:
663                     buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
664                     break;
665                 }
666             }
667         }
668         for (int j=0; j < 4; j++) {
669             if (opInit[j]) {
670                 if (key.u.addMask & (1 << j)) {
671                     buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
672                 }
673             } else {
674                 if (key.u.addMask & (1 << j)) {
675                     buf = addVORR_32(buf, 8+j, 4+j, 4+j);
676                 }
677             }
678         }
679 
680         // If we have a dot product, perform the special pack.
681         if (key.u.dot) {
682             ADD_CHUNK(pack_u8_1);
683             ADD_CHUNK(dot);
684         } else {
685             switch(key.u.outVecSize) {
686             case 3:
687                 if (key.u.copyAlpha) {
688                     ADD_CHUNK(pack_u8_3);
689                 } else {
690                     ADD_CHUNK(pack_u8_4);
691                 }
692                 break;
693             case 2:
694                 ADD_CHUNK(pack_u8_3);
695                 break;
696             case 1:
697                 ADD_CHUNK(pack_u8_2);
698                 break;
699             case 0:
700                 ADD_CHUNK(pack_u8_1);
701                 break;
702             }
703         }
704 
705         // Write out result
706         switch(key.u.outVecSize) {
707         case 3:
708         case 2:
709             ADD_CHUNK(store_u8_4);
710             break;
711         case 1:
712             ADD_CHUNK(store_u8_2);
713             break;
714         case 0:
715             ADD_CHUNK(store_u8_1);
716             break;
717         }
718     }
719 
720     if (key.u.inType != key.u.outType) {
721         key.u.copyAlpha = 0;
722         key.u.dot = 0;
723     }
724 
725     // Loop, branch, and cleanup
726     ADD_CHUNK(postfix1);
727     buf = addBranch(buf, buf2, 0x01);
728     ADD_CHUNK(postfix2);
729 
730     int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
731     if (ret == -1) {
732         ALOGE("mprotect error %i", ret);
733         return false;
734     }
735 
736     __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
737     return true;
738 #else
739     return false;
740 #endif
741 }
742 
updateCoeffCache(float fpMul,float addMul)743 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
744     for(int ct=0; ct < 16; ct++) {
745         ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
746         tmpFp[ct] = fp[ct] * fpMul;
747         //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
748     }
749 
750     float add = 0.f;
751     if (fpMul > 254.f) add = 0.5f;
752     for(int ct=0; ct < 4; ct++) {
753         tmpFpa[ct] = fpa[ct] * addMul + add;
754         //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
755     }
756 
757     for(int ct=0; ct < 4; ct++) {
758         ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
759     }
760 }
761 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)762 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
763                                                     size_t dataLength) {
764     switch(slot) {
765     case 0:
766         memcpy (fp, data, sizeof(fp));
767         break;
768     case 1:
769         memcpy (fpa, data, sizeof(fpa));
770         break;
771     default:
772         rsAssert(0);
773         break;
774     }
775     mRootPtr = &kernel;
776 }
777 
778 
One(const RsExpandKernelDriverInfo * info,void * out,const void * py,const float * coeff,const float * add,uint32_t vsin,uint32_t vsout,bool fin,bool fout)779 static void One(const RsExpandKernelDriverInfo *info, void *out,
780                 const void *py, const float* coeff, const float *add,
781                 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
782 
783     float4 f = 0.f;
784     if (fin) {
785         switch(vsin) {
786         case 3:
787             f = ((const float4 *)py)[0];
788             break;
789         case 2:
790             f = ((const float4 *)py)[0];
791             f.w = 0.f;
792             break;
793         case 1:
794             f.xy = ((const float2 *)py)[0];
795             break;
796         case 0:
797             f.x = ((const float *)py)[0];
798             break;
799         }
800     } else {
801         switch(vsin) {
802         case 3:
803             f = convert_float4(((const uchar4 *)py)[0]);
804             break;
805         case 2:
806             f = convert_float4(((const uchar4 *)py)[0]);
807             f.w = 0.f;
808             break;
809         case 1:
810             f.xy = convert_float2(((const uchar2 *)py)[0]);
811             break;
812         case 0:
813             f.x = (float)(((const uchar *)py)[0]);
814             break;
815         }
816     }
817     //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
818 
819     float4 sum;
820     sum.x = f.x * coeff[0] +
821             f.y * coeff[4] +
822             f.z * coeff[8] +
823             f.w * coeff[12];
824     sum.y = f.x * coeff[1] +
825             f.y * coeff[5] +
826             f.z * coeff[9] +
827             f.w * coeff[13];
828     sum.z = f.x * coeff[2] +
829             f.y * coeff[6] +
830             f.z * coeff[10] +
831             f.w * coeff[14];
832     sum.w = f.x * coeff[3] +
833             f.y * coeff[7] +
834             f.z * coeff[11] +
835             f.w * coeff[15];
836     //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
837 
838     sum.x += add[0];
839     sum.y += add[1];
840     sum.z += add[2];
841     sum.w += add[3];
842 
843 
844     //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
845     if (fout) {
846         switch(vsout) {
847         case 3:
848         case 2:
849             ((float4 *)out)[0] = sum;
850             break;
851         case 1:
852             ((float2 *)out)[0] = sum.xy;
853             break;
854         case 0:
855             ((float *)out)[0] = sum.x;
856             break;
857         }
858     } else {
859         sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
860         sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
861         sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
862         sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
863 
864         switch(vsout) {
865         case 3:
866         case 2:
867             ((uchar4 *)out)[0] = convert_uchar4(sum);
868             break;
869         case 1:
870             ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
871             break;
872         case 0:
873             ((uchar *)out)[0] = sum.x;
874             break;
875         }
876     }
877     //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
878 }
879 
kernel(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)880 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
881                                               uint32_t xstart, uint32_t xend,
882                                               uint32_t outstep) {
883     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
884 
885     uint32_t instep = info->inStride[0];
886 
887     uchar *out = (uchar *)info->outPtr[0];
888     uchar *in = (uchar *)info->inPtr[0];
889     uint32_t x1 = xstart;
890     uint32_t x2 = xend;
891 
892     uint32_t vsin = cp->mLastKey.u.inVecSize;
893     uint32_t vsout = cp->mLastKey.u.outVecSize;
894     bool floatIn = !!cp->mLastKey.u.inType;
895     bool floatOut = !!cp->mLastKey.u.outType;
896 
897     //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
898 
899     if(x2 > x1) {
900         int32_t len = x2 - x1;
901         if (gArchUseSIMD) {
902             if((cp->mOptKernel != nullptr) && (len >= 4)) {
903                 // The optimized kernel processes 4 pixels at once
904                 // and requires a minimum of 1 chunk of 4
905                 cp->mOptKernel(out, in, cp->ip, len >> 2);
906                 // Update the len and pointers so the generic code can
907                 // finish any leftover pixels
908                 len &= ~3;
909                 x1 += len;
910                 out += outstep * len;
911                 in += instep * len;
912             }
913 #if defined(ARCH_ARM64_USE_INTRINSICS)
914             else {
915                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
916                     // Currently this generates off by one errors.
917                     //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
918                     //x1 += len;
919                     //out += outstep * len;
920                     //in += instep * len;
921                 } else {
922                     rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
923                     x1 += len;
924                     out += outstep * len;
925                     in += instep * len;
926                 }
927             }
928 #endif
929         }
930 
931         while(x1 != x2) {
932             One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
933             out += outstep;
934             in += instep;
935             x1++;
936         }
937     }
938 }
939 
preLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)940 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
941                                                  const Allocation ** ains,
942                                                  uint32_t inLen,
943                                                  Allocation * aout,
944                                                  const void * usr,
945                                                  uint32_t usrLen,
946                                                  const RsScriptCall *sc) {
947 
948     const Element *ein = ains[0]->mHal.state.type->getElement();
949     const Element *eout = aout->mHal.state.type->getElement();
950 
951     if (ein->getType() == eout->getType()) {
952         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
953             updateCoeffCache(1.f, 255.f);
954         } else {
955             updateCoeffCache(1.f, 1.f);
956         }
957     } else {
958         if (eout->getType() == RS_TYPE_UNSIGNED_8) {
959             updateCoeffCache(255.f, 255.f);
960         } else {
961             updateCoeffCache(1.f / 255.f, 1.f);
962         }
963     }
964 
965     Key_t key = computeKey(ein, eout);
966 
967 #if defined(ARCH_X86_HAVE_SSSE3)
968     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
969         // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
970         // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
971         mLastKey = key;
972     }
973 
974 #else //if !defined(ARCH_X86_HAVE_SSSE3)
975     if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
976         if (mBuf) munmap(mBuf, mBufSize);
977         mBuf = nullptr;
978         mOptKernel = nullptr;
979         if (build(key)) {
980             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
981         }
982 #if defined(ARCH_ARM64_USE_INTRINSICS)
983         else {
984             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
985             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
986             uint32_t mm = 0;
987             int i;
988             for (i = 0; i < 4; i++)
989             {
990                 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
991                 m = ((m * 0x249) >> 9) & 15;
992                 m |= ((key.u.addMask >> i) & 1) << 4;
993                 mm |= m << (i * 5);
994             }
995 
996             if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
997                 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
998             } else {
999                 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1000             }
1001         }
1002 #endif
1003         mLastKey = key;
1004     }
1005 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
1006 }
1007 
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1008 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1009             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1010             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1011 
1012     mLastKey.key = 0;
1013     mBuf = nullptr;
1014     mBufSize = 0;
1015     mOptKernel = nullptr;
1016     const static float defaultMatrix[] = {
1017         1.f, 0.f, 0.f, 0.f,
1018         0.f, 1.f, 0.f, 0.f,
1019         0.f, 0.f, 1.f, 0.f,
1020         0.f, 0.f, 0.f, 1.f
1021     };
1022     const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1023     setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
1024     setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1025 }
1026 
~RsdCpuScriptIntrinsicColorMatrix()1027 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
1028     if (mBuf) munmap(mBuf, mBufSize);
1029     mBuf = nullptr;
1030     mOptKernel = nullptr;
1031 }
1032 
populateScript(Script * s)1033 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
1034     s->mHal.info.exportedVariableCount = 2;
1035 }
1036 
rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1037 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1038                                             const Script *s, const Element *e) {
1039 
1040     return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1041 }
1042