1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"
#include "linkloader/include/MemChunk.h"
#include "linkloader/utils/flush_cpu_cache.h"
//#include <utils/StopWatch.h>
30
31
/*  uint kernel
 *  Q0   D0:  Load slot for R
 *       D1:  Load slot for G
 *  Q1   D2:  Load slot for B
 *       D3:  Load slot for A
 *  Q2   D4:  Matrix
 *       D5:  =
 *  Q3   D6:  =
 *       D7:  =
 *  Q4   D8:  Add R
 *       D9:
 *  Q5   D10: Add G
 *       D11:
 *  Q6   D12: Add B
 *       D13:
 *  Q7   D14: Add A
 *       D15:
 *  Q8   D16: I32: R Sum
 *       D17:
 *  Q9   D18: I32: G Sum
 *       D19:
 *  Q10  D20: I32: B Sum
 *       D21:
 *  Q11  D22: I32: A Sum
 *       D23:
 *  Q12  D24: U16: expanded R
 *       D25:
 *  Q13  D26: U16: expanded G
 *       D27:
 *  Q14  D28: U16: expanded B
 *       D29:
 *  Q15  D30: U16: expanded A
 *       D31:
 *
 */
67
/*  float kernel
 *  Q0   D0:  Load slot for R
 *       D1:  =
 *  Q1   D2:  Load slot for G
 *       D3:  =
 *  Q2   D4:  Load slot for B
 *       D5:  =
 *  Q3   D6:  Load slot for A
 *       D7:  =
 *  Q4   D8:  Matrix
 *       D9:  =
 *  Q5   D10: =
 *       D11: =
 *  Q6   D12: =
 *       D13: =
 *  Q7   D14: =
 *       D15: =
 *  Q8   D16: Add R
 *       D17: =
 *  Q9   D18: Add G
 *       D19: =
 *  Q10  D20: Add B
 *       D21: =
 *  Q11  D22: Add A
 *       D23: =
 *  Q12  D24: Sum R
 *       D25: =
 *  Q13  D26: Sum G
 *       D27: =
 *  Q14  D28: Sum B
 *       D29: =
 *  Q15  D30: Sum A
 *       D31: =
 *
 */
103
104
105
106 using namespace android;
107 using namespace android::renderscript;
108
109 namespace android {
110 namespace renderscript {
111
/**
 * Specialization key for one color-matrix configuration.
 *
 * The bit-field view `u` and the 64-bit `key` alias the same storage, so two
 * configurations can be compared with a single integer comparison on `key`.
 * The vector-size fields hold (component count - 1); the type fields are
 * either 0 or RS_TYPE_FLOAT_32 (see computeKey()).
 */
typedef union {
    uint64_t key;
    struct {
        uint32_t inVecSize          :2;  // [0 - 1]
        uint32_t outVecSize         :2;  // [2 - 3]
        uint32_t inType             :4;  // [4 - 7]
        uint32_t outType            :4;  // [8 - 11]
        uint32_t dot                :1;  // [12]
        uint32_t _unused1           :1;  // [13]
        uint32_t copyAlpha          :1;  // [14]
        uint32_t _unused2           :1;  // [15]
        uint32_t coeffMask          :16; // [16-31] bit i set => matrix entry i is non-zero
        uint32_t addMask            :4;  // [32-35] bit j set => add term j is needed
    } u;
} Key_t;
127
//Re-enable when intrinsic is fixed
#if defined(ARCH_ARM64_USE_INTRINSICS)
/**
 * Table of code fragments used by the ARM64 assembly kernels. It is filled
 * in by the rsdIntrinsicColorMatrixSetup_*_K routines and then handed to
 * rsdIntrinsicColorMatrix_*_K on every launch.
 */
typedef struct {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;

// Integer (fixed-point) kernel; receives the 16-bit matrix and 32-bit add terms.
extern "C" void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

// Float kernel; receives the float matrix and float add terms.
extern "C" void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 */
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);
#endif
162
163 class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
164 public:
165 virtual void populateScript(Script *);
166
167 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
168
169 virtual ~RsdCpuScriptIntrinsicColorMatrix();
170 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
171
172 virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
173 const void * usr, uint32_t usrLen, const RsScriptCall *sc);
174 virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
175 const void * usr, uint32_t usrLen, const RsScriptCall *sc);
176
177 protected:
178 float fp[16];
179 float fpa[4];
180
181 // The following four fields are read as constants
182 // by the SIMD assembly code.
183 short ip[16];
184 int ipa[4];
185 float tmpFp[16];
186 float tmpFpa[4];
187 #if defined(ARCH_ARM64_USE_INTRINSICS)
188 FunctionTab_t mFnTab;
189 #endif
190
191 static void kernel(const RsForEachStubParamStruct *p,
192 uint32_t xstart, uint32_t xend,
193 uint32_t instep, uint32_t outstep);
194 void updateCoeffCache(float fpMul, float addMul);
195
196 Key_t mLastKey;
197 unsigned char *mBuf;
198 size_t mBufSize;
199
200 Key_t computeKey(const Element *ein, const Element *eout);
201
202 bool build(Key_t key);
203
204 void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
205
206 };
207
208 }
209 }
210
211
computeKey(const Element * ein,const Element * eout)212 Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
213 const Element *ein, const Element *eout) {
214
215 Key_t key;
216 key.key = 0;
217
218 // Compute a unique code key for this operation
219
220 // Add to the key the input and output types
221 bool hasFloat = false;
222 if (ein->getType() == RS_TYPE_FLOAT_32) {
223 hasFloat = true;
224 key.u.inType = RS_TYPE_FLOAT_32;
225 rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
226 }
227 if (eout->getType() == RS_TYPE_FLOAT_32) {
228 hasFloat = true;
229 key.u.outType = RS_TYPE_FLOAT_32;
230 rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
231 }
232
233 // Mask in the bits indicating which coefficients in the
234 // color matrix are needed.
235 if (hasFloat) {
236 for (uint32_t i=0; i < 16; i++) {
237 if (fabs(fp[i]) != 0.f) {
238 key.u.coeffMask |= 1 << i;
239 }
240 }
241 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
242 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
243 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
244 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
245
246 } else {
247 for (uint32_t i=0; i < 16; i++) {
248 if (ip[i] != 0) {
249 key.u.coeffMask |= 1 << i;
250 }
251 }
252 if (ipa[0] != 0) key.u.addMask |= 0x1;
253 if (ipa[1] != 0) key.u.addMask |= 0x2;
254 if (ipa[2] != 0) key.u.addMask |= 0x4;
255 if (ipa[3] != 0) key.u.addMask |= 0x8;
256 }
257
258 // Look for a dot product where the r,g,b colums are the same
259 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
260 (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
261 (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
262 (ip[12] == ip[13]) && (ip[12] == ip[14])) {
263
264 if (!key.u.addMask) key.u.dot = 1;
265 }
266
267 // Is alpha a simple copy
268 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
269 key.u.copyAlpha = !(key.u.inType || key.u.outType);
270 }
271
272 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
273
274 switch (ein->getVectorSize()) {
275 case 4:
276 key.u.inVecSize = 3;
277 break;
278 case 3:
279 key.u.inVecSize = 2;
280 key.u.coeffMask &= ~0xF000;
281 break;
282 case 2:
283 key.u.inVecSize = 1;
284 key.u.coeffMask &= ~0xFF00;
285 break;
286 default:
287 key.u.coeffMask &= ~0xFFF0;
288 break;
289 }
290
291 switch (eout->getVectorSize()) {
292 case 4:
293 key.u.outVecSize = 3;
294 break;
295 case 3:
296 key.u.outVecSize = 2;
297 key.u.coeffMask &= ~0x8888;
298 key.u.addMask &= 7;
299 break;
300 case 2:
301 key.u.outVecSize = 1;
302 key.u.coeffMask &= ~0xCCCC;
303 key.u.addMask &= 3;
304 break;
305 default:
306 key.u.coeffMask &= ~0xEEEE;
307 key.u.addMask &= 1;
308 break;
309 }
310
311 if (key.u.inType && !key.u.outType) {
312 key.u.addMask |= 1;
313 if (key.u.outVecSize > 0) key.u.addMask |= 2;
314 if (key.u.outVecSize > 1) key.u.addMask |= 4;
315 if (key.u.outVecSize > 2) key.u.addMask |= 8;
316 }
317
318 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
319 return key;
320 }
321
322 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
323
// Declares the three symbols exported for one pre-assembled code fragment:
// its start, its end, and a word holding its length in bytes.
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float) and the two-part epilogue.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads, by source type and component count.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores, by destination type and component count.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-path unpack / pack fragments and special cases.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
371
// Copies the pre-assembled fragment `x` to the write cursor `buf` (a local
// of the expanding function) and advances the cursor past it. Wrapped in
// do/while(0) so the two statements behave as one, even under an unbraced if.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
375
376
377 static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
378 size_t off = (target - buf - 8) >> 2;
379 rsAssert(((off & 0xff000000) == 0) ||
380 ((off & 0xff000000) == 0xff000000));
381
382 uint32_t op = (condition << 28);
383 op |= 0xa << 24; // branch
384 op |= 0xffffff & off;
385 ((uint32_t *)buf)[0] = op;
386 return buf + 4;
387 }
388
encodeSIMDRegs(uint32_t vd,uint32_t vn,uint32_t vm)389 static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
390 rsAssert(vd < 32);
391 rsAssert(vm < 32);
392 rsAssert(vn < 32);
393
394 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
395 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
396 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
397 return op;
398 }
399
addVMLAL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)400 static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
401 //vmlal.s16 Q#1, D#1, D#2[#]
402 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
403 ((uint32_t *)buf)[0] = op;
404 return buf + 4;
405 }
406
addVMULL_S16(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)407 static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
408 //vmull.s16 Q#1, D#1, D#2[#]
409 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
410 ((uint32_t *)buf)[0] = op;
411 return buf + 4;
412 }
413
addVQADD_S32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)414 static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
415 //vqadd.s32 Q#1, Q#1, Q#2
416 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
417 ((uint32_t *)buf)[0] = op;
418 return buf + 4;
419 }
420
addVMLAL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)421 static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
422 //vmlal.f32 Q#1, D#1, D#2[#]
423 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
424 ((uint32_t *)buf)[0] = op;
425 return buf + 4;
426 }
427
addVMULL_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_d1,uint32_t src_d2,uint32_t src_d2_s)428 static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
429 //vmull.f32 Q#1, D#1, D#2[#]
430 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
431 ((uint32_t *)buf)[0] = op;
432 return buf + 4;
433 }
434
addVORR_32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)435 static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
436 //vadd.f32 Q#1, D#1, D#2
437 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
438 ((uint32_t *)buf)[0] = op;
439 return buf + 4;
440 }
441
addVMOV_32(uint8_t * buf,uint32_t dest_q,uint32_t imm)442 static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
443 //vmov.32 Q#1, #imm
444 rsAssert(imm == 0);
445 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
446 ((uint32_t *)buf)[0] = op;
447 return buf + 4;
448 }
449
addVADD_F32(uint8_t * buf,uint32_t dest_q,uint32_t src_q1,uint32_t src_q2)450 static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
451 //vadd.f32 Q#1, D#1, D#2
452 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
453 ((uint32_t *)buf)[0] = op;
454 return buf + 4;
455 }
456 #endif
457
458 #if defined(ARCH_X86_HAVE_SSSE3)
// Externally implemented x86 kernels. All share the signature of
// mOptKernel: (dst, src, 16-entry fixed-point matrix, count).
extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
465
selectKernel(Key_t key)466 void * selectKernel(Key_t key)
467 {
468 void * kernel = NULL;
469
470 // inType, outType float if nonzero
471 if (!(key.u.inType || key.u.outType)) {
472 if (key.u.dot)
473 kernel = (void *)rsdIntrinsicColorMatrixDot_K;
474 else if (key.u.copyAlpha)
475 kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
476 else
477 kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
478 }
479
480 return kernel;
481 }
482 #endif
483
build(Key_t key)484 bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
485 #if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
486 mBufSize = 4096;
487 //StopWatch build_time("rs cm: build time");
488 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
489 MAP_PRIVATE | MAP_ANON, -1, 0);
490 if (mBuf == MAP_FAILED) {
491 mBuf = NULL;
492 return false;
493 }
494
495 uint8_t *buf = mBuf;
496 uint8_t *buf2 = NULL;
497
498 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final
499 int opInit[4] = {0, 0, 0, 0};
500
501 memset(ops, 0, sizeof(ops));
502 for (int i=0; i < 4; i++) {
503 if (key.u.coeffMask & (1 << (i*4))) {
504 ops[i][0] = 0x2 | opInit[0];
505 opInit[0] = 1;
506 }
507 if (!key.u.dot) {
508 if (key.u.coeffMask & (1 << (1 + i*4))) {
509 ops[i][1] = 0x2 | opInit[1];
510 opInit[1] = 1;
511 }
512 if (key.u.coeffMask & (1 << (2 + i*4))) {
513 ops[i][2] = 0x2 | opInit[2];
514 opInit[2] = 1;
515 }
516 }
517 if (!key.u.copyAlpha) {
518 if (key.u.coeffMask & (1 << (3 + i*4))) {
519 ops[i][3] = 0x2 | opInit[3];
520 opInit[3] = 1;
521 }
522 }
523 }
524
525 if (key.u.inType || key.u.outType) {
526 key.u.copyAlpha = 0;
527 ADD_CHUNK(prefix_f);
528 buf2 = buf;
529
530 // Load the incoming r,g,b,a as needed
531 if (key.u.inType) {
532 switch(key.u.inVecSize) {
533 case 3:
534 ADD_CHUNK(load_f32_4);
535 break;
536 case 2:
537 ADD_CHUNK(load_f32_3);
538 break;
539 case 1:
540 ADD_CHUNK(load_f32_2);
541 break;
542 case 0:
543 ADD_CHUNK(load_f32_1);
544 break;
545 }
546 } else {
547 switch(key.u.inVecSize) {
548 case 3:
549 ADD_CHUNK(load_u8f_4);
550 break;
551 case 2:
552 ADD_CHUNK(load_u8f_3);
553 break;
554 case 1:
555 ADD_CHUNK(load_u8f_2);
556 break;
557 case 0:
558 ADD_CHUNK(load_u8f_1);
559 break;
560 }
561 }
562
563 for (int i=0; i < 4; i++) {
564 for (int j=0; j < 4; j++) {
565 switch(ops[i][j]) {
566 case 0:
567 break;
568 case 2:
569 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
570 break;
571 case 3:
572 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
573 break;
574 }
575 }
576 }
577 for (int j=0; j < 4; j++) {
578 if (opInit[j]) {
579 if (key.u.addMask & (1 << j)) {
580 buf = addVADD_F32(buf, j, 12+j, 8+j);
581 } else {
582 buf = addVORR_32(buf, j, 12+j, 12+j);
583 }
584 } else {
585 if (key.u.addMask & (1 << j)) {
586 buf = addVORR_32(buf, j, 8+j, 8+j);
587 } else {
588 buf = addVMOV_32(buf, j, 0);
589 }
590 }
591 }
592
593 if (key.u.outType) {
594 switch(key.u.outVecSize) {
595 case 3:
596 ADD_CHUNK(store_f32_4);
597 break;
598 case 2:
599 ADD_CHUNK(store_f32_3);
600 break;
601 case 1:
602 ADD_CHUNK(store_f32_2);
603 break;
604 case 0:
605 ADD_CHUNK(store_f32_1);
606 break;
607 }
608 } else {
609 switch(key.u.outVecSize) {
610 case 3:
611 case 2:
612 ADD_CHUNK(store_f32u_4);
613 break;
614 case 1:
615 ADD_CHUNK(store_f32u_2);
616 break;
617 case 0:
618 ADD_CHUNK(store_f32u_1);
619 break;
620 }
621 }
622
623
624 } else {
625 // Add the function prefix
626 // Store the address for the loop return
627 ADD_CHUNK(prefix_i);
628 buf2 = buf;
629
630 // Load the incoming r,g,b,a as needed
631 switch(key.u.inVecSize) {
632 case 3:
633 ADD_CHUNK(load_u8_4);
634 if (key.u.copyAlpha) {
635 ADD_CHUNK(unpack_u8_3);
636 } else {
637 ADD_CHUNK(unpack_u8_4);
638 }
639 break;
640 case 2:
641 ADD_CHUNK(load_u8_3);
642 ADD_CHUNK(unpack_u8_3);
643 break;
644 case 1:
645 ADD_CHUNK(load_u8_2);
646 ADD_CHUNK(unpack_u8_2);
647 break;
648 case 0:
649 ADD_CHUNK(load_u8_1);
650 ADD_CHUNK(unpack_u8_1);
651 break;
652 }
653
654 // Add multiply and accumulate
655 // use MULL to init the output register,
656 // use MLAL from there
657 for (int i=0; i < 4; i++) {
658 for (int j=0; j < 4; j++) {
659 switch(ops[i][j]) {
660 case 0:
661 break;
662 case 2:
663 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
664 break;
665 case 3:
666 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
667 break;
668 }
669 }
670 }
671 for (int j=0; j < 4; j++) {
672 if (opInit[j]) {
673 if (key.u.addMask & (1 << j)) {
674 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
675 }
676 } else {
677 if (key.u.addMask & (1 << j)) {
678 buf = addVORR_32(buf, 8+j, 4+j, 4+j);
679 }
680 }
681 }
682
683 // If we have a dot product, perform the special pack.
684 if (key.u.dot) {
685 ADD_CHUNK(pack_u8_1);
686 ADD_CHUNK(dot);
687 } else {
688 switch(key.u.outVecSize) {
689 case 3:
690 if (key.u.copyAlpha) {
691 ADD_CHUNK(pack_u8_3);
692 } else {
693 ADD_CHUNK(pack_u8_4);
694 }
695 break;
696 case 2:
697 ADD_CHUNK(pack_u8_3);
698 break;
699 case 1:
700 ADD_CHUNK(pack_u8_2);
701 break;
702 case 0:
703 ADD_CHUNK(pack_u8_1);
704 break;
705 }
706 }
707
708 // Write out result
709 switch(key.u.outVecSize) {
710 case 3:
711 case 2:
712 ADD_CHUNK(store_u8_4);
713 break;
714 case 1:
715 ADD_CHUNK(store_u8_2);
716 break;
717 case 0:
718 ADD_CHUNK(store_u8_1);
719 break;
720 }
721 }
722
723 if (key.u.inType != key.u.outType) {
724 key.u.copyAlpha = 0;
725 key.u.dot = 0;
726 }
727
728 // Loop, branch, and cleanup
729 ADD_CHUNK(postfix1);
730 buf = addBranch(buf, buf2, 0x01);
731 ADD_CHUNK(postfix2);
732
733 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
734 if (ret == -1) {
735 ALOGE("mprotect error %i", ret);
736 return false;
737 }
738
739 FLUSH_CPU_CACHE(mBuf, (char*) mBuf + mBufSize);
740 return true;
741 #else
742 return false;
743 #endif
744 }
745
updateCoeffCache(float fpMul,float addMul)746 void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
747 for(int ct=0; ct < 16; ct++) {
748 ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
749 tmpFp[ct] = fp[ct] * fpMul;
750 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]);
751 }
752
753 float add = 0.f;
754 if (fpMul > 254.f) add = 0.5f;
755 for(int ct=0; ct < 4; ct++) {
756 tmpFpa[ct] = fpa[ct] * addMul + add;
757 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
758 }
759
760 for(int ct=0; ct < 4; ct++) {
761 ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
762 }
763 }
764
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)765 void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
766 size_t dataLength) {
767 switch(slot) {
768 case 0:
769 memcpy (fp, data, sizeof(fp));
770 break;
771 case 1:
772 memcpy (fpa, data, sizeof(fpa));
773 break;
774 default:
775 rsAssert(0);
776 break;
777 }
778 mRootPtr = &kernel;
779 }
780
781
One(const RsForEachStubParamStruct * p,void * out,const void * py,const float * coeff,const float * add,uint32_t vsin,uint32_t vsout,bool fin,bool fout)782 static void One(const RsForEachStubParamStruct *p, void *out,
783 const void *py, const float* coeff, const float *add,
784 uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
785
786 float4 f = 0.f;
787 if (fin) {
788 switch(vsin) {
789 case 3:
790 f = ((const float4 *)py)[0];
791 break;
792 case 2:
793 f = ((const float4 *)py)[0];
794 f.w = 0.f;
795 break;
796 case 1:
797 f.xy = ((const float2 *)py)[0];
798 break;
799 case 0:
800 f.x = ((const float *)py)[0];
801 break;
802 }
803 } else {
804 switch(vsin) {
805 case 3:
806 f = convert_float4(((const uchar4 *)py)[0]);
807 break;
808 case 2:
809 f = convert_float4(((const uchar4 *)py)[0]);
810 f.w = 0.f;
811 break;
812 case 1:
813 f.xy = convert_float2(((const uchar2 *)py)[0]);
814 break;
815 case 0:
816 f.x = (float)(((const uchar *)py)[0]);
817 break;
818 }
819 }
820 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);
821
822 float4 sum;
823 sum.x = f.x * coeff[0] +
824 f.y * coeff[4] +
825 f.z * coeff[8] +
826 f.w * coeff[12];
827 sum.y = f.x * coeff[1] +
828 f.y * coeff[5] +
829 f.z * coeff[9] +
830 f.w * coeff[13];
831 sum.z = f.x * coeff[2] +
832 f.y * coeff[6] +
833 f.z * coeff[10] +
834 f.w * coeff[14];
835 sum.w = f.x * coeff[3] +
836 f.y * coeff[7] +
837 f.z * coeff[11] +
838 f.w * coeff[15];
839 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
840
841 sum.x += add[0];
842 sum.y += add[1];
843 sum.z += add[2];
844 sum.w += add[3];
845
846
847 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
848 if (fout) {
849 switch(vsout) {
850 case 3:
851 case 2:
852 ((float4 *)out)[0] = sum;
853 break;
854 case 1:
855 ((float2 *)out)[0] = sum.xy;
856 break;
857 case 0:
858 ((float *)out)[0] = sum.x;
859 break;
860 }
861 } else {
862 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
863 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
864 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
865 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
866
867 switch(vsout) {
868 case 3:
869 case 2:
870 ((uchar4 *)out)[0] = convert_uchar4(sum);
871 break;
872 case 1:
873 ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
874 break;
875 case 0:
876 ((uchar *)out)[0] = sum.x;
877 break;
878 }
879 }
880 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
881 }
882
kernel(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)883 void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
884 uint32_t xstart, uint32_t xend,
885 uint32_t instep, uint32_t outstep) {
886 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
887 uchar *out = (uchar *)p->out;
888 uchar *in = (uchar *)p->in;
889 uint32_t x1 = xstart;
890 uint32_t x2 = xend;
891
892 uint32_t vsin = cp->mLastKey.u.inVecSize;
893 uint32_t vsout = cp->mLastKey.u.outVecSize;
894 bool floatIn = !!cp->mLastKey.u.inType;
895 bool floatOut = !!cp->mLastKey.u.outType;
896
897 //if (!p->y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);
898
899 if(x2 > x1) {
900 int32_t len = x2 - x1;
901 if (gArchUseSIMD) {
902 if((cp->mOptKernel != NULL) && (len >= 4)) {
903 // The optimized kernel processes 4 pixels at once
904 // and requires a minimum of 1 chunk of 4
905 cp->mOptKernel(out, in, cp->ip, len >> 2);
906 // Update the len and pointers so the generic code can
907 // finish any leftover pixels
908 len &= ~3;
909 x1 += len;
910 out += outstep * len;
911 in += instep * len;
912 }
913 #if defined(ARCH_ARM64_USE_INTRINSICS)
914 else {
915 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
916 // Currently this generates off by one errors.
917 //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
918 //x1 += len;
919 //out += outstep * len;
920 //in += instep * len;
921 } else {
922 rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
923 x1 += len;
924 out += outstep * len;
925 in += instep * len;
926 }
927 }
928 #endif
929 }
930
931 while(x1 != x2) {
932 One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
933 out += outstep;
934 in += instep;
935 x1++;
936 }
937 }
938 }
939
preLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)940 void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
941 uint32_t slot, const Allocation * ain, Allocation * aout,
942 const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
943
944 const Element *ein = ain->mHal.state.type->getElement();
945 const Element *eout = aout->mHal.state.type->getElement();
946
947 if (ein->getType() == eout->getType()) {
948 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
949 updateCoeffCache(1.f, 255.f);
950 } else {
951 updateCoeffCache(1.f, 1.f);
952 }
953 } else {
954 if (eout->getType() == RS_TYPE_UNSIGNED_8) {
955 updateCoeffCache(255.f, 255.f);
956 } else {
957 updateCoeffCache(1.f / 255.f, 1.f);
958 }
959 }
960
961 Key_t key = computeKey(ain->mHal.state.type->getElement(),
962 aout->mHal.state.type->getElement());
963 #if defined(ARCH_X86_HAVE_SSSE3)
964 if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
965 // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
966 // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
967 mLastKey = key;
968 }
969
970 #else //if !defined(ARCH_X86_HAVE_SSSE3)
971 if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
972 if (mBuf) munmap(mBuf, mBufSize);
973 mBuf = NULL;
974 mOptKernel = NULL;
975 if (build(key)) {
976 mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
977 }
978 #if defined(ARCH_ARM64_USE_INTRINSICS)
979 else {
980 int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
981 int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
982 uint32_t mm = 0;
983 int i;
984 for (i = 0; i < 4; i++)
985 {
986 uint32_t m = (key.u.coeffMask >> i) & 0x1111;
987 m = ((m * 0x249) >> 9) & 15;
988 m |= ((key.u.addMask >> i) & 1) << 4;
989 mm |= m << (i * 5);
990 }
991
992 if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
993 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
994 } else {
995 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
996 }
997 }
998 #endif
999 mLastKey = key;
1000 }
1001 #endif //if !defined(ARCH_X86_HAVE_SSSE3)
1002 }
1003
postLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)1004 void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
1005 uint32_t slot, const Allocation * ain, Allocation * aout,
1006 const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
1007
1008 }
1009
RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1010 RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1011 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1012 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1013
1014 mLastKey.key = 0;
1015 mBuf = NULL;
1016 mBufSize = 0;
1017 mOptKernel = NULL;
1018 const static float defaultMatrix[] = {
1019 1.f, 0.f, 0.f, 0.f,
1020 0.f, 1.f, 0.f, 0.f,
1021 0.f, 0.f, 1.f, 0.f,
1022 0.f, 0.f, 0.f, 1.f
1023 };
1024 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1025 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
1026 setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1027 }
1028
~RsdCpuScriptIntrinsicColorMatrix()1029 RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
1030 if (mBuf) munmap(mBuf, mBufSize);
1031 mBuf = NULL;
1032 mOptKernel = NULL;
1033 }
1034
populateScript(Script * s)1035 void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
1036 s->mHal.info.exportedVariableCount = 2;
1037 }
1038
rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)1039 RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1040 const Script *s, const Element *e) {
1041
1042 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1043 }
1044