/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <sys/mman.h>
#include <unistd.h>

#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"

#include <sys/mman.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
//#include <utils/StopWatch.h>


/* uint kernel
 *  Q0  D0:  Load slot for R
 *      D1:  Load slot for G
 *  Q1  D2:  Load slot for B
 *      D3:  Load slot for A
 *  Q2  D4:  Matrix
 *      D5:  =
 *  Q3  D6:  =
 *      D7:  =
 *  Q4  D8:  Add R
 *      D9:
 *  Q5  D10: Add G
 *      D11:
 *  Q6  D12: Add B
 *      D13:
 *  Q7  D14: Add A
 *      D15:
 *  Q8  D16: I32: R Sum
 *      D17:
 *  Q9  D18: I32: G Sum
 *      D19:
 *  Q10 D20: I32: B Sum
 *      D21:
 *  Q11 D22: I32: A Sum
 *      D23:
 *  Q12 D24: U16: expanded R
 *      D25:
 *  Q13 D26: U16: expanded G
 *      D27:
 *  Q14 D28: U16: expanded B
 *      D29:
 *  Q15 D30: U16: expanded A
 *      D31:
 *
 */

/* float kernel
 *  Q0  D0:  Load slot for R
 *      D1:  =
 *  Q1  D2:  Load slot for G
 *      D3:  =
 *  Q2  D4:  Load slot for B
 *      D5:  =
 *  Q3  D6:  Load slot for A
 *      D7:  =
 *  Q4  D8:  Matrix
 *      D9:  =
 *  Q5  D10: =
 *      D11: =
 *  Q6  D12: =
 *      D13: =
 *  Q7  D14: =
 *      D15: =
 *  Q8  D16: Add R
 *      D17: =
 *  Q9  D18: Add G
 *      D19: =
 *  Q10 D20: Add B
 *      D21: =
 *  Q11 D22: Add A
 *      D23: =
 *  Q12 D24: Sum R
 *      D25: =
 *  Q13 D26: Sum G
 *      D27: =
 *  Q14 D28: Sum B
 *      D29: =
 *  Q15 D30: Sum A
 *      D31: =
 *
 */



using namespace android;
using namespace android::renderscript;

namespace android {
namespace renderscript {

typedef union {
    uint64_t key;
    struct {
        uint32_t inVecSize  :2;  // [0 - 1]
        uint32_t outVecSize :2;  // [2 - 3]
        uint32_t inType     :4;  // [4 - 7]
        uint32_t outType    :4;  // [8 - 11]
        uint32_t dot        :1;  // [12]
        uint32_t _unused1   :1;  // [13]
        uint32_t copyAlpha  :1;  // [14]
        uint32_t _unused2   :1;  // [15]
        uint32_t coeffMask  :16; // [16-31]
        uint32_t addMask    :4;  // [32-35]
    } u;
} Key_t;
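// For illustration: a matrix whose only non-zero coefficients are fp[0], fp[5],
// fp[10] and fp[15] (the diagonal) produces coeffMask == 0x8421, and an
// all-zero add vector leaves addMask == 0.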

//Re-enable when intrinsic is fixed
#if defined(ARCH_ARM64_USE_INTRINSICS)
typedef struct {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;

extern "C" void rsdIntrinsicColorMatrix_int_K(
        void *out, void const *in, size_t count,
        FunctionTab_t const *fns,
        int16_t const *mult, int32_t const *add);

extern "C" void rsdIntrinsicColorMatrix_float_K(
        void *out, void const *in, size_t count,
        FunctionTab_t const *fns,
        float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions. While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 */
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
        FunctionTab_t *fns,
        uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
        FunctionTab_t *fns,
        uint32_t mask, int dt, int st);
#endif

class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
public:
    void populateScript(Script *) override;

    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;

    ~RsdCpuScriptIntrinsicColorMatrix() override;
    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

    void preLaunch(uint32_t slot, const Allocation ** ains,
                   uint32_t inLen, Allocation * aout, const void * usr,
                   uint32_t usrLen, const RsScriptCall *sc) override;

protected:
    float fp[16];
    float fpa[4];

    // The following four fields are read as constants
    // by the SIMD assembly code.
    short ip[16];
    int ipa[4];
    float tmpFp[16];
    float tmpFpa[4];
#if defined(ARCH_ARM64_USE_INTRINSICS)
    FunctionTab_t mFnTab;
#endif

    static void kernel(const RsExpandKernelDriverInfo *info,
                       uint32_t xstart, uint32_t xend,
                       uint32_t outstep);
    void updateCoeffCache(float fpMul, float addMul);

    Key_t mLastKey;
    unsigned char *mBuf;
    size_t mBufSize;

    Key_t computeKey(const Element *ein, const Element *eout);

    bool build(Key_t key);

    void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);

};

} // namespace renderscript
} // namespace android


Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
        const Element *ein, const Element *eout) {

    Key_t key;
    key.key = 0;

    // Compute a unique code key for this operation

    // Add to the key the input and output types
    bool hasFloat = false;
    if (ein->getType() == RS_TYPE_FLOAT_32) {
        hasFloat = true;
        key.u.inType = RS_TYPE_FLOAT_32;
        rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
    }
    if (eout->getType() == RS_TYPE_FLOAT_32) {
        hasFloat = true;
        key.u.outType = RS_TYPE_FLOAT_32;
        rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
    }

    // Mask in the bits indicating which coefficients in the
    // color matrix are needed.
    if (hasFloat) {
        for (uint32_t i=0; i < 16; i++) {
            if (fabs(fp[i]) != 0.f) {
                key.u.coeffMask |= 1 << i;
            }
        }
        if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
        if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
        if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
        if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;

    } else {
        for (uint32_t i=0; i < 16; i++) {
            if (ip[i] != 0) {
                key.u.coeffMask |= 1 << i;
            }
        }
        if (ipa[0] != 0) key.u.addMask |= 0x1;
        if (ipa[1] != 0) key.u.addMask |= 0x2;
        if (ipa[2] != 0) key.u.addMask |= 0x4;
        if (ipa[3] != 0) key.u.addMask |= 0x8;
    }

    // Look for a dot product where the r,g,b columns are the same
    if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
        (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
        (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
        (ip[12] == ip[13]) && (ip[12] == ip[14])) {

        if (!key.u.addMask) key.u.dot = 1;
    }

    // Is alpha a simple copy
    if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
        key.u.copyAlpha = !(key.u.inType || key.u.outType);
    }
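    // copyAlpha therefore means a U8-to-U8 kernel where the output alpha is just
    // the input alpha: coefficient 15 is exactly 1.0 in 8.8 fixed point (256),
    // there is no r/g/b contribution to alpha (mask 0x0888) and no alpha add.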

    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);

    switch (ein->getVectorSize()) {
    case 4:
        key.u.inVecSize = 3;
        break;
    case 3:
        key.u.inVecSize = 2;
        key.u.coeffMask &= ~0xF000;
        break;
    case 2:
        key.u.inVecSize = 1;
        key.u.coeffMask &= ~0xFF00;
        break;
    default:
        key.u.coeffMask &= ~0xFFF0;
        break;
    }

    switch (eout->getVectorSize()) {
    case 4:
        key.u.outVecSize = 3;
        break;
    case 3:
        key.u.outVecSize = 2;
        key.u.coeffMask &= ~0x8888;
        key.u.addMask &= 7;
        break;
    case 2:
        key.u.outVecSize = 1;
        key.u.coeffMask &= ~0xCCCC;
        key.u.addMask &= 3;
        break;
    default:
        key.u.coeffMask &= ~0xEEEE;
        key.u.addMask &= 1;
        break;
    }
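    // The masking above drops terms for lanes that are never read or written:
    // e.g. a 3-component input has no valid alpha lane, so coefficients 12-15
    // (mask 0xF000) are cleared; a 3-component output never writes alpha, so the
    // alpha row of every column (mask 0x8888) and the alpha add bit are cleared.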

    if (key.u.inType && !key.u.outType) {
        key.u.addMask |= 1;
        if (key.u.outVecSize > 0) key.u.addMask |= 2;
        if (key.u.outVecSize > 1) key.u.addMask |= 4;
        if (key.u.outVecSize > 2) key.u.addMask |= 8;
    }

    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
    return key;
}

#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)

#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;         \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;   \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)

#define ADD_CHUNK(x) \
    memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
    buf += _N_ColorMatrix_##x##_len


static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
    size_t off = (target - buf - 8) >> 2;
    rsAssert(((off & 0xff000000) == 0) ||
             ((off & 0xff000000) == 0xff000000));

    uint32_t op = (condition << 28);
    op |= 0xa << 24; // branch
    op |= 0xffffff & off;
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}
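// For illustration: addBranch() with condition 0x1 (NE) and a word offset of
// -16 emits 0x1AFFFFF0 - the condition in bits 28-31, the ARM branch opcode
// (0xA) in bits 24-27, and the signed 24-bit word offset in bits 0-23.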

static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
    rsAssert(vd < 32);
    rsAssert(vm < 32);
    rsAssert(vn < 32);

    uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
    op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
    op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
    return op;
}
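// The field split above mirrors the NEON encoding: the low four bits of each
// register number go in the Vd/Vn/Vm fields and the fifth bit goes in the
// D/N/M bits, e.g. encodeSIMDRegs(17, 2, 3) sets Vd = 1 with bit 22 (D) set.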

static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    //vmlal.s16 Q#1, D#1, D#2[#]
    uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}

static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    //vmull.s16 Q#1, D#1, D#2[#]
    uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}

static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    //vqadd.s32 Q#1, Q#1, Q#2
    uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}

static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    //vmlal.f32 Q#1, D#1, D#2[#]
    uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}

static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
    //vmull.f32 Q#1, D#1, D#2[#]
    uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}

static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    //vorr Q#1, Q#1, Q#2
    uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}

static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
    //vmov.32 Q#1, #imm
    rsAssert(imm == 0);
    uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}

static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
    //vadd.f32 Q#1, Q#1, Q#2
    uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
    ((uint32_t *)buf)[0] = op;
    return buf + 4;
}
#endif

#if defined(ARCH_X86_HAVE_SSSE3)
extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                         const short *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                         const short *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                         const short *coef, uint32_t count);

void * selectKernel(Key_t key)
{
    void * kernel = nullptr;

    // inType, outType float if nonzero
    if (!(key.u.inType || key.u.outType)) {
        if (key.u.dot)
            kernel = (void *)rsdIntrinsicColorMatrixDot_K;
        else if (key.u.copyAlpha)
            kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
        else
            kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
    }

    return kernel;
}
#endif

bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    mBufSize = 4096;
    //StopWatch build_time("rs cm: build time");
    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANON, -1, 0);
    if (mBuf == MAP_FAILED) {
        mBuf = nullptr;
        return false;
    }

    uint8_t *buf = mBuf;
    uint8_t *buf2 = nullptr;

    int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
    int opInit[4] = {0, 0, 0, 0};

    memset(ops, 0, sizeof(ops));
    for (int i=0; i < 4; i++) {
        if (key.u.coeffMask & (1 << (i*4))) {
            ops[i][0] = 0x2 | opInit[0];
            opInit[0] = 1;
        }
        if (!key.u.dot) {
            if (key.u.coeffMask & (1 << (1 + i*4))) {
                ops[i][1] = 0x2 | opInit[1];
                opInit[1] = 1;
            }
            if (key.u.coeffMask & (1 << (2 + i*4))) {
                ops[i][2] = 0x2 | opInit[2];
                opInit[2] = 1;
            }
        }
        if (!key.u.copyAlpha) {
            if (key.u.coeffMask & (1 << (3 + i*4))) {
                ops[i][3] = 0x2 | opInit[3];
                opInit[3] = 1;
            }
        }
    }
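    // At this point ops[i][j] is 0 (term unused), 2 (first term for output j,
    // which becomes a multiply) or 3 (a later term, which becomes a
    // multiply-accumulate); opInit[j] records whether output j is written at all.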

    if (key.u.inType || key.u.outType) {
        key.u.copyAlpha = 0;
        ADD_CHUNK(prefix_f);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        if (key.u.inType) {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_f32_4);
                break;
            case 2:
                ADD_CHUNK(load_f32_3);
                break;
            case 1:
                ADD_CHUNK(load_f32_2);
                break;
            case 0:
                ADD_CHUNK(load_f32_1);
                break;
            }
        } else {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_u8f_4);
                break;
            case 2:
                ADD_CHUNK(load_u8f_3);
                break;
            case 1:
                ADD_CHUNK(load_u8f_2);
                break;
            case 0:
                ADD_CHUNK(load_u8f_1);
                break;
            }
        }

        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                case 3:
                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                }
            }
        }
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVADD_F32(buf, j, 12+j, 8+j);
                } else {
                    buf = addVORR_32(buf, j, 12+j, 12+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, j, 8+j, 8+j);
                } else {
                    buf = addVMOV_32(buf, j, 0);
                }
            }
        }

        if (key.u.outType) {
            switch(key.u.outVecSize) {
            case 3:
                ADD_CHUNK(store_f32_4);
                break;
            case 2:
                ADD_CHUNK(store_f32_3);
                break;
            case 1:
                ADD_CHUNK(store_f32_2);
                break;
            case 0:
                ADD_CHUNK(store_f32_1);
                break;
            }
        } else {
            switch(key.u.outVecSize) {
            case 3:
            case 2:
                ADD_CHUNK(store_f32u_4);
                break;
            case 1:
                ADD_CHUNK(store_f32u_2);
                break;
            case 0:
                ADD_CHUNK(store_f32u_1);
                break;
            }
        }


    } else {
        // Add the function prefix
        // Store the address for the loop return
        ADD_CHUNK(prefix_i);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        switch(key.u.inVecSize) {
        case 3:
            ADD_CHUNK(load_u8_4);
            if (key.u.copyAlpha) {
                ADD_CHUNK(unpack_u8_3);
            } else {
                ADD_CHUNK(unpack_u8_4);
            }
            break;
        case 2:
            ADD_CHUNK(load_u8_3);
            ADD_CHUNK(unpack_u8_3);
            break;
        case 1:
            ADD_CHUNK(load_u8_2);
            ADD_CHUNK(unpack_u8_2);
            break;
        case 0:
            ADD_CHUNK(load_u8_1);
            ADD_CHUNK(unpack_u8_1);
            break;
        }

        // Add multiply and accumulate
        // use MULL to init the output register,
        // use MLAL from there
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                case 3:
                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                }
            }
        }
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
                }
            }
        }

        // If we have a dot product, perform the special pack.
        if (key.u.dot) {
            ADD_CHUNK(pack_u8_1);
            ADD_CHUNK(dot);
        } else {
            switch(key.u.outVecSize) {
            case 3:
                if (key.u.copyAlpha) {
                    ADD_CHUNK(pack_u8_3);
                } else {
                    ADD_CHUNK(pack_u8_4);
                }
                break;
            case 2:
                ADD_CHUNK(pack_u8_3);
                break;
            case 1:
                ADD_CHUNK(pack_u8_2);
                break;
            case 0:
                ADD_CHUNK(pack_u8_1);
                break;
            }
        }

        // Write out result
        switch(key.u.outVecSize) {
        case 3:
        case 2:
            ADD_CHUNK(store_u8_4);
            break;
        case 1:
            ADD_CHUNK(store_u8_2);
            break;
        case 0:
            ADD_CHUNK(store_u8_1);
            break;
        }
    }

    if (key.u.inType != key.u.outType) {
        key.u.copyAlpha = 0;
        key.u.dot = 0;
    }

    // Loop, branch, and cleanup
    ADD_CHUNK(postfix1);
    buf = addBranch(buf, buf2, 0x01);
    ADD_CHUNK(postfix2);

    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    if (ret == -1) {
        ALOGE("mprotect error %i", ret);
        return false;
    }

    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
    return true;
#else
    return false;
#endif
}

void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
    for(int ct=0; ct < 16; ct++) {
        ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
        tmpFp[ct] = fp[ct] * fpMul;
        //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]);
    }

    float add = 0.f;
    if (fpMul > 254.f) add = 0.5f;
    for(int ct=0; ct < 4; ct++) {
        tmpFpa[ct] = fpa[ct] * addMul + add;
        //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
    }

    for(int ct=0; ct < 4; ct++) {
        ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
    }
}
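// For reference: ip[] holds the matrix in 8.8 fixed point (1.0f -> 256) and
// ipa[] holds the add vector in 16.16 fixed point (1.0f -> 65536); these are
// the mult/add constants read by the generated NEON code and the assembly
// kernels, while tmpFp/tmpFpa are the scaled float copies used by the generic
// One() path.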

void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
                                                    size_t dataLength) {
    switch(slot) {
    case 0:
        memcpy (fp, data, sizeof(fp));
        break;
    case 1:
        memcpy (fpa, data, sizeof(fpa));
        break;
    default:
        rsAssert(0);
        break;
    }
    mRootPtr = &kernel;
}


static void One(const RsExpandKernelDriverInfo *info, void *out,
                const void *py, const float* coeff, const float *add,
                uint32_t vsin, uint32_t vsout, bool fin, bool fout) {

    float4 f = 0.f;
    if (fin) {
        switch(vsin) {
        case 3:
            f = ((const float4 *)py)[0];
            break;
        case 2:
            f = ((const float4 *)py)[0];
            f.w = 0.f;
            break;
        case 1:
            f.xy = ((const float2 *)py)[0];
            break;
        case 0:
            f.x = ((const float *)py)[0];
            break;
        }
    } else {
        switch(vsin) {
        case 3:
            f = convert_float4(((const uchar4 *)py)[0]);
            break;
        case 2:
            f = convert_float4(((const uchar4 *)py)[0]);
            f.w = 0.f;
            break;
        case 1:
            f.xy = convert_float2(((const uchar2 *)py)[0]);
            break;
        case 0:
            f.x = (float)(((const uchar *)py)[0]);
            break;
        }
    }
    //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w);

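    // Apply the 4x4 color matrix; coeff is stored column-major, so output
    // channel n is the dot product of f with coeff[n], coeff[n+4], coeff[n+8]
    // and coeff[n+12].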
    float4 sum;
    sum.x = f.x * coeff[0] +
            f.y * coeff[4] +
            f.z * coeff[8] +
            f.w * coeff[12];
    sum.y = f.x * coeff[1] +
            f.y * coeff[5] +
            f.z * coeff[9] +
            f.w * coeff[13];
    sum.z = f.x * coeff[2] +
            f.y * coeff[6] +
            f.z * coeff[10] +
            f.w * coeff[14];
    sum.w = f.x * coeff[3] +
            f.y * coeff[7] +
            f.z * coeff[11] +
            f.w * coeff[15];
    //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w);

    sum.x += add[0];
    sum.y += add[1];
    sum.z += add[2];
    sum.w += add[3];


    //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
    if (fout) {
        switch(vsout) {
        case 3:
        case 2:
            ((float4 *)out)[0] = sum;
            break;
        case 1:
            ((float2 *)out)[0] = sum.xy;
            break;
        case 0:
            ((float *)out)[0] = sum.x;
            break;
        }
    } else {
        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);

        switch(vsout) {
        case 3:
        case 2:
            ((uchar4 *)out)[0] = convert_uchar4(sum);
            break;
        case 1:
            ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
            break;
        case 0:
            ((uchar *)out)[0] = sum.x;
            break;
        }
    }
    //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
}

void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
                                              uint32_t xstart, uint32_t xend,
                                              uint32_t outstep) {
    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;

    uint32_t instep = info->inStride[0];

    uchar *out = (uchar *)info->outPtr[0];
    uchar *in = (uchar *)info->inPtr[0];
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

    uint32_t vsin = cp->mLastKey.u.inVecSize;
    uint32_t vsout = cp->mLastKey.u.outVecSize;
    bool floatIn = !!cp->mLastKey.u.inType;
    bool floatOut = !!cp->mLastKey.u.outType;

    //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout);

    if(x2 > x1) {
        int32_t len = x2 - x1;
        if (gArchUseSIMD) {
            if((cp->mOptKernel != nullptr) && (len >= 4)) {
                // The optimized kernel processes 4 pixels at once
                // and requires a minimum of 1 chunk of 4
                cp->mOptKernel(out, in, cp->ip, len >> 2);
                // Update the len and pointers so the generic code can
                // finish any leftover pixels
                len &= ~3;
                x1 += len;
                out += outstep * len;
                in += instep * len;
            }
#if defined(ARCH_ARM64_USE_INTRINSICS)
            else {
                if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 ||
                    cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
                    // Currently this generates off by one errors.
                    //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
                    //x1 += len;
                    //out += outstep * len;
                    //in += instep * len;
                } else {
                    rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
                    x1 += len;
                    out += outstep * len;
                    in += instep * len;
                }
            }
#endif
        }

        while(x1 != x2) {
            One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
            out += outstep;
            in += instep;
            x1++;
        }
    }
}

void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
                                                 const Allocation ** ains,
                                                 uint32_t inLen,
                                                 Allocation * aout,
                                                 const void * usr,
                                                 uint32_t usrLen,
                                                 const RsScriptCall *sc) {

    const Element *ein = ains[0]->mHal.state.type->getElement();
    const Element *eout = aout->mHal.state.type->getElement();

    if (ein->getType() == eout->getType()) {
        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
            updateCoeffCache(1.f, 255.f);
        } else {
            updateCoeffCache(1.f, 1.f);
        }
    } else {
        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
            updateCoeffCache(255.f, 255.f);
        } else {
            updateCoeffCache(1.f / 255.f, 1.f);
        }
    }

    Key_t key = computeKey(ein, eout);

#if defined(ARCH_X86_HAVE_SSSE3)
    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
        // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
        // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
        mLastKey = key;
    }

#else //if !defined(ARCH_X86_HAVE_SSSE3)
    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
        if (mBuf) munmap(mBuf, mBufSize);
        mBuf = nullptr;
        mOptKernel = nullptr;
        if (build(key)) {
            mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
        }
#if defined(ARCH_ARM64_USE_INTRINSICS)
        else {
            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
            uint32_t mm = 0;
            int i;
            for (i = 0; i < 4; i++)
            {
                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
                m = ((m * 0x249) >> 9) & 15;
                m |= ((key.u.addMask >> i) & 1) << 4;
                mm |= m << (i * 5);
            }

            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
            } else {
                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
            }
        }
#endif
        mLastKey = key;
    }
#endif //if !defined(ARCH_X86_HAVE_SSSE3)
}

RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
        RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
        : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {

    mLastKey.key = 0;
    mBuf = nullptr;
    mBufSize = 0;
    mOptKernel = nullptr;
    const static float defaultMatrix[] = {
        1.f, 0.f, 0.f, 0.f,
        0.f, 1.f, 0.f, 0.f,
        0.f, 0.f, 1.f, 0.f,
        0.f, 0.f, 0.f, 1.f
    };
    const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
    setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
    setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
}

RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
    if (mBuf) munmap(mBuf, mBufSize);
    mBuf = nullptr;
    mOptKernel = nullptr;
}

void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
    s->mHal.info.exportedVariableCount = 2;
}

RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e) {

    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
}