/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "rsCpuCore.h"
#include "rsCpuScript.h"
#include "rsCpuScriptGroup.h"
#include "rsCpuScriptGroup2.h"

#include <malloc.h>
#include "rsContext.h"

#include <sys/types.h>
#include <sys/resource.h>
#include <sched.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
#include <cutils/properties.h>
#include "utils/StopWatch.h"
#endif

#ifdef RS_SERVER
// Android exposes gettid(); standard Linux does not.
static pid_t gettid() {
    return syscall(SYS_gettid);
}
#endif

using namespace android;
using namespace android::renderscript;

#define REDUCE_ALOGV(mtls, level, ...) do { if ((mtls)->logReduce >= (level)) ALOGV(__VA_ARGS__); } while(0)

static pthread_key_t gThreadTLSKey = 0;
static uint32_t gThreadTLSKeyCount = 0;
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;

bool android::renderscript::gArchUseSIMD = false;

RsdCpuReference::~RsdCpuReference() {
}

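// Create and initialize the CPU driver reference for a context. Returns
// nullptr if initialization (including worker thread startup) fails.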
RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
        uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn,
        RSSelectRTCallback pSelectRTCallback,
        const char *pBccPluginName) {

    RsdCpuReferenceImpl *cpu = new RsdCpuReferenceImpl(rsc);
    if (!cpu) {
        return nullptr;
    }
    if (!cpu->init(version_major, version_minor, lfn, slfn)) {
        delete cpu;
        return nullptr;
    }

    cpu->setSelectRTCallback(pSelectRTCallback);
    if (pBccPluginName) {
        cpu->setBccPluginName(pBccPluginName);
    }

    return cpu;
}


Context * RsdCpuReference::getTlsContext() {
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    return tls->mContext;
}

const Script * RsdCpuReference::getTlsScript() {
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    return tls->mScript;
}

pthread_key_t RsdCpuReference::getThreadTLSKey() { return gThreadTLSKey; }

////////////////////////////////////////////////////////////
///

RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
    mRSC = rsc;

    version_major = 0;
    version_minor = 0;
    mInKernel = false;
    memset(&mWorkers, 0, sizeof(mWorkers));
    memset(&mTlsStruct, 0, sizeof(mTlsStruct));
    mExit = false;
    mSelectRTCallback = nullptr;
    mEmbedGlobalInfo = true;
    mEmbedGlobalInfoSkipConstant = true;
}


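// Entry point for each helper (worker) thread. Each thread claims a unique
// index, publishes its native thread id, and binds the shared TLS struct,
// then loops: wait for a launch signal, run the launch callback, decrement
// the running count, and signal completion, until mExit is set.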
void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
    RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;

    uint32_t idx = __sync_fetch_and_add(&dc->mWorkers.mLaunchCount, 1);

    //ALOGV("RS helperThread starting %p idx=%i", dc, idx);

    dc->mWorkers.mLaunchSignals[idx].init();
    dc->mWorkers.mNativeThreadId[idx] = gettid();

    memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
    int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
    if (status) {
        ALOGE("pthread_setspecific %i", status);
    }

#if 0
    typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
    cpu_set_t cpuset;
    memset(&cpuset, 0, sizeof(cpuset));
    cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
    int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
                      sizeof(cpuset), &cpuset);
    ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
#endif

    while (!dc->mExit) {
        dc->mWorkers.mLaunchSignals[idx].wait();
        if (dc->mWorkers.mLaunchCallback) {
            // idx + 1 is used because the calling thread is always worker 0.
            dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx + 1);
        }
        __sync_fetch_and_sub(&dc->mWorkers.mRunningCount, 1);
        dc->mWorkers.mCompleteSignal.set();
    }

    //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
    return nullptr;
}

// Launch a kernel.
// The callback function is called to execute the kernel.
void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
    mWorkers.mLaunchData = data;
    mWorkers.mLaunchCallback = cbk;

    // Fast path for very small launches: run everything on the calling thread.
    MTLaunchStructCommon *mtls = (MTLaunchStructCommon *)data;
    if (mtls && mtls->dimPtr->y <= 1 && mtls->end.x <= mtls->start.x + mtls->mSliceSize) {
        if (mWorkers.mLaunchCallback) {
            mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
        }
        return;
    }

    mWorkers.mRunningCount = mWorkers.mCount;
    __sync_synchronize();

    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }

    // We use the calling thread as one of the workers so we can start without
    // the delay of the thread wakeup.
    if (mWorkers.mLaunchCallback) {
        mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
    }

    while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
        mWorkers.mCompleteSignal.wait();
    }
}


void RsdCpuReferenceImpl::lockMutex() {
    pthread_mutex_lock(&gInitMutex);
}

void RsdCpuReferenceImpl::unlockMutex() {
    pthread_mutex_unlock(&gInitMutex);
}

// Determine if the CPU we're running on supports SIMD instructions.
static void GetCpuInfo() {
    // Read the CPU flags from /proc/cpuinfo.
    FILE *cpuinfo = fopen("/proc/cpuinfo", "r");

    if (!cpuinfo) {
        return;
    }

    char cpuinfostr[4096];
    // fgets() stops at a newline or EOF, so scan the whole "cpuinfo" file
    // line by line to decide whether SIMD can be used.
    while (fgets(cpuinfostr, sizeof(cpuinfostr), cpuinfo)) {
#if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
        gArchUseSIMD = strstr(cpuinfostr, " neon") || strstr(cpuinfostr, " asimd");
#elif defined(ARCH_X86_HAVE_SSSE3)
        gArchUseSIMD = strstr(cpuinfostr, " ssse3");
#endif
        if (gArchUseSIMD) {
            break;
        }
    }
    fclose(cpuinfo);
}

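// Initialize the driver: create the process-wide TLS key (on first use),
// bind this thread's TLS, detect SIMD support, and spawn one worker thread
// per remaining CPU core (the calling thread doubles as worker 0).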
bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
                               sym_lookup_t lfn, script_lookup_t slfn) {
    mSymLookupFn = lfn;
    mScriptLookupFn = slfn;

    lockMutex();
    if (!gThreadTLSKeyCount) {
        int status = pthread_key_create(&gThreadTLSKey, nullptr);
        if (status) {
            ALOGE("Failed to init thread TLS key.");
            unlockMutex();
            return false;
        }
    }
    gThreadTLSKeyCount++;
    unlockMutex();

    mTlsStruct.mContext = mRSC;
    mTlsStruct.mScript = nullptr;
    int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
    if (status) {
        ALOGE("pthread_setspecific %i", status);
    }

    mPageSize = sysconf(_SC_PAGE_SIZE);
    // ALOGV("page size = %ld", mPageSize);

    GetCpuInfo();

    int cpu = sysconf(_SC_NPROCESSORS_CONF);
    if (mRSC->props.mDebugMaxThreads) {
        cpu = mRSC->props.mDebugMaxThreads;
    }
    if (cpu < 2) {
        mWorkers.mCount = 0;
        return true;
    }

    // Subtract one from the cpu count because we also use the command thread as a worker.
    mWorkers.mCount = (uint32_t)(cpu - 1);

    if (mRSC->props.mLogScripts) {
        ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount + 1);
    }

    mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
    mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
    mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
    mWorkers.mLaunchCallback = nullptr;

    mWorkers.mCompleteSignal.init();

    mWorkers.mRunningCount = mWorkers.mCount;
    mWorkers.mLaunchCount = 0;
    __sync_synchronize();

    pthread_attr_t threadAttr;
    status = pthread_attr_init(&threadAttr);
    if (status) {
        ALOGE("Failed to init thread attribute.");
        return false;
    }

    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
        if (status) {
            mWorkers.mCount = ct;
            ALOGE("Created fewer than expected number of RS threads.");
            break;
        }
    }
    while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
        usleep(100);
    }

    pthread_attr_destroy(&threadAttr);
    return true;
}


void RsdCpuReferenceImpl::setPriority(int32_t priority) {
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], priority);
    }
}

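// Tear down: flag mExit, wake every worker so it can observe the flag and
// return, join the threads, free per-worker state, and release the
// process-wide TLS key when the last reference goes away.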
RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
    mExit = true;
    mWorkers.mLaunchData = nullptr;
    mWorkers.mLaunchCallback = nullptr;
    mWorkers.mRunningCount = mWorkers.mCount;
    __sync_synchronize();
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }
    void *res;
    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        pthread_join(mWorkers.mThreadId[ct], &res);
    }
    rsAssert(__sync_fetch_and_or(&mWorkers.mRunningCount, 0) == 0);
    free(mWorkers.mThreadId);
    free(mWorkers.mNativeThreadId);
    delete[] mWorkers.mLaunchSignals;

    // Global structure cleanup.
    lockMutex();
    --gThreadTLSKeyCount;
    if (!gThreadTLSKeyCount) {
        pthread_key_delete(gThreadTLSKey);
    }
    unlockMutex();
}

// Set up the appropriate input and output pointers to the kernel driver info structure.
// Inputs:
//   mtls - The MTLaunchStruct holding information about the kernel launch
//   fep - The forEach parameters (driver info structure)
//   x, y, z, lod, face, a1, a2, a3, a4 - The start offsets into each dimension
static inline void FepPtrSetup(const MTLaunchStructForEach *mtls, RsExpandKernelDriverInfo *fep,
                               uint32_t x, uint32_t y,
                               uint32_t z = 0, uint32_t lod = 0,
                               RsAllocationCubemapFace face = RS_ALLOCATION_CUBEMAP_FACE_POSITIVE_X,
                               uint32_t a1 = 0, uint32_t a2 = 0, uint32_t a3 = 0, uint32_t a4 = 0) {
    for (uint32_t i = 0; i < fep->inLen; i++) {
        fep->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
    }
    if (mtls->aout[0] != nullptr) {
        fep->outPtr[0] = (uint8_t *)mtls->aout[0]->getPointerUnchecked(x, y, z, lod, face, a1, a2, a3, a4);
    }
}

// Set up the appropriate input and output pointers to the kernel driver info structure.
// Inputs:
//   mtls - The MTLaunchStruct holding information about the kernel launch
//   redp - The reduce parameters (driver info structure)
//   x, y, z - The start offsets into each dimension
static inline void RedpPtrSetup(const MTLaunchStructReduce *mtls, RsExpandKernelDriverInfo *redp,
                                uint32_t x, uint32_t y, uint32_t z) {
    for (uint32_t i = 0; i < redp->inLen; i++) {
        redp->inPtr[i] = (const uint8_t *)mtls->ains[i]->getPointerUnchecked(x, y, z);
    }
}

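// Decompose a flat slice index into one dimension's coordinate: writes the
// coordinate for [start, end) to *p and returns the quotient left over for
// the next outer dimension.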
static uint32_t sliceInt(uint32_t *p, uint32_t val, uint32_t start, uint32_t end) {
    if (start >= end) {
        *p = start;
        return val;
    }

    uint32_t div = end - start;

    uint32_t n = val / div;
    *p = (val - (n * div)) + start;
    return n;
}

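// Map a flat slice number onto the outer dimensions (z, lod, face, and the
// four array dimensions). Returns true if sliceNum names a valid slice,
// false once all outer slices have been consumed.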
static bool SelectOuterSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo *info, uint32_t sliceNum) {
    uint32_t r = sliceNum;
    r = sliceInt(&info->current.z, r, mtls->start.z, mtls->end.z);
    r = sliceInt(&info->current.lod, r, mtls->start.lod, mtls->end.lod);
    r = sliceInt(&info->current.face, r, mtls->start.face, mtls->end.face);
    r = sliceInt(&info->current.array[0], r, mtls->start.array[0], mtls->end.array[0]);
    r = sliceInt(&info->current.array[1], r, mtls->start.array[1], mtls->end.array[1]);
    r = sliceInt(&info->current.array[2], r, mtls->start.array[2], mtls->end.array[2]);
    r = sliceInt(&info->current.array[3], r, mtls->start.array[3], mtls->end.array[3]);
    return r == 0;
}

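// Like SelectOuterSlice, but for kernels whose only outer dimension is z.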
static bool SelectZSlice(const MTLaunchStructCommon *mtls, RsExpandKernelDriverInfo *info, uint32_t sliceNum) {
    return sliceInt(&info->current.z, sliceNum, mtls->start.z, mtls->end.z) == 0;
}

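// Worker callback for launches with outer dimensions: threads atomically
// claim whole outer slices and walk each row (y) within the claimed slice.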
static void walk_general_foreach(void *usr, uint32_t idx) {
    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
    RsExpandKernelDriverInfo fep = mtls->fep;
    fep.lid = idx;
    ForEachFunc_t fn = mtls->kernel;

    while (1) {
        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);

        if (!SelectOuterSlice(mtls, &fep, slice)) {
            return;
        }

        for (fep.current.y = mtls->start.y; fep.current.y < mtls->end.y;
             fep.current.y++) {

            FepPtrSetup(mtls, &fep, mtls->start.x,
                        fep.current.y, fep.current.z, fep.current.lod,
                        (RsAllocationCubemapFace)fep.current.face,
                        fep.current.array[0], fep.current.array[1],
                        fep.current.array[2], fep.current.array[3]);

            fn(&fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
        }
    }
}

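// Worker callback for 2D launches: threads atomically claim bands of
// mSliceSize rows and invoke the kernel once per row.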
static void walk_2d_foreach(void *usr, uint32_t idx) {
    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
    RsExpandKernelDriverInfo fep = mtls->fep;
    fep.lid = idx;
    ForEachFunc_t fn = mtls->kernel;

    while (1) {
        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
        uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
        uint32_t yEnd = yStart + mtls->mSliceSize;

        yEnd = rsMin(yEnd, mtls->end.y);

        if (yEnd <= yStart) {
            return;
        }

        for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
            FepPtrSetup(mtls, &fep, mtls->start.x, fep.current.y);

            fn(&fep, mtls->start.x, mtls->end.x, fep.outStride[0]);
        }
    }
}

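// Worker callback for 1D launches: threads atomically claim spans of
// mSliceSize cells along x and invoke the kernel once per span.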
static void walk_1d_foreach(void *usr, uint32_t idx) {
    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
    RsExpandKernelDriverInfo fep = mtls->fep;
    fep.lid = idx;
    ForEachFunc_t fn = mtls->kernel;

    while (1) {
        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
        uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
        uint32_t xEnd = xStart + mtls->mSliceSize;

        xEnd = rsMin(xEnd, mtls->end.x);

        if (xEnd <= xStart) {
            return;
        }

        FepPtrSetup(mtls, &fep, xStart, 0);

        fn(&fep, xStart, xEnd, fep.outStride[0]);
    }
}

// The function format_bytes() is an auxiliary function to assist in logging.
//
// Bytes are read from an input (inBuf) and written (as pairs of hex digits)
// to an output (outBuf).
//
// Output format:
// - starts with ": "
// - each input byte is translated to a pair of hex digits
// - bytes are separated by "." except that every fourth separator is "|"
// - if the input is sufficiently long, the output is truncated and terminated with "..."
//
// Arguments:
// - outBuf -- Pointer to buffer of type "FormatBuf" into which output is written
// - inBuf -- Pointer to bytes which are to be formatted into outBuf
// - inBytes -- Number of bytes in inBuf
//
// Constant:
// - kFormatInBytesMax -- Only min(kFormatInBytesMax, inBytes) bytes will be read
//   from inBuf
//
// Return value:
// - pointer (const char *) to output (which is part of outBuf)
//
static const int kFormatInBytesMax = 16;
// ": " + 2 digits per byte + 1 separator between bytes + "..." + null
typedef char FormatBuf[2 + kFormatInBytesMax*2 + (kFormatInBytesMax - 1) + 3 + 1];
static const char *format_bytes(FormatBuf *outBuf, const uint8_t *inBuf, const int inBytes) {
    strcpy(*outBuf, ": ");
    int pos = 2;
    const int lim = std::min(kFormatInBytesMax, inBytes);
    for (int i = 0; i < lim; ++i) {
        if (i) {
            sprintf(*outBuf + pos, (i % 4 ? "." : "|"));
            ++pos;
        }
        sprintf(*outBuf + pos, "%02x", inBuf[i]);
        pos += 2;
    }
    if (kFormatInBytesMax < inBytes)
        strcpy(*outBuf + pos, "...");
    return *outBuf;
}

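// Assign a reduction accumulator to a worker thread on first use. With an
// outconverter every thread gets scratch memory from accumAlloc; without
// one, the first thread to arrive accumulates directly into the output
// allocation.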
static void reduce_get_accumulator(uint8_t *&accumPtr, const MTLaunchStructReduce *mtls,
                                   const char *walkerName, uint32_t threadIdx) {
    rsAssert(!accumPtr);

    uint32_t accumIdx = (uint32_t)__sync_fetch_and_add(&mtls->accumCount, 1);
    if (mtls->outFunc) {
        accumPtr = mtls->accumAlloc + mtls->accumStride * accumIdx;
    } else {
        if (accumIdx == 0) {
            accumPtr = mtls->redp.outPtr[0];
        } else {
            accumPtr = mtls->accumAlloc + mtls->accumStride * (accumIdx - 1);
        }
    }
    REDUCE_ALOGV(mtls, 2, "%s(%p): idx = %u got accumCount %u and accumPtr %p",
                 walkerName, mtls->accumFunc, threadIdx, accumIdx, accumPtr);
    // initialize accumulator
    if (mtls->initFunc) {
        mtls->initFunc(accumPtr);
    } else {
        memset(accumPtr, 0, mtls->accumSize);
    }
}

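// Worker callback for 1D reductions: claim spans along x and accumulate
// them into this thread's accumulator.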
static void walk_1d_reduce(void *usr, uint32_t idx) {
    const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
    RsExpandKernelDriverInfo redp = mtls->redp;

    // find accumulator
    uint8_t *&accumPtr = mtls->accumPtr[idx];
    if (!accumPtr) {
        reduce_get_accumulator(accumPtr, mtls, __func__, idx);
    }

    // accumulate
    const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
    while (1) {
        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
        uint32_t xStart = mtls->start.x + slice * mtls->mSliceSize;
        uint32_t xEnd = xStart + mtls->mSliceSize;

        xEnd = rsMin(xEnd, mtls->end.x);

        if (xEnd <= xStart) {
            return;
        }

        RedpPtrSetup(mtls, &redp, xStart, 0, 0);
        fn(&redp, xStart, xEnd, accumPtr);

        // Emit log line after slice has been run, so that we can include
        // the results of the run on that line.
        FormatBuf fmt;
        if (mtls->logReduce >= 3) {
            format_bytes(&fmt, accumPtr, mtls->accumSize);
        } else {
            fmt[0] = 0;
        }
        REDUCE_ALOGV(mtls, 2, "walk_1d_reduce(%p): idx = %u, x in [%u, %u)%s",
                     mtls->accumFunc, idx, xStart, xEnd, fmt);
    }
}

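// Worker callback for 2D reductions: claim bands of rows and accumulate
// row by row.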
static void walk_2d_reduce(void *usr, uint32_t idx) {
    const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
    RsExpandKernelDriverInfo redp = mtls->redp;

    // find accumulator
    uint8_t *&accumPtr = mtls->accumPtr[idx];
    if (!accumPtr) {
        reduce_get_accumulator(accumPtr, mtls, __func__, idx);
    }

    // accumulate
    const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
    while (1) {
        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
        uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
        uint32_t yEnd = yStart + mtls->mSliceSize;

        yEnd = rsMin(yEnd, mtls->end.y);

        if (yEnd <= yStart) {
            return;
        }

        for (redp.current.y = yStart; redp.current.y < yEnd; redp.current.y++) {
            RedpPtrSetup(mtls, &redp, mtls->start.x, redp.current.y, 0);
            fn(&redp, mtls->start.x, mtls->end.x, accumPtr);
        }

        FormatBuf fmt;
        if (mtls->logReduce >= 3) {
            format_bytes(&fmt, accumPtr, mtls->accumSize);
        } else {
            fmt[0] = 0;
        }
        REDUCE_ALOGV(mtls, 2, "walk_2d_reduce(%p): idx = %u, y in [%u, %u)%s",
                     mtls->accumFunc, idx, yStart, yEnd, fmt);
    }
}

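// Worker callback for 3D reductions: claim one z slice at a time and
// accumulate row by row within it.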
static void walk_3d_reduce(void *usr, uint32_t idx) {
    const MTLaunchStructReduce *mtls = (const MTLaunchStructReduce *)usr;
    RsExpandKernelDriverInfo redp = mtls->redp;

    // find accumulator
    uint8_t *&accumPtr = mtls->accumPtr[idx];
    if (!accumPtr) {
        reduce_get_accumulator(accumPtr, mtls, __func__, idx);
    }

    // accumulate
    const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
    while (1) {
        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);

        if (!SelectZSlice(mtls, &redp, slice)) {
            return;
        }

        for (redp.current.y = mtls->start.y; redp.current.y < mtls->end.y; redp.current.y++) {
            RedpPtrSetup(mtls, &redp, mtls->start.x, redp.current.y, redp.current.z);
            fn(&redp, mtls->start.x, mtls->end.x, accumPtr);
        }

        FormatBuf fmt;
        if (mtls->logReduce >= 3) {
            format_bytes(&fmt, accumPtr, mtls->accumSize);
        } else {
            fmt[0] = 0;
        }
        REDUCE_ALOGV(mtls, 2, "walk_3d_reduce(%p): idx = %u, z = %u%s",
                     mtls->accumFunc, idx, redp.current.z, fmt);
    }
}

// Launch a general reduce-style kernel.
// Inputs:
//   ains[0..inLen-1]: Array of allocations that contain the inputs
//   aout: The allocation that will hold the output
//   mtls: Holds launch parameters
void RsdCpuReferenceImpl::launchReduce(const Allocation ** ains,
                                       uint32_t inLen,
                                       Allocation * aout,
                                       MTLaunchStructReduce *mtls) {
    mtls->logReduce = mRSC->props.mLogReduce;
    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
        launchReduceParallel(ains, inLen, aout, mtls);
    } else {
        launchReduceSerial(ains, inLen, aout, mtls);
    }
}

// Launch a general reduce-style kernel, single-threaded.
// Inputs:
//   ains[0..inLen-1]: Array of allocations that contain the inputs
//   aout: The allocation that will hold the output
//   mtls: Holds launch parameters
void RsdCpuReferenceImpl::launchReduceSerial(const Allocation ** ains,
                                             uint32_t inLen,
                                             Allocation * aout,
                                             MTLaunchStructReduce *mtls) {
    REDUCE_ALOGV(mtls, 1, "launchReduceSerial(%p): %u x %u x %u", mtls->accumFunc,
                 mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z);

    // In the presence of an outconverter, we allocate temporary memory for
    // the accumulator.
    //
    // In the absence of an outconverter, we use the output allocation as the
    // accumulator.
    uint8_t *const accumPtr = (mtls->outFunc
                               ? static_cast<uint8_t *>(malloc(mtls->accumSize))
                               : mtls->redp.outPtr[0]);

    // initialize
    if (mtls->initFunc) {
        mtls->initFunc(accumPtr);
    } else {
        memset(accumPtr, 0, mtls->accumSize);
    }

    // accumulate
    const ReduceAccumulatorFunc_t fn = mtls->accumFunc;
    uint32_t slice = 0;
    while (SelectOuterSlice(mtls, &mtls->redp, slice++)) {
        for (mtls->redp.current.y = mtls->start.y;
             mtls->redp.current.y < mtls->end.y;
             mtls->redp.current.y++) {
            RedpPtrSetup(mtls, &mtls->redp, mtls->start.x, mtls->redp.current.y, mtls->redp.current.z);
            fn(&mtls->redp, mtls->start.x, mtls->end.x, accumPtr);
        }
    }

    // outconvert
    if (mtls->outFunc) {
        mtls->outFunc(mtls->redp.outPtr[0], accumPtr);
        free(accumPtr);
    }
}

// Launch a general reduce-style kernel, multi-threaded.
// Inputs:
//   ains[0..inLen-1]: Array of allocations that contain the inputs
//   aout: The allocation that will hold the output
//   mtls: Holds launch parameters
void RsdCpuReferenceImpl::launchReduceParallel(const Allocation ** ains,
                                               uint32_t inLen,
                                               Allocation * aout,
                                               MTLaunchStructReduce *mtls) {
    // For now, we don't know how to go parallel in the absence of a combiner.
    if (!mtls->combFunc) {
        launchReduceSerial(ains, inLen, aout, mtls);
        return;
    }

    // Number of threads = "main thread" + number of other (worker) threads
    const uint32_t numThreads = mWorkers.mCount + 1;

    // In the absence of an outconverter, we use the output allocation as
    // an accumulator, and therefore need to allocate one fewer accumulator.
    const uint32_t numAllocAccum = numThreads - (mtls->outFunc == nullptr);

    // If mDebugReduceSplitAccum, then we want each accumulator to start
    // on a page boundary. (TODO: Would some unit smaller than a page
    // be sufficient to avoid false sharing?)
    if (mRSC->props.mDebugReduceSplitAccum) {
        // Round up accumulator size to an integral number of pages.
        mtls->accumStride =
            (unsigned(mtls->accumSize) + unsigned(mPageSize)-1) &
            ~(unsigned(mPageSize)-1);
        // Each accumulator gets its own page. Alternatively, if we just
        // wanted to make sure no two accumulators are on the same page,
        // we could instead do
        //   allocSize = mtls->accumStride * (numAllocAccum - 1) + mtls->accumSize
        const size_t allocSize = mtls->accumStride * numAllocAccum;
        mtls->accumAlloc = static_cast<uint8_t *>(memalign(mPageSize, allocSize));
    } else {
        mtls->accumStride = mtls->accumSize;
        mtls->accumAlloc = static_cast<uint8_t *>(malloc(mtls->accumStride * numAllocAccum));
    }

    const size_t accumPtrArrayBytes = sizeof(uint8_t *) * numThreads;
    mtls->accumPtr = static_cast<uint8_t **>(malloc(accumPtrArrayBytes));
    memset(mtls->accumPtr, 0, accumPtrArrayBytes);

    mtls->accumCount = 0;

    rsAssert(!mInKernel);
    mInKernel = true;
    REDUCE_ALOGV(mtls, 1, "launchReduceParallel(%p): %u x %u x %u, %u threads, accumAlloc = %p",
                 mtls->accumFunc,
                 mtls->redp.dim.x, mtls->redp.dim.y, mtls->redp.dim.z,
                 numThreads, mtls->accumAlloc);
    if (mtls->redp.dim.z > 1) {
        mtls->mSliceSize = 1;
        launchThreads(walk_3d_reduce, mtls);
    } else if (mtls->redp.dim.y > 1) {
        mtls->mSliceSize = rsMax(1U, mtls->redp.dim.y / (numThreads * 4));
        launchThreads(walk_2d_reduce, mtls);
    } else {
        mtls->mSliceSize = rsMax(1U, mtls->redp.dim.x / (numThreads * 4));
        launchThreads(walk_1d_reduce, mtls);
    }
    mInKernel = false;

    // Combine accumulators and identify the final accumulator.
    uint8_t *finalAccumPtr = (mtls->outFunc ? nullptr : mtls->redp.outPtr[0]);
    // Loop over accumulators, combining into finalAccumPtr. If finalAccumPtr
    // is null, then the first accumulator we find becomes finalAccumPtr.
    for (unsigned idx = 0; idx < mtls->accumCount; ++idx) {
        uint8_t *const thisAccumPtr = mtls->accumPtr[idx];
        if (finalAccumPtr) {
            if (finalAccumPtr != thisAccumPtr) {
                if (mtls->combFunc) {
                    if (mtls->logReduce >= 3) {
                        FormatBuf fmt;
                        REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulating into%s",
                                     mtls->accumFunc,
                                     format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
                        REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): accumulator[%d]%s",
                                     mtls->accumFunc, idx,
                                     format_bytes(&fmt, thisAccumPtr, mtls->accumSize));
                    }
                    mtls->combFunc(finalAccumPtr, thisAccumPtr);
                } else {
                    rsAssert(!"expected combiner");
                }
            }
        } else {
            finalAccumPtr = thisAccumPtr;
        }
    }
    rsAssert(finalAccumPtr != nullptr);
    if (mtls->logReduce >= 3) {
        FormatBuf fmt;
        REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final accumulator%s",
                     mtls->accumFunc, format_bytes(&fmt, finalAccumPtr, mtls->accumSize));
    }

    // Outconvert
    if (mtls->outFunc) {
        mtls->outFunc(mtls->redp.outPtr[0], finalAccumPtr);
        if (mtls->logReduce >= 3) {
            FormatBuf fmt;
            REDUCE_ALOGV(mtls, 3, "launchReduceParallel(%p): final outconverted result%s",
                         mtls->accumFunc,
                         format_bytes(&fmt, mtls->redp.outPtr[0], mtls->redp.outStride[0]));
        }
    }

    // Clean up
    free(mtls->accumPtr);
    free(mtls->accumAlloc);
}


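// Launch a foreach-style kernel. Runs multi-threaded when the kernel is
// threadable and we are not already inside a kernel; otherwise falls back
// to a serial walk over the launch bounds on the calling thread.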
void RsdCpuReferenceImpl::launchForEach(const Allocation ** ains,
                                        uint32_t inLen,
                                        Allocation* aout,
                                        const RsScriptCall* sc,
                                        MTLaunchStructForEach* mtls) {

    //android::StopWatch kernel_time("kernel time");

    bool outerDims = (mtls->start.z != mtls->end.z) ||
                     (mtls->start.face != mtls->end.face) ||
                     (mtls->start.lod != mtls->end.lod) ||
                     (mtls->start.array[0] != mtls->end.array[0]) ||
                     (mtls->start.array[1] != mtls->end.array[1]) ||
                     (mtls->start.array[2] != mtls->end.array[2]) ||
                     (mtls->start.array[3] != mtls->end.array[3]);

    if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInKernel) {
        const size_t targetByteChunk = 16 * 1024;
        mInKernel = true;  // NOTE: The guard immediately above ensures this was !mInKernel

        if (outerDims) {
            // No fancy logic for chunk size
            mtls->mSliceSize = 1;
            launchThreads(walk_general_foreach, mtls);
        } else if (mtls->fep.dim.y > 1) {
            uint32_t s1 = mtls->fep.dim.y / ((mWorkers.mCount + 1) * 4);
            uint32_t s2 = 0;

            // This chooses our slice size to rate limit atomic ops to
            // one per 16k bytes of reads/writes.
            if ((mtls->aout[0] != nullptr) && mtls->aout[0]->mHal.drvState.lod[0].stride) {
                s2 = targetByteChunk / mtls->aout[0]->mHal.drvState.lod[0].stride;
            } else if (mtls->ains[0]) {
                s2 = targetByteChunk / mtls->ains[0]->mHal.drvState.lod[0].stride;
            } else {
                // Launch option only case
                // Use s1 based only on the dimensions
                s2 = s1;
            }
            mtls->mSliceSize = rsMin(s1, s2);

            if (mtls->mSliceSize < 1) {
                mtls->mSliceSize = 1;
            }

            launchThreads(walk_2d_foreach, mtls);
        } else {
            uint32_t s1 = mtls->fep.dim.x / ((mWorkers.mCount + 1) * 4);
            uint32_t s2 = 0;

            // This chooses our slice size to rate limit atomic ops to
            // one per 16k bytes of reads/writes.
            if ((mtls->aout[0] != nullptr) && mtls->aout[0]->getType()->getElementSizeBytes()) {
                s2 = targetByteChunk / mtls->aout[0]->getType()->getElementSizeBytes();
            } else if (mtls->ains[0]) {
                s2 = targetByteChunk / mtls->ains[0]->getType()->getElementSizeBytes();
            } else {
                // Launch option only case
                // Use s1 based only on the dimensions
                s2 = s1;
            }
            mtls->mSliceSize = rsMin(s1, s2);

            if (mtls->mSliceSize < 1) {
                mtls->mSliceSize = 1;
            }

            launchThreads(walk_1d_foreach, mtls);
        }
        mInKernel = false;

    } else {
        ForEachFunc_t fn = mtls->kernel;
        uint32_t slice = 0;

        while (SelectOuterSlice(mtls, &mtls->fep, slice++)) {
            for (mtls->fep.current.y = mtls->start.y;
                 mtls->fep.current.y < mtls->end.y;
                 mtls->fep.current.y++) {

                FepPtrSetup(mtls, &mtls->fep, mtls->start.x,
                            mtls->fep.current.y, mtls->fep.current.z, mtls->fep.current.lod,
                            (RsAllocationCubemapFace) mtls->fep.current.face,
                            mtls->fep.current.array[0], mtls->fep.current.array[1],
                            mtls->fep.current.array[2], mtls->fep.current.array[3]);

                fn(&mtls->fep, mtls->start.x, mtls->end.x, mtls->fep.outStride[0]);
            }
        }
    }
}

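// Bind a script implementation to this thread's TLS and return the one it
// replaces, so that callers can restore it afterwards.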
RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
    //ALOGE("setTls %p", sc);
    ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
    rsAssert(tls);
    RsdCpuScriptImpl *old = tls->mImpl;
    tls->mImpl = sc;
    tls->mContext = mRSC;
    if (sc) {
        tls->mScript = sc->getScript();
    } else {
        tls->mScript = nullptr;
    }
    return old;
}

const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
    return mSymLookupFn(mRSC, name);
}


RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
                                    char const *resName, char const *cacheDir,
                                    uint8_t const *bitcode, size_t bitcodeSize,
                                    uint32_t flags) {

    RsdCpuScriptImpl *i = new RsdCpuScriptImpl(this, s);
    if (!i->init(resName, cacheDir, bitcode, bitcodeSize, flags,
                 getBccPluginName())) {
        delete i;
        return nullptr;
    }
    return i;
}

extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
                                             const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
                                           const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
                                                const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
                                             const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx,
                                                 const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx,
                                              const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_BLAS(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e);

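// Construct the CPU implementation for one of the built-in intrinsic scripts.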
RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
                                    RsScriptIntrinsicID iid, Element *e) {

    RsdCpuScriptImpl *i = nullptr;
    switch (iid) {
    case RS_SCRIPT_INTRINSIC_ID_3DLUT:
        i = rsdIntrinsic_3DLUT(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
        i = rsdIntrinsic_Convolve3x3(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
        i = rsdIntrinsic_ColorMatrix(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_LUT:
        i = rsdIntrinsic_LUT(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
        i = rsdIntrinsic_Convolve5x5(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_BLUR:
        i = rsdIntrinsic_Blur(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
        i = rsdIntrinsic_YuvToRGB(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_BLEND:
        i = rsdIntrinsic_Blend(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_HISTOGRAM:
        i = rsdIntrinsic_Histogram(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_RESIZE:
        i = rsdIntrinsic_Resize(this, s, e);
        break;
    case RS_SCRIPT_INTRINSIC_ID_BLAS:
        i = rsdIntrinsic_BLAS(this, s, e);
        break;

    default:
        rsAssert(0);
    }

    return i;
}

void* RsdCpuReferenceImpl::createScriptGroup(const ScriptGroupBase *sg) {
    switch (sg->getApiVersion()) {
    case ScriptGroupBase::SG_V1: {
        CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
        if (!sgi->init()) {
            delete sgi;
            return nullptr;
        }
        return sgi;
    }
    case ScriptGroupBase::SG_V2: {
        return new CpuScriptGroup2Impl(this, sg);
    }
    }
    return nullptr;
}