1 /*
2  * Copyright (C) 2012 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
#include "rsCpuCore.h"
#include "rsCpuScript.h"
#include "rsCpuScriptGroup.h"

#include <malloc.h>
#include "rsContext.h"

#include <sys/types.h>
#include <sys/resource.h>
#include <sched.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
34 
35 #if !defined(RS_SERVER) && !defined(RS_COMPATIBILITY_LIB)
36 #include <cutils/properties.h>
37 #include "utils/StopWatch.h"
38 #endif
39 
40 #ifdef RS_SERVER
41 // Android exposes gettid(), standard Linux does not
gettid()42 static pid_t gettid() {
43     return syscall(SYS_gettid);
44 }
45 #endif
46 
47 using namespace android;
48 using namespace android::renderscript;
49 
// Function-pointer type the launch code in this file casts a kernel to before
// calling it: it receives the per-launch parameter block, the x range bounds
// (x1, x2) and the input/output element strides.
typedef void (*outer_foreach_t)(
    const android::renderscript::RsForEachStubParamStruct *,
    uint32_t x1, uint32_t x2,
    uint32_t instep, uint32_t outstep);


// TLS key shared by every RsdCpuReferenceImpl in the process; created by the
// first successful init() and deleted when the last instance is destroyed.
static pthread_key_t gThreadTLSKey = 0;
// Number of live users of gThreadTLSKey; modified only while gInitMutex is held.
static uint32_t gThreadTLSKeyCount = 0;
// Serializes creation and deletion of gThreadTLSKey.
static pthread_mutex_t gInitMutex = PTHREAD_MUTEX_INITIALIZER;

// Set by GetCpuInfo() when /proc/cpuinfo lists the SIMD feature flag this
// build looks for; stays false otherwise.
bool android::renderscript::gArchUseSIMD = false;
61 
~RsdCpuReference()62 RsdCpuReference::~RsdCpuReference() {
63 }
64 
create(Context * rsc,uint32_t version_major,uint32_t version_minor,sym_lookup_t lfn,script_lookup_t slfn,bcc::RSLinkRuntimeCallback pLinkRuntimeCallback,RSSelectRTCallback pSelectRTCallback,const char * pBccPluginName)65 RsdCpuReference * RsdCpuReference::create(Context *rsc, uint32_t version_major,
66         uint32_t version_minor, sym_lookup_t lfn, script_lookup_t slfn
67 #ifndef RS_COMPATIBILITY_LIB
68         , bcc::RSLinkRuntimeCallback pLinkRuntimeCallback,
69         RSSelectRTCallback pSelectRTCallback,
70         const char *pBccPluginName
71 #endif
72         ) {
73 
74     RsdCpuReferenceImpl *cpu = new RsdCpuReferenceImpl(rsc);
75     if (!cpu) {
76         return NULL;
77     }
78     if (!cpu->init(version_major, version_minor, lfn, slfn)) {
79         delete cpu;
80         return NULL;
81     }
82 
83 #ifndef RS_COMPATIBILITY_LIB
84     cpu->setLinkRuntimeCallback(pLinkRuntimeCallback);
85     cpu->setSelectRTCallback(pSelectRTCallback);
86     if (pBccPluginName) {
87         cpu->setBccPluginName(pBccPluginName);
88     }
89 #endif
90 
91     return cpu;
92 }
93 
94 
getTlsContext()95 Context * RsdCpuReference::getTlsContext() {
96     ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
97     return tls->mContext;
98 }
99 
getTlsScript()100 const Script * RsdCpuReference::getTlsScript() {
101     ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
102     return tls->mScript;
103 }
104 
getThreadTLSKey()105 pthread_key_t RsdCpuReference::getThreadTLSKey(){ return gThreadTLSKey; }
106 
107 ////////////////////////////////////////////////////////////
108 ///
109 
RsdCpuReferenceImpl(Context * rsc)110 RsdCpuReferenceImpl::RsdCpuReferenceImpl(Context *rsc) {
111     mRSC = rsc;
112 
113     version_major = 0;
114     version_minor = 0;
115     mInForEach = false;
116     memset(&mWorkers, 0, sizeof(mWorkers));
117     memset(&mTlsStruct, 0, sizeof(mTlsStruct));
118     mExit = false;
119 #ifndef RS_COMPATIBILITY_LIB
120     mLinkRuntimeCallback = NULL;
121     mSelectRTCallback = NULL;
122     mSetupCompilerCallback = NULL;
123 #endif
124 }
125 
126 
helperThreadProc(void * vrsc)127 void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
128     RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;
129 
130     uint32_t idx = __sync_fetch_and_add(&dc->mWorkers.mLaunchCount, 1);
131 
132     //ALOGV("RS helperThread starting %p idx=%i", dc, idx);
133 
134     dc->mWorkers.mLaunchSignals[idx].init();
135     dc->mWorkers.mNativeThreadId[idx] = gettid();
136 
137     memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
138     int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
139     if (status) {
140         ALOGE("pthread_setspecific %i", status);
141     }
142 
143 #if 0
144     typedef struct {uint64_t bits[1024 / 64]; } cpu_set_t;
145     cpu_set_t cpuset;
146     memset(&cpuset, 0, sizeof(cpuset));
147     cpuset.bits[idx / 64] |= 1ULL << (idx % 64);
148     int ret = syscall(241, rsc->mWorkers.mNativeThreadId[idx],
149               sizeof(cpuset), &cpuset);
150     ALOGE("SETAFFINITY ret = %i %s", ret, EGLUtils::strerror(ret));
151 #endif
152 
153     while (!dc->mExit) {
154         dc->mWorkers.mLaunchSignals[idx].wait();
155         if (dc->mWorkers.mLaunchCallback) {
156            // idx +1 is used because the calling thread is always worker 0.
157            dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
158         }
159         __sync_fetch_and_sub(&dc->mWorkers.mRunningCount, 1);
160         dc->mWorkers.mCompleteSignal.set();
161     }
162 
163     //ALOGV("RS helperThread exited %p idx=%i", dc, idx);
164     return NULL;
165 }
166 
launchThreads(WorkerCallback_t cbk,void * data)167 void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
168     mWorkers.mLaunchData = data;
169     mWorkers.mLaunchCallback = cbk;
170 
171     // fast path for very small launches
172     MTLaunchStruct *mtls = (MTLaunchStruct *)data;
173     if (mtls && mtls->fep.dimY <= 1 && mtls->xEnd <= mtls->xStart + mtls->mSliceSize) {
174         if (mWorkers.mLaunchCallback) {
175             mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
176         }
177         return;
178     }
179 
180     mWorkers.mRunningCount = mWorkers.mCount;
181     __sync_synchronize();
182 
183     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
184         mWorkers.mLaunchSignals[ct].set();
185     }
186 
187     // We use the calling thread as one of the workers so we can start without
188     // the delay of the thread wakeup.
189     if (mWorkers.mLaunchCallback) {
190         mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
191     }
192 
193     while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
194         mWorkers.mCompleteSignal.wait();
195     }
196 }
197 
198 
lockMutex()199 void RsdCpuReferenceImpl::lockMutex() {
200     pthread_mutex_lock(&gInitMutex);
201 }
202 
unlockMutex()203 void RsdCpuReferenceImpl::unlockMutex() {
204     pthread_mutex_unlock(&gInitMutex);
205 }
206 
// Reads up to |buffsize| bytes from the start of |pathname| into |buffer|.
//
// Unlike a single read() call, this keeps reading until the buffer is full or
// end-of-file is reached, so a short read does not silently truncate the
// result. Reads interrupted by a signal (EINTR) are retried.
//
// The buffer is NOT NUL-terminated; callers that need a C string must reserve
// one byte and terminate it themselves.
//
// Returns the number of bytes stored (clamped to INT_MAX), or -1 if the file
// cannot be opened or a read fails.
static int
read_file(const char*  pathname, char*  buffer, size_t  buffsize)
{
    const int fd = open(pathname, O_RDONLY);
    if (fd < 0)
        return -1;

    // The result is reported as an int, so never read more than fits in one.
    const size_t limit = (buffsize > (size_t)INT_MAX) ? (size_t)INT_MAX : buffsize;

    size_t total = 0;
    bool failed = false;
    while (total < limit) {
        const ssize_t got = read(fd, buffer + total, limit - total);
        if (got < 0) {
            if (errno == EINTR)
                continue;  // interrupted before any data arrived: retry
            failed = true;
            break;
        }
        if (got == 0)
            break;  // end of file
        total += (size_t)got;
    }

    close(fd);

    return failed ? -1 : (int)total;
}
224 
GetCpuInfo()225 static void GetCpuInfo() {
226     char cpuinfo[4096];
227     int  cpuinfo_len;
228 
229     cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof cpuinfo);
230     if (cpuinfo_len < 0)  /* should not happen */ {
231         return;
232     }
233 
234 #if defined(ARCH_ARM_HAVE_VFP) || defined(ARCH_ARM_USE_INTRINSICS)
235     gArchUseSIMD = (!!strstr(cpuinfo, " neon")) ||
236                    (!!strstr(cpuinfo, " asimd"));
237 #elif defined(ARCH_X86_HAVE_SSSE3)
238     gArchUseSIMD = !!strstr(cpuinfo, " ssse3");
239 #endif
240 }
241 
init(uint32_t version_major,uint32_t version_minor,sym_lookup_t lfn,script_lookup_t slfn)242 bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
243                                sym_lookup_t lfn, script_lookup_t slfn) {
244 
245     mSymLookupFn = lfn;
246     mScriptLookupFn = slfn;
247 
248     lockMutex();
249     if (!gThreadTLSKeyCount) {
250         int status = pthread_key_create(&gThreadTLSKey, NULL);
251         if (status) {
252             ALOGE("Failed to init thread tls key.");
253             unlockMutex();
254             return false;
255         }
256     }
257     gThreadTLSKeyCount++;
258     unlockMutex();
259 
260     mTlsStruct.mContext = mRSC;
261     mTlsStruct.mScript = NULL;
262     int status = pthread_setspecific(gThreadTLSKey, &mTlsStruct);
263     if (status) {
264         ALOGE("pthread_setspecific %i", status);
265     }
266 
267     GetCpuInfo();
268 
269     int cpu = sysconf(_SC_NPROCESSORS_CONF);
270     if(mRSC->props.mDebugMaxThreads) {
271         cpu = mRSC->props.mDebugMaxThreads;
272     }
273     if (cpu < 2) {
274         mWorkers.mCount = 0;
275         return true;
276     }
277 
278     // Subtract one from the cpu count because we also use the command thread as a worker.
279     mWorkers.mCount = (uint32_t)(cpu - 1);
280 
281     ALOGV("%p Launching thread(s), CPUs %i", mRSC, mWorkers.mCount + 1);
282 
283     mWorkers.mThreadId = (pthread_t *) calloc(mWorkers.mCount, sizeof(pthread_t));
284     mWorkers.mNativeThreadId = (pid_t *) calloc(mWorkers.mCount, sizeof(pid_t));
285     mWorkers.mLaunchSignals = new Signal[mWorkers.mCount];
286     mWorkers.mLaunchCallback = NULL;
287 
288     mWorkers.mCompleteSignal.init();
289 
290     mWorkers.mRunningCount = mWorkers.mCount;
291     mWorkers.mLaunchCount = 0;
292     __sync_synchronize();
293 
294     pthread_attr_t threadAttr;
295     status = pthread_attr_init(&threadAttr);
296     if (status) {
297         ALOGE("Failed to init thread attribute.");
298         return false;
299     }
300 
301     for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
302         status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
303         if (status) {
304             mWorkers.mCount = ct;
305             ALOGE("Created fewer than expected number of RS threads.");
306             break;
307         }
308     }
309     while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
310         usleep(100);
311     }
312 
313     pthread_attr_destroy(&threadAttr);
314     return true;
315 }
316 
317 
setPriority(int32_t priority)318 void RsdCpuReferenceImpl::setPriority(int32_t priority) {
319     for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
320         setpriority(PRIO_PROCESS, mWorkers.mNativeThreadId[ct], priority);
321     }
322 }
323 
~RsdCpuReferenceImpl()324 RsdCpuReferenceImpl::~RsdCpuReferenceImpl() {
325     mExit = true;
326     mWorkers.mLaunchData = NULL;
327     mWorkers.mLaunchCallback = NULL;
328     mWorkers.mRunningCount = mWorkers.mCount;
329     __sync_synchronize();
330     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
331         mWorkers.mLaunchSignals[ct].set();
332     }
333     void *res;
334     for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
335         pthread_join(mWorkers.mThreadId[ct], &res);
336     }
337     rsAssert(__sync_fetch_and_or(&mWorkers.mRunningCount, 0) == 0);
338     free(mWorkers.mThreadId);
339     free(mWorkers.mNativeThreadId);
340     delete[] mWorkers.mLaunchSignals;
341 
342     // Global structure cleanup.
343     lockMutex();
344     --gThreadTLSKeyCount;
345     if (!gThreadTLSKeyCount) {
346         pthread_key_delete(gThreadTLSKey);
347     }
348     unlockMutex();
349 
350 }
351 
// Kernel function-pointer type; not referenced elsewhere in the visible part
// of this file.
typedef void (*rs_t)(const void *, void *, const void *, uint32_t, uint32_t, uint32_t, uint32_t);
353 
wc_xy(void * usr,uint32_t idx)354 static void wc_xy(void *usr, uint32_t idx) {
355     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
356     RsForEachStubParamStruct p;
357     memcpy(&p, &mtls->fep, sizeof(p));
358     p.lid = idx;
359     uint32_t sig = mtls->sig;
360 
361     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
362     while (1) {
363         uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
364         uint32_t yStart = mtls->yStart + slice * mtls->mSliceSize;
365         uint32_t yEnd = yStart + mtls->mSliceSize;
366         yEnd = rsMin(yEnd, mtls->yEnd);
367         if (yEnd <= yStart) {
368             return;
369         }
370 
371         //ALOGE("usr idx %i, x %i,%i  y %i,%i", idx, mtls->xStart, mtls->xEnd, yStart, yEnd);
372         //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
373 
374         for (p.y = yStart; p.y < yEnd; p.y++) {
375             p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * p.y) +
376                     (mtls->fep.eStrideOut * mtls->xStart);
377             p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * p.y) +
378                    (mtls->fep.eStrideIn * mtls->xStart);
379             fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
380         }
381     }
382 }
383 
wc_x(void * usr,uint32_t idx)384 static void wc_x(void *usr, uint32_t idx) {
385     MTLaunchStruct *mtls = (MTLaunchStruct *)usr;
386     RsForEachStubParamStruct p;
387     memcpy(&p, &mtls->fep, sizeof(p));
388     p.lid = idx;
389     uint32_t sig = mtls->sig;
390 
391     outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
392     while (1) {
393         uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
394         uint32_t xStart = mtls->xStart + slice * mtls->mSliceSize;
395         uint32_t xEnd = xStart + mtls->mSliceSize;
396         xEnd = rsMin(xEnd, mtls->xEnd);
397         if (xEnd <= xStart) {
398             return;
399         }
400 
401         //ALOGE("usr slice %i idx %i, x %i,%i", slice, idx, xStart, xEnd);
402         //ALOGE("usr ptr in %p,  out %p", mtls->fep.ptrIn, mtls->fep.ptrOut);
403 
404         p.out = mtls->fep.ptrOut + (mtls->fep.eStrideOut * xStart);
405         p.in = mtls->fep.ptrIn + (mtls->fep.eStrideIn * xStart);
406         fn(&p, xStart, xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
407     }
408 }
409 
launchThreads(const Allocation * ain,Allocation * aout,const RsScriptCall * sc,MTLaunchStruct * mtls)410 void RsdCpuReferenceImpl::launchThreads(const Allocation * ain, Allocation * aout,
411                                      const RsScriptCall *sc, MTLaunchStruct *mtls) {
412 
413     //android::StopWatch kernel_time("kernel time");
414 
415     if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
416         const size_t targetByteChunk = 16 * 1024;
417         mInForEach = true;
418         if (mtls->fep.dimY > 1) {
419             uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
420             uint32_t s2 = 0;
421 
422             // This chooses our slice size to rate limit atomic ops to
423             // one per 16k bytes of reads/writes.
424             if (mtls->fep.yStrideOut) {
425                 s2 = targetByteChunk / mtls->fep.yStrideOut;
426             } else {
427                 s2 = targetByteChunk / mtls->fep.yStrideIn;
428             }
429             mtls->mSliceSize = rsMin(s1, s2);
430 
431             if(mtls->mSliceSize < 1) {
432                 mtls->mSliceSize = 1;
433             }
434 
435          //   mtls->mSliceSize = 2;
436             launchThreads(wc_xy, mtls);
437         } else {
438             uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
439             uint32_t s2 = 0;
440 
441             // This chooses our slice size to rate limit atomic ops to
442             // one per 16k bytes of reads/writes.
443             if (mtls->fep.eStrideOut) {
444                 s2 = targetByteChunk / mtls->fep.eStrideOut;
445             } else {
446                 s2 = targetByteChunk / mtls->fep.eStrideIn;
447             }
448             mtls->mSliceSize = rsMin(s1, s2);
449 
450             if(mtls->mSliceSize < 1) {
451                 mtls->mSliceSize = 1;
452             }
453 
454             launchThreads(wc_x, mtls);
455         }
456         mInForEach = false;
457 
458         //ALOGE("launch 1");
459     } else {
460         RsForEachStubParamStruct p;
461         memcpy(&p, &mtls->fep, sizeof(p));
462         uint32_t sig = mtls->sig;
463 
464         //ALOGE("launch 3");
465         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
466         for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
467             for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
468                 for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
469                     uint32_t offset = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0] +
470                                       mtls->fep.dimY * p.z + p.y;
471                     p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
472                             (mtls->fep.eStrideOut * mtls->xStart);
473                     p.in = mtls->fep.ptrIn + (mtls->fep.yStrideIn * offset) +
474                            (mtls->fep.eStrideIn * mtls->xStart);
475                     fn(&p, mtls->xStart, mtls->xEnd, mtls->fep.eStrideIn, mtls->fep.eStrideOut);
476                 }
477             }
478         }
479     }
480 }
481 
launchThreads(const Allocation ** ains,uint32_t inLen,Allocation * aout,const RsScriptCall * sc,MTLaunchStruct * mtls)482 void RsdCpuReferenceImpl::launchThreads(const Allocation** ains, uint32_t inLen, Allocation* aout,
483                                         const RsScriptCall* sc, MTLaunchStruct* mtls) {
484 
485     //android::StopWatch kernel_time("kernel time");
486 
487     if ((mWorkers.mCount >= 1) && mtls->isThreadable && !mInForEach) {
488         const size_t targetByteChunk = 16 * 1024;
489         mInForEach = true;
490         if (mtls->fep.dimY > 1) {
491             uint32_t s1 = mtls->fep.dimY / ((mWorkers.mCount + 1) * 4);
492             uint32_t s2 = 0;
493 
494             // This chooses our slice size to rate limit atomic ops to
495             // one per 16k bytes of reads/writes.
496             if (mtls->fep.yStrideOut) {
497                 s2 = targetByteChunk / mtls->fep.yStrideOut;
498             } else {
499                 s2 = targetByteChunk / mtls->fep.yStrideIn;
500             }
501             mtls->mSliceSize = rsMin(s1, s2);
502 
503             if(mtls->mSliceSize < 1) {
504                 mtls->mSliceSize = 1;
505             }
506 
507          //   mtls->mSliceSize = 2;
508             launchThreads(wc_xy, mtls);
509         } else {
510             uint32_t s1 = mtls->fep.dimX / ((mWorkers.mCount + 1) * 4);
511             uint32_t s2 = 0;
512 
513             // This chooses our slice size to rate limit atomic ops to
514             // one per 16k bytes of reads/writes.
515             if (mtls->fep.eStrideOut) {
516                 s2 = targetByteChunk / mtls->fep.eStrideOut;
517             } else {
518                 s2 = targetByteChunk / mtls->fep.eStrideIn;
519             }
520             mtls->mSliceSize = rsMin(s1, s2);
521 
522             if (mtls->mSliceSize < 1) {
523                 mtls->mSliceSize = 1;
524             }
525 
526             launchThreads(wc_x, mtls);
527         }
528         mInForEach = false;
529 
530         //ALOGE("launch 1");
531     } else {
532         RsForEachStubParamStruct p;
533         memcpy(&p, &mtls->fep, sizeof(p));
534         uint32_t sig = mtls->sig;
535 
536         // Allocate space for our input base pointers.
537         p.ins = new const void*[inLen];
538 
539         // Allocate space for our input stride information.
540         p.eStrideIns = new uint32_t[inLen];
541 
542         // Fill our stride information.
543         for (int index = inLen; --index >= 0;) {
544           p.eStrideIns[index] = mtls->fep.inStrides[index].eStride;
545         }
546 
547         //ALOGE("launch 3");
548         outer_foreach_t fn = (outer_foreach_t) mtls->kernel;
549         uint32_t offset_invariant = mtls->fep.dimY * mtls->fep.dimZ * p.ar[0];
550 
551         for (p.ar[0] = mtls->arrayStart; p.ar[0] < mtls->arrayEnd; p.ar[0]++) {
552             uint32_t offset_part = offset_invariant * p.ar[0];
553 
554             for (p.z = mtls->zStart; p.z < mtls->zEnd; p.z++) {
555                 for (p.y = mtls->yStart; p.y < mtls->yEnd; p.y++) {
556                     uint32_t offset = offset_part + mtls->fep.dimY * p.z + p.y;
557 
558                     p.out = mtls->fep.ptrOut + (mtls->fep.yStrideOut * offset) +
559                             (mtls->fep.eStrideOut * mtls->xStart);
560 
561                     for (int index = inLen; --index >= 0;) {
562                         StridePair &strides = mtls->fep.inStrides[index];
563 
564                         p.ins[index] = mtls->fep.ptrIns[index] +
565                                        (strides.yStride * offset) +
566                                        (strides.eStride * mtls->xStart);
567                     }
568 
569                     /*
570                      * The fourth argument is zero here because multi-input
571                      * kernels get their stride information from a member of p
572                      * that points to an array.
573                      */
574                     fn(&p, mtls->xStart, mtls->xEnd, 0, mtls->fep.eStrideOut);
575                 }
576             }
577         }
578 
579         // Free our arrays.
580         delete[] p.ins;
581         delete[] p.eStrideIns;
582     }
583 }
584 
setTLS(RsdCpuScriptImpl * sc)585 RsdCpuScriptImpl * RsdCpuReferenceImpl::setTLS(RsdCpuScriptImpl *sc) {
586     //ALOGE("setTls %p", sc);
587     ScriptTLSStruct * tls = (ScriptTLSStruct *)pthread_getspecific(gThreadTLSKey);
588     rsAssert(tls);
589     RsdCpuScriptImpl *old = tls->mImpl;
590     tls->mImpl = sc;
591     tls->mContext = mRSC;
592     if (sc) {
593         tls->mScript = sc->getScript();
594     } else {
595         tls->mScript = NULL;
596     }
597     return old;
598 }
599 
symLookup(const char * name)600 const RsdCpuReference::CpuSymbol * RsdCpuReferenceImpl::symLookup(const char *name) {
601     return mSymLookupFn(mRSC, name);
602 }
603 
604 
createScript(const ScriptC * s,char const * resName,char const * cacheDir,uint8_t const * bitcode,size_t bitcodeSize,uint32_t flags)605 RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createScript(const ScriptC *s,
606                                     char const *resName, char const *cacheDir,
607                                     uint8_t const *bitcode, size_t bitcodeSize,
608                                     uint32_t flags) {
609 
610     RsdCpuScriptImpl *i = new RsdCpuScriptImpl(this, s);
611     if (!i->init(resName, cacheDir, bitcode, bitcodeSize, flags
612 #ifndef RS_COMPATIBILITY_LIB
613         , getBccPluginName()
614 #endif
615         )) {
616         delete i;
617         return NULL;
618     }
619     return i;
620 }
621 
// Factory functions for the built-in intrinsics, defined in other translation
// units. createIntrinsic() below selects one of them by intrinsic id and
// forwards its own context, script and element arguments.
extern RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
                                             const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_LUT(RsdCpuReferenceImpl *ctx,
                                           const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
                                                   const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_YuvToRGB(RsdCpuReferenceImpl *ctx,
                                                const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Blend(RsdCpuReferenceImpl *ctx,
                                             const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx,
                                                 const Script *s, const Element *e);
extern RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx,
                                              const Script *s, const Element *e);
642 
createIntrinsic(const Script * s,RsScriptIntrinsicID iid,Element * e)643 RsdCpuReference::CpuScript * RsdCpuReferenceImpl::createIntrinsic(const Script *s,
644                                     RsScriptIntrinsicID iid, Element *e) {
645 
646     RsdCpuScriptImpl *i = NULL;
647     switch (iid) {
648     case RS_SCRIPT_INTRINSIC_ID_3DLUT:
649         i = rsdIntrinsic_3DLUT(this, s, e);
650         break;
651     case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3:
652         i = rsdIntrinsic_Convolve3x3(this, s, e);
653         break;
654     case RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX:
655         i = rsdIntrinsic_ColorMatrix(this, s, e);
656         break;
657     case RS_SCRIPT_INTRINSIC_ID_LUT:
658         i = rsdIntrinsic_LUT(this, s, e);
659         break;
660     case RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5:
661         i = rsdIntrinsic_Convolve5x5(this, s, e);
662         break;
663     case RS_SCRIPT_INTRINSIC_ID_BLUR:
664         i = rsdIntrinsic_Blur(this, s, e);
665         break;
666     case RS_SCRIPT_INTRINSIC_ID_YUV_TO_RGB:
667         i = rsdIntrinsic_YuvToRGB(this, s, e);
668         break;
669     case RS_SCRIPT_INTRINSIC_ID_BLEND:
670         i = rsdIntrinsic_Blend(this, s, e);
671         break;
672     case RS_SCRIPT_INTRINSIC_ID_HISTOGRAM:
673         i = rsdIntrinsic_Histogram(this, s, e);
674         break;
675     case RS_SCRIPT_INTRINSIC_ID_RESIZE:
676         i = rsdIntrinsic_Resize(this, s, e);
677         break;
678 
679     default:
680         rsAssert(0);
681     }
682 
683     return i;
684 }
685 
createScriptGroup(const ScriptGroup * sg)686 RsdCpuReference::CpuScriptGroup * RsdCpuReferenceImpl::createScriptGroup(const ScriptGroup *sg) {
687     CpuScriptGroupImpl *sgi = new CpuScriptGroupImpl(this, sg);
688     if (!sgi->init()) {
689         delete sgi;
690         return NULL;
691     }
692     return sgi;
693 }
694