1 #include "rsCpuScriptGroup2.h"
2 
3 #include <dlfcn.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <unistd.h>
7 
#include <algorithm>
#include <iterator>
#include <set>
#include <sstream>
#include <string>
#include <vector>
12 
13 #ifndef RS_COMPATIBILITY_LIB
14 #include "bcc/Config.h"
15 #endif
16 
17 #include "cpu_ref/rsCpuCore.h"
18 #include "rsClosure.h"
19 #include "rsContext.h"
20 #include "rsCpuCore.h"
21 #include "rsCpuExecutable.h"
22 #include "rsCpuScript.h"
23 #include "rsScript.h"
24 #include "rsScriptGroup2.h"
25 #include "rsScriptIntrinsic.h"
26 
27 using std::string;
28 using std::vector;
29 
30 namespace android {
31 namespace renderscript {
32 
33 namespace {
34 
35 const size_t DefaultKernelArgCount = 2;
36 
// Expanded "root" kernel used for a batch of closures that were NOT fused by
// the compiler. It iterates over every closure in the batch and runs each one
// over the [xstart, xend) slice, temporarily rewriting the shared
// RsExpandKernelDriverInfo so each closure sees its own input/output
// allocations. The clobbered fields (inLen, inStride, usr) are restored
// before returning.
void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
               uint32_t xend, uint32_t outstep) {
    // The driver smuggles the closure list in through kinfo->usr (see
    // Batch::run, which sets mtls.fep.usr = &mClosures).
    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
    RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);

    // Save the fields we are about to overwrite so they can be restored below.
    const size_t oldInLen = mutable_kinfo->inLen;

    decltype(mutable_kinfo->inStride) oldInStride;
    memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));

    for (CPUClosure* cpuClosure : closures) {
        const Closure* closure = cpuClosure->mClosure;

        // There had better be enough space in mutable_kinfo
        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);

        // Point each input slot at this closure's argument allocation, offset
        // to the (xstart, current.y) cell of the launch.
        for (size_t i = 0; i < closure->mNumArg; i++) {
            const void* arg = closure->mArgs[i];
            const Allocation* a = (const Allocation*)arg;
            const uint32_t eStride = a->mHal.state.elementSizeBytes;
            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
                    eStride * xstart;
            if (kinfo->dim.y > 1) {
                ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
            }
            mutable_kinfo->inPtr[i] = ptr;
            mutable_kinfo->inStride[i] = eStride;
        }
        mutable_kinfo->inLen = closure->mNumArg;

        // Output goes to this closure's return-value allocation, offset the
        // same way as the inputs.
        const Allocation* out = closure->mReturnValue;
        const uint32_t ostep = out->mHal.state.elementSizeBytes;
        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
                ostep * xstart;
        if (kinfo->dim.y > 1) {
            ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
        }

        mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);

        // The implementation of an intrinsic relies on kinfo->usr being
        // the "this" pointer to the intrinsic (an RsdCpuScriptIntrinsic object)
        mutable_kinfo->usr = cpuClosure->mSi;

        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
    }

    // Restore the clobbered fields so the caller sees kinfo unchanged.
    mutable_kinfo->inLen = oldInLen;
    mutable_kinfo->usr = &closures;
    memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
}
88 
89 }  // namespace
90 
Batch(CpuScriptGroup2Impl * group,const char * name)91 Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
92     mGroup(group), mFunc(nullptr) {
93     mName = strndup(name, strlen(name));
94 }
95 
~Batch()96 Batch::~Batch() {
97     for (CPUClosure* c : mClosures) {
98         delete c;
99     }
100     free(mName);
101 }
102 
// Returns true if |cpuClosure| cannot be added to this batch, i.e. it must
// start a new batch. A closure conflicts when (a) either it or the batch is
// an invoke (invokes are always batched alone), (b) it has a global- or
// field-level dependence on any closure already in the batch, or (c) it is
// not chained to the last batched closure via "first argument takes the
// previous closure's return value", which is the only shape the bcc fusion
// pass accepts.
bool Batch::conflict(CPUClosure* cpuClosure) const {
    if (mClosures.empty()) {
        // An empty batch accepts anything.
        return false;
    }

    const Closure* closure = cpuClosure->mClosure;

    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
        // An invoke should be in a batch by itself, so it conflicts with any other
        // closure.
        return true;
    }

    const auto& globalDeps = closure->mGlobalDeps;
    const auto& argDeps = closure->mArgDeps;

    for (CPUClosure* c : mClosures) {
        const Closure* batched = c->mClosure;
        // Depending on a global set by a batched closure forces a new batch.
        if (globalDeps.find(batched) != globalDeps.end()) {
            return true;
        }
        const auto& it = argDeps.find(batched);
        if (it != argDeps.end()) {
            const auto& args = (*it).second;
            for (const auto &p1 : *args) {
                // A non-null fieldId means the dependence is on a field of a
                // batched closure (not its plain return value) — conflict.
                if (p1.second.get() != nullptr) {
                    return true;
                }
            }
        }
    }

    // The compiler fusion pass in bcc expects kernels to be chained up
    // through the (1st) input and output.

    const Closure* lastBatched = mClosures.back()->mClosure;
    const auto& it = argDeps.find(lastBatched);

    if (it == argDeps.end()) {
        // Not dependent on the last batched closure at all — can't fuse.
        return true;
    }

    const auto& args = (*it).second;
    for (const auto &p1 : *args) {
        if (p1.first == 0 && p1.second.get() == nullptr) {
            // The new closure depends on the last batched closure's return
            // value (fieldId being nullptr) for its first argument (argument 0)
            return false;
        }
    }

    return true;
}
156 
// Builds the CPU-side representation of a script group: wraps each closure
// in a CPUClosure, greedily partitions the closures into fusable batches
// (see Batch::conflict), then attempts to compile the batches into a single
// shared object and resolve each batch's fused entry point. All of this runs
// under the driver mutex.
CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
                                         const ScriptGroupBase *sg) :
    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
    mExecutable(nullptr), mScriptObj(nullptr) {
    rsAssert(!mGroup->mClosures.empty());

    mCpuRefImpl->lockMutex();
    Batch* batch = new Batch(this, "Batch0");
    int i = 0;  // sequence number used to name subsequent batches
    for (Closure* closure: mGroup->mClosures) {
        CPUClosure* cc;
        const IDBase* funcID = closure->mFunctionID.get();
        RsdCpuScriptImpl* si =
                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
        if (closure->mIsKernel) {
            // Kernels get their expanded function pointer up front so the
            // unfused fallback path (groupRoot) can call them directly.
            MTLaunchStructForEach mtls;
            si->forEachKernelSetup(funcID->mSlot, &mtls);
            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
        } else {
            cc = new CPUClosure(closure, si);
        }

        if (batch->conflict(cc)) {
            // Current batch is closed; start a new one named "Batch<i>".
            mBatches.push_back(batch);
            std::stringstream ss;
            ss << "Batch" << ++i;
            std::string batchStr(ss.str());
            batch = new Batch(this, batchStr.c_str());
        }

        batch->mClosures.push_back(cc);
    }

    rsAssert(!batch->mClosures.empty());
    mBatches.push_back(batch);

#ifndef RS_COMPATIBILITY_LIB
    compile(mGroup->mCacheDir);
    // Only resolve fused entry points if compilation/loading fully succeeded;
    // otherwise execution falls back to per-closure dispatch in Batch::run.
    if (mScriptObj != nullptr && mExecutable != nullptr) {
        for (Batch* batch : mBatches) {
            batch->resolveFuncPtr(mScriptObj);
        }
    }
#endif  // RS_COMPATIBILITY_LIB
    mCpuRefImpl->unlockMutex();
}
203 
resolveFuncPtr(void * sharedObj)204 void Batch::resolveFuncPtr(void* sharedObj) {
205     std::string funcName(mName);
206     if (mClosures.front()->mClosure->mIsKernel) {
207         funcName.append(".expand");
208     }
209     mFunc = dlsym(sharedObj, funcName.c_str());
210     rsAssert (mFunc != nullptr);
211 }
212 
~CpuScriptGroup2Impl()213 CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
214     for (Batch* batch : mBatches) {
215         delete batch;
216     }
217     delete mExecutable;
218     // TODO: move this dlclose into ~ScriptExecutable().
219     if (mScriptObj != nullptr) {
220         dlclose(mScriptObj);
221     }
222 }
223 
224 namespace {
225 
226 #ifndef RS_COMPATIBILITY_LIB
227 
// Returns the path of the core bitcode library to link against, and writes
// the path of the relaxed-precision variant (or "" if none) into
// |coreLibRelaxedPath|. Debug contexts always get the debug library; other
// contexts get a platform-specific library chosen at compile time.
string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
    *coreLibRelaxedPath = "";

    // If we're debugging, use the debug library.
    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
        return SYSLIBPATH_BC"/libclcore_debug.bc";
    }

    // Check for a platform specific library

#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
    // NEON-capable ARMv7a devices can use an accelerated math library
    // for all reduced precision scripts.
    // ARMv8 does not use NEON, as ASIMD can be used with all precision
    // levels.
    *coreLibRelaxedPath = SYSLIBPATH_BC"/libclcore_neon.bc";
#endif

#if defined(__i386__) || defined(__x86_64__)
    // x86 devices will use an optimized library.
    return SYSLIBPATH_BC"/libclcore_x86.bc";
#else
    return SYSLIBPATH_BC"/libclcore.bc";
#endif
}
253 
setupCompileArguments(const vector<const char * > & inputs,const vector<string> & kernelBatches,const vector<string> & invokeBatches,const char * outputDir,const char * outputFileName,const char * coreLibPath,const char * coreLibRelaxedPath,const bool emitGlobalInfo,const bool emitGlobalInfoSkipConstant,int optLevel,vector<const char * > * args)254 void setupCompileArguments(
255         const vector<const char*>& inputs, const vector<string>& kernelBatches,
256         const vector<string>& invokeBatches,
257         const char* outputDir, const char* outputFileName,
258         const char* coreLibPath, const char* coreLibRelaxedPath,
259         const bool emitGlobalInfo, const bool emitGlobalInfoSkipConstant,
260         int optLevel, vector<const char*>* args) {
261     args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
262     args->push_back("-fPIC");
263     args->push_back("-embedRSInfo");
264     if (emitGlobalInfo) {
265         args->push_back("-rs-global-info");
266         if (emitGlobalInfoSkipConstant) {
267             args->push_back("-rs-global-info-skip-constant");
268         }
269     }
270     args->push_back("-mtriple");
271     args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
272     args->push_back("-bclib");
273     args->push_back(coreLibPath);
274     args->push_back("-bclib_relaxed");
275     args->push_back(coreLibRelaxedPath);
276     for (const char* input : inputs) {
277         args->push_back(input);
278     }
279     for (const string& batch : kernelBatches) {
280         args->push_back("-merge");
281         args->push_back(batch.c_str());
282     }
283     for (const string& batch : invokeBatches) {
284         args->push_back("-invoke");
285         args->push_back(batch.c_str());
286     }
287     args->push_back("-output_path");
288     args->push_back(outputDir);
289 
290     args->push_back("-O");
291     switch (optLevel) {
292     case 0:
293         args->push_back("0");
294         break;
295     case 3:
296         args->push_back("3");
297         break;
298     default:
299         ALOGW("Expected optimization level of 0 or 3. Received %d", optLevel);
300         args->push_back("3");
301         break;
302     }
303 
304     // The output filename has to be the last, in case we need to pop it out and
305     // replace with a different name.
306     args->push_back("-o");
307     args->push_back(outputFileName);
308 }
309 
generateSourceSlot(RsdCpuReferenceImpl * ctxt,const Closure & closure,const std::vector<const char * > & inputs,std::stringstream & ss)310 void generateSourceSlot(RsdCpuReferenceImpl* ctxt,
311                         const Closure& closure,
312                         const std::vector<const char*>& inputs,
313                         std::stringstream& ss) {
314     const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
315     const Script* script = funcID->mScript;
316 
317     rsAssert (!script->isIntrinsic());
318 
319     const RsdCpuScriptImpl *cpuScript =
320             (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
321     const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
322 
323     const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
324             inputs.begin();
325 
326     ss << index << "," << funcID->mSlot << ".";
327 }
328 
329 #endif  // RS_COMPATIBILTY_LIB
330 
331 }  // anonymous namespace
332 
333 // This function is used by the debugger to inspect ScriptGroup
334 // compilations.
335 //
336 // "__attribute__((noinline))" and "__asm__" are used to prevent the
337 // function call from being eliminated as a no-op (see the "noinline"
338 // attribute in gcc documentation).
339 //
340 // "__attribute__((weak))" is used to prevent callers from recognizing
341 // that this is guaranteed to be the function definition, recognizing
342 // that certain arguments are unused, and optimizing away the passing
343 // of those arguments (see the LLVM optimization
344 // DeadArgumentElimination).  Theoretically, the compiler could get
345 // aggressive enough with link-time optimization that even marking the
346 // entry point as a weak definition wouldn't solve the problem.
347 //
// Debugger hook: receives the group name and the list of kernel function
// pointers for an uncompiled (opt-level 0) script group so a debugger can
// set breakpoints on them. The noinline/weak attributes and the empty
// __asm__ exist solely to keep this call and its arguments alive through
// optimization — see the comment block above. Do not "simplify" this body.
extern __attribute__((noinline)) __attribute__((weak))
void debugHintScriptGroup2(const char* groupName,
                           const uint32_t groupNameSize,
                           const ExpandFuncTy* kernel,
                           const uint32_t kernelCount) {
    ALOGV("group name: %d:%s\n", groupNameSize, groupName);
    for (uint32_t i=0; i < kernelCount; ++i) {
        const char* f1 = (const char*)(kernel[i]);
        // Empty asm acts as an optimization barrier so the loop (and the
        // kernel pointers it reads) cannot be eliminated as dead code.
        __asm__ __volatile__("");
        ALOGV("  closure: %p\n", (const void*)f1);
    }
    // do nothing, this is just a hook point for the debugger.
    return;
}
362 
// Attempts to fuse the group's kernels into a single shared object and load
// it, setting mScriptObj and mExecutable on success. On any failure it
// returns early with those members unset, leaving execution to the unfused
// fallback path in Batch::run. The pipeline is: collect input bitcode files,
// build batch specs, compute a build checksum, try to reuse a cached .so,
// otherwise invoke bcc, link, and load the result.
void CpuScriptGroup2Impl::compile(const char* cacheDir) {
#ifndef RS_COMPATIBILITY_LIB
    // A group with fewer than two closures has nothing to fuse.
    if (mGroup->mClosures.size() < 2) {
        return;
    }

    const int optLevel = getCpuRefImpl()->getContext()->getOptLevel();
    if (optLevel == 0) {
        std::vector<ExpandFuncTy> kernels;
        for (const Batch* b : mBatches)
            for (const CPUClosure* c : b->mClosures)
                kernels.push_back(c->mFunc);

        if (kernels.size()) {
            // pass this information on to the debugger via a hint function.
            debugHintScriptGroup2(mGroup->mName,
                                  strlen(mGroup->mName),
                                  kernels.data(),
                                  kernels.size());
        }

        // skip script group compilation forcing the driver to use the fallback
        // execution path which currently has better support for debugging.
        return;
    }

    // Collect the distinct bitcode files of all scripts referenced by the
    // group, ordered by strcmp so batch specs index them deterministically.
    auto comparator = [](const char* str1, const char* str2) -> bool {
        return strcmp(str1, str2) < 0;
    };
    std::set<const char*, decltype(comparator)> inputSet(comparator);

    for (Closure* closure : mGroup->mClosures) {
        const Script* script = closure->mFunctionID.get()->mScript;

        // If any script is an intrinsic, give up trying fusing the kernels.
        if (script->isIntrinsic()) {
            return;
        }

        const RsdCpuScriptImpl *cpuScript =
            (const RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(script);

        const char* bitcodeFilename = cpuScript->getBitcodeFilePath();
        inputSet.insert(bitcodeFilename);
    }

    std::vector<const char*> inputs(inputSet.begin(), inputSet.end());

    // Build the "-merge" (fused kernel) and "-invoke" batch specifications
    // consumed by bcc; format is "<name>:<idx>,<slot>.<idx>,<slot>. ...".
    std::vector<string> kernelBatches;
    std::vector<string> invokeBatches;

    int i = 0;
    for (const auto& batch : mBatches) {
        rsAssert(batch->size() > 0);

        std::stringstream ss;
        ss << batch->mName << ":";

        if (!batch->mClosures.front()->mClosure->mIsKernel) {
            // Invoke batches contain exactly one closure (see Batch::conflict).
            rsAssert(batch->size() == 1);
            generateSourceSlot(mCpuRefImpl, *batch->mClosures.front()->mClosure, inputs, ss);
            invokeBatches.push_back(ss.str());
        } else {
            for (const auto& cpuClosure : batch->mClosures) {
                generateSourceSlot(mCpuRefImpl, *cpuClosure->mClosure, inputs, ss);
            }
            kernelBatches.push_back(ss.str());
        }
    }

    rsAssert(cacheDir != nullptr);
    string objFilePath(cacheDir);
    objFilePath.append("/");
    objFilePath.append(mGroup->mName);
    objFilePath.append(".o");

    const char* resName = mGroup->mName;
    string coreLibRelaxedPath;
    const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
                                               &coreLibRelaxedPath);

    vector<const char*> arguments;
    bool emitGlobalInfo = getCpuRefImpl()->getEmbedGlobalInfo();
    bool emitGlobalInfoSkipConstant = getCpuRefImpl()->getEmbedGlobalInfoSkipConstant();
    setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir,
                          resName, coreLibPath.c_str(), coreLibRelaxedPath.c_str(),
                          emitGlobalInfo, emitGlobalInfoSkipConstant,
                          optLevel, &arguments);

    // NOTE(review): unique_ptr<const char> deletes with scalar `delete`;
    // whether that matches rsuJoinStrings's allocation (new[]/malloc?) can't
    // be confirmed from here — verify against rsuJoinStrings's definition.
    std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1,
                                                       arguments.data()));

    // The core libraries participate in the checksum too, since a change in
    // either invalidates the cached shared object.
    inputs.push_back(coreLibPath.c_str());
    inputs.push_back(coreLibRelaxedPath.c_str());

    uint32_t checksum = constructBuildChecksum(nullptr, 0, cmdLine.get(),
                                               inputs.data(), inputs.size());

    if (checksum == 0) {
        return;
    }

    std::stringstream ss;
    ss << std::hex << checksum;
    std::string checksumStr(ss.str());

    //===--------------------------------------------------------------------===//
    // Try to load a shared lib from code cache matching filename and checksum
    //===--------------------------------------------------------------------===//

    bool alreadyLoaded = false;
    std::string cloneName;

    // Never reuse cached objects when forced to recompile or when debugging.
    const bool useRSDebugContext =
            (mCpuRefImpl->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG);
    const bool reuse = !is_force_recompile() && !useRSDebugContext;
    if (reuse) {
        mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nullptr,
                                                           &alreadyLoaded);
    }
    if (mScriptObj != nullptr) {
        // A shared library named resName is found in code cache directory
        // cacheDir, and loaded with the handle stored in mScriptObj.

        mExecutable = ScriptExecutable::createFromSharedObject(
            mScriptObj, checksum);

        if (mExecutable != nullptr) {
            // The loaded shared library in mScriptObj has a matching checksum.
            // An executable object has been created.
            return;
        }

        ALOGV("Failed to create an executable object from so file due to "
              "mismatching checksum");

        if (alreadyLoaded) {
            // The shared object found in code cache has already been loaded.
            // A different file name is needed for the new shared library, to
            // avoid corrupting the currently loaded instance.

            cloneName.append(resName);
            cloneName.append("#");
            cloneName.append(SharedLibraryUtils::getRandomString(6).c_str());

            // The last element in arguments is the output filename.
            arguments.pop_back();
            arguments.push_back(cloneName.c_str());
        }

        dlclose(mScriptObj);
        mScriptObj = nullptr;
    }

    //===--------------------------------------------------------------------===//
    // Fuse the input kernels and generate native code in an object file
    //===--------------------------------------------------------------------===//

    arguments.push_back("-build-checksum");
    arguments.push_back(checksumStr.c_str());
    arguments.push_back(nullptr);  // argv must be null-terminated for exec

    bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
                                      arguments.size()-1,
                                      arguments.data());
    if (!compiled) {
        return;
    }

    //===--------------------------------------------------------------------===//
    // Create and load the shared lib
    //===--------------------------------------------------------------------===//

    std::string SOPath;

    if (!SharedLibraryUtils::createSharedLibrary(
            getCpuRefImpl()->getContext()->getDriverName(), cacheDir, resName,
            reuse, &SOPath)) {
        ALOGE("Failed to link object file '%s'", resName);
        unlink(objFilePath.c_str());
        return;
    }

    // The intermediate object file is no longer needed once linked.
    unlink(objFilePath.c_str());

    if (reuse) {
        mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    } else {
        // Debug / force-recompile: load the .so and immediately unlink it so
        // it is never picked up from the cache.
        mScriptObj = SharedLibraryUtils::loadAndDeleteSharedLibrary(SOPath.c_str());
    }
    if (mScriptObj == nullptr) {
        ALOGE("Unable to load '%s'", resName);
        return;
    }

    if (alreadyLoaded) {
        // Delete the temporary, random-named file that we created to avoid
        // interfering with an already loaded shared library.
        string cloneFilePath(cacheDir);
        cloneFilePath.append("/");
        cloneFilePath.append(cloneName.c_str());
        cloneFilePath.append(".so");
        unlink(cloneFilePath.c_str());
    }

    mExecutable = ScriptExecutable::createFromSharedObject(mScriptObj);

#endif  // RS_COMPATIBILITY_LIB
}
572 
execute()573 void CpuScriptGroup2Impl::execute() {
574     for (auto batch : mBatches) {
575         batch->setGlobalsForBatch();
576         batch->run();
577     }
578 }
579 
setGlobalsForBatch()580 void Batch::setGlobalsForBatch() {
581     for (CPUClosure* cpuClosure : mClosures) {
582         const Closure* closure = cpuClosure->mClosure;
583         const IDBase* funcID = closure->mFunctionID.get();
584         Script* s = funcID->mScript;;
585         for (const auto& p : closure->mGlobals) {
586             const int64_t value = p.second.first;
587             int size = p.second.second;
588             if (value == 0 && size == 0) {
589                 // This indicates the current closure depends on another closure for a
590                 // global in their shared module (script). In this case we don't need to
591                 // copy the value. For example, an invoke intializes a global variable
592                 // which a kernel later reads.
593                 continue;
594             }
595             rsAssert(p.first != nullptr);
596             Script* script = p.first->mScript;
597             rsAssert(script == s);
598             RsdCpuReferenceImpl* ctxt = mGroup->getCpuRefImpl();
599             const RsdCpuScriptImpl *cpuScript =
600                     (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
601             int slot = p.first->mSlot;
602             ScriptExecutable* exec = mGroup->getExecutable();
603             if (exec != nullptr) {
604                 const char* varName = cpuScript->getFieldName(slot);
605                 void* addr = exec->getFieldAddress(varName);
606                 if (size < 0) {
607                     rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
608                                  (rs_object_base*)addr, (ObjectBase*)value);
609                 } else {
610                     memcpy(addr, (const void*)&value, size);
611                 }
612             } else {
613                 // We use -1 size to indicate an ObjectBase rather than a primitive type
614                 if (size < 0) {
615                     s->setVarObj(slot, (ObjectBase*)value);
616                 } else {
617                     s->setVar(slot, (const void*)&value, size);
618                 }
619             }
620         }
621     }
622 }
623 
// Executes this batch through one of three paths:
//   1. Invoke batch (single non-kernel closure): call the fused invoke if
//      compile() produced one, else dispatch through the script.
//   2. Fused kernel batch (mFunc resolved): one launchForEach over the first
//      closure's inputs and the last closure's output.
//   3. Fallback kernel batch: pre-launch every closure, run them all via
//      groupRoot in a single launch, then post-launch every closure.
void Batch::run() {
    if (!mClosures.front()->mClosure->mIsKernel) {
        // Invoke batches hold exactly one closure (see Batch::conflict).
        rsAssert(mClosures.size() == 1);

        // This batch contains a single closure for an invoke function
        CPUClosure* cc = mClosures.front();
        const Closure* c = cc->mClosure;

        if (mFunc != nullptr) {
            // TODO: Need align pointers for x86_64.
            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
        } else {
            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
            rsAssert(invokeID != nullptr);
            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
        }

        return;
    }

    if (mFunc != nullptr) {
        // Fused path: the whole batch is one compiled kernel. Inputs come
        // from the first closure, the output from the last.
        MTLaunchStructForEach mtls;
        const CPUClosure* firstCpuClosure = mClosures.front();
        const CPUClosure* lastCpuClosure = mClosures.back();

        firstCpuClosure->mSi->forEachMtlsSetup(
                (const Allocation**)firstCpuClosure->mClosure->mArgs,
                firstCpuClosure->mClosure->mNumArg,
                lastCpuClosure->mClosure->mReturnValue,
                nullptr, 0, nullptr, &mtls);

        mtls.script = nullptr;
        mtls.fep.usr = nullptr;
        mtls.kernel = (ForEachFunc_t)mFunc;

        mGroup->getCpuRefImpl()->launchForEach(
                (const Allocation**)firstCpuClosure->mClosure->mArgs,
                firstCpuClosure->mClosure->mNumArg,
                lastCpuClosure->mClosure->mReturnValue,
                nullptr, &mtls);

        return;
    }

    // Fallback path: no fused function; run each kernel closure in sequence
    // via groupRoot. First give every closure its preLaunch callback.
    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const ScriptKernelID* kernelID =
                (const ScriptKernelID*)closure->mFunctionID.get();
        cpuClosure->mSi->preLaunch(kernelID->mSlot,
                                   (const Allocation**)closure->mArgs,
                                   closure->mNumArg, closure->mReturnValue,
                                   nullptr, 0, nullptr);
    }

    // Launch dimensions come from the first closure; groupRoot re-derives
    // per-closure input/output pointers itself (mClosures travels in fep.usr).
    const CPUClosure* cpuClosure = mClosures.front();
    const Closure* closure = cpuClosure->mClosure;
    MTLaunchStructForEach mtls;

    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
                                          closure->mNumArg,
                                          closure->mReturnValue,
                                          nullptr, 0, nullptr, &mtls)) {

        mtls.script = nullptr;
        mtls.kernel = &groupRoot;
        mtls.fep.usr = &mClosures;

        mGroup->getCpuRefImpl()->launchForEach(nullptr, 0, nullptr, nullptr, &mtls);
    }

    // Mirror the preLaunch calls with postLaunch for every closure.
    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const ScriptKernelID* kernelID =
                (const ScriptKernelID*)closure->mFunctionID.get();
        cpuClosure->mSi->postLaunch(kernelID->mSlot,
                                    (const Allocation**)closure->mArgs,
                                    closure->mNumArg, closure->mReturnValue,
                                    nullptr, 0, nullptr);
    }
}
705 
706 }  // namespace renderscript
707 }  // namespace android
708