1 #include "rsCpuScriptGroup2.h"
2
3 #include <dlfcn.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <unistd.h>
7
8 #include <set>
9 #include <sstream>
10 #include <string>
11 #include <vector>
12
13 #ifndef RS_COMPATIBILITY_LIB
14 #include "bcc/Config/Config.h"
15 #endif
16
17 #include "cpu_ref/rsCpuCore.h"
18 #include "rsClosure.h"
19 #include "rsContext.h"
20 #include "rsCpuCore.h"
21 #include "rsCpuExecutable.h"
22 #include "rsCpuScript.h"
23 #include "rsScript.h"
24 #include "rsScriptGroup2.h"
25 #include "rsScriptIntrinsic.h"
26
27 using std::string;
28 using std::vector;
29
30 namespace android {
31 namespace renderscript {
32
33 namespace {
34
// Default number of kernel arguments. Not referenced anywhere in the visible
// part of this file.
constexpr size_t DefaultKernelArgCount = 2;
36
groupRoot(const RsExpandKernelDriverInfo * kinfo,uint32_t xstart,uint32_t xend,uint32_t outstep)37 void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
38 uint32_t xend, uint32_t outstep) {
39 const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
40 RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);
41
42 const size_t oldInLen = mutable_kinfo->inLen;
43
44 decltype(mutable_kinfo->inStride) oldInStride;
45 memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));
46
47 for (CPUClosure* cpuClosure : closures) {
48 const Closure* closure = cpuClosure->mClosure;
49
50 // There had better be enough space in mutable_kinfo
51 rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);
52
53 for (size_t i = 0; i < closure->mNumArg; i++) {
54 const void* arg = closure->mArgs[i];
55 const Allocation* a = (const Allocation*)arg;
56 const uint32_t eStride = a->mHal.state.elementSizeBytes;
57 const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
58 eStride * xstart;
59 if (kinfo->dim.y > 1) {
60 ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
61 }
62 mutable_kinfo->inPtr[i] = ptr;
63 mutable_kinfo->inStride[i] = eStride;
64 }
65 mutable_kinfo->inLen = closure->mNumArg;
66
67 const Allocation* out = closure->mReturnValue;
68 const uint32_t ostep = out->mHal.state.elementSizeBytes;
69 const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
70 ostep * xstart;
71 if (kinfo->dim.y > 1) {
72 ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
73 }
74
75 rsAssert(kinfo->outLen <= 1);
76 mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);
77
78 cpuClosure->mFunc(kinfo, xstart, xend, ostep);
79 }
80
81 mutable_kinfo->inLen = oldInLen;
82 memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
83 }
84
85 } // namespace
86
Batch(CpuScriptGroup2Impl * group,const char * name)87 Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
88 mGroup(group), mFunc(nullptr) {
89 mName = strndup(name, strlen(name));
90 }
91
~Batch()92 Batch::~Batch() {
93 for (CPUClosure* c : mClosures) {
94 delete c;
95 }
96 free(mName);
97 }
98
conflict(CPUClosure * cpuClosure) const99 bool Batch::conflict(CPUClosure* cpuClosure) const {
100 if (mClosures.empty()) {
101 return false;
102 }
103
104 const Closure* closure = cpuClosure->mClosure;
105
106 if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
107 // An invoke should be in a batch by itself, so it conflicts with any other
108 // closure.
109 return true;
110 }
111
112 const auto& globalDeps = closure->mGlobalDeps;
113 const auto& argDeps = closure->mArgDeps;
114
115 for (CPUClosure* c : mClosures) {
116 const Closure* batched = c->mClosure;
117 if (globalDeps.find(batched) != globalDeps.end()) {
118 return true;
119 }
120 const auto& it = argDeps.find(batched);
121 if (it != argDeps.end()) {
122 const auto& args = (*it).second;
123 for (const auto &p1 : *args) {
124 if (p1.second.get() != nullptr) {
125 return true;
126 }
127 }
128 }
129 }
130
131 // The compiler fusion pass in bcc expects that kernels chained up through
132 // (1st) input and output.
133
134 const Closure* lastBatched = mClosures.back()->mClosure;
135 const auto& it = argDeps.find(lastBatched);
136
137 if (it == argDeps.end()) {
138 return true;
139 }
140
141 const auto& args = (*it).second;
142 for (const auto &p1 : *args) {
143 if (p1.first == 0 && p1.second.get() == nullptr) {
144 // The new closure depends on the last batched closure's return
145 // value (fieldId being nullptr) for its first argument (argument 0)
146 return false;
147 }
148 }
149
150 return true;
151 }
152
CpuScriptGroup2Impl(RsdCpuReferenceImpl * cpuRefImpl,const ScriptGroupBase * sg)153 CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
154 const ScriptGroupBase *sg) :
155 mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
156 mExecutable(nullptr), mScriptObj(nullptr) {
157 rsAssert(!mGroup->mClosures.empty());
158
159 mCpuRefImpl->lockMutex();
160 Batch* batch = new Batch(this, "Batch0");
161 int i = 0;
162 for (Closure* closure: mGroup->mClosures) {
163 CPUClosure* cc;
164 const IDBase* funcID = closure->mFunctionID.get();
165 RsdCpuScriptImpl* si =
166 (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
167 if (closure->mIsKernel) {
168 MTLaunchStruct mtls;
169 si->forEachKernelSetup(funcID->mSlot, &mtls);
170 cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
171 } else {
172 cc = new CPUClosure(closure, si);
173 }
174
175 if (batch->conflict(cc)) {
176 mBatches.push_back(batch);
177 std::stringstream ss;
178 ss << "Batch" << ++i;
179 batch = new Batch(this, ss.str().c_str());
180 }
181
182 batch->mClosures.push_back(cc);
183 }
184
185 rsAssert(!batch->mClosures.empty());
186 mBatches.push_back(batch);
187
188 #ifndef RS_COMPATIBILITY_LIB
189 compile(mGroup->mCacheDir);
190 if (mScriptObj != nullptr && mExecutable != nullptr) {
191 for (Batch* batch : mBatches) {
192 batch->resolveFuncPtr(mScriptObj);
193 }
194 }
195 #endif // RS_COMPATIBILITY_LIB
196 mCpuRefImpl->unlockMutex();
197 }
198
resolveFuncPtr(void * sharedObj)199 void Batch::resolveFuncPtr(void* sharedObj) {
200 std::string funcName(mName);
201 if (mClosures.front()->mClosure->mIsKernel) {
202 funcName.append(".expand");
203 }
204 mFunc = dlsym(sharedObj, funcName.c_str());
205 rsAssert (mFunc != nullptr);
206 }
207
~CpuScriptGroup2Impl()208 CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
209 for (Batch* batch : mBatches) {
210 delete batch;
211 }
212 delete mExecutable;
213 // TODO: move this dlclose into ~ScriptExecutable().
214 if (mScriptObj != nullptr) {
215 dlclose(mScriptObj);
216 }
217 }
218
219 namespace {
220
221 #ifndef RS_COMPATIBILITY_LIB
222
getCoreLibPath(Context * context,string * coreLibRelaxedPath)223 string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
224 *coreLibRelaxedPath = "";
225
226 // If we're debugging, use the debug library.
227 if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
228 return SYSLIBPATH"/libclcore_debug.bc";
229 }
230
231 // Check for a platform specific library
232
233 #if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
234 // NEON-capable ARMv7a devices can use an accelerated math library
235 // for all reduced precision scripts.
236 // ARMv8 does not use NEON, as ASIMD can be used with all precision
237 // levels.
238 *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc";
239 #endif
240
241 #if defined(__i386__) || defined(__x86_64__)
242 // x86 devices will use an optimized library.
243 return SYSLIBPATH"/libclcore_x86.bc";
244 #else
245 return SYSLIBPATH"/libclcore.bc";
246 #endif
247 }
248
setupCompileArguments(const vector<const char * > & inputs,const vector<string> & kernelBatches,const vector<string> & invokeBatches,const char * outputDir,const char * outputFileName,const char * coreLibPath,const char * coreLibRelaxedPath,const bool emitGlobalInfo,const bool emitGlobalInfoSkipConstant,vector<const char * > * args)249 void setupCompileArguments(
250 const vector<const char*>& inputs, const vector<string>& kernelBatches,
251 const vector<string>& invokeBatches,
252 const char* outputDir, const char* outputFileName,
253 const char* coreLibPath, const char* coreLibRelaxedPath,
254 const bool emitGlobalInfo, const bool emitGlobalInfoSkipConstant,
255 vector<const char*>* args) {
256 args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
257 args->push_back("-fPIC");
258 args->push_back("-embedRSInfo");
259 if (emitGlobalInfo) {
260 args->push_back("-rs-global-info");
261 if (emitGlobalInfoSkipConstant) {
262 args->push_back("-rs-global-info-skip-constant");
263 }
264 }
265 args->push_back("-mtriple");
266 args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
267 args->push_back("-bclib");
268 args->push_back(coreLibPath);
269 args->push_back("-bclib_relaxed");
270 args->push_back(coreLibRelaxedPath);
271 for (const char* input : inputs) {
272 args->push_back(input);
273 }
274 for (const string& batch : kernelBatches) {
275 args->push_back("-merge");
276 args->push_back(batch.c_str());
277 }
278 for (const string& batch : invokeBatches) {
279 args->push_back("-invoke");
280 args->push_back(batch.c_str());
281 }
282 args->push_back("-output_path");
283 args->push_back(outputDir);
284
285 // The output filename has to be the last, in case we need to pop it out and
286 // replace with a different name.
287 args->push_back("-o");
288 args->push_back(outputFileName);
289 }
290
generateSourceSlot(RsdCpuReferenceImpl * ctxt,const Closure & closure,const std::vector<const char * > & inputs,std::stringstream & ss)291 void generateSourceSlot(RsdCpuReferenceImpl* ctxt,
292 const Closure& closure,
293 const std::vector<const char*>& inputs,
294 std::stringstream& ss) {
295 const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
296 const Script* script = funcID->mScript;
297
298 rsAssert (!script->isIntrinsic());
299
300 const RsdCpuScriptImpl *cpuScript =
301 (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
302 const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
303
304 const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
305 inputs.begin();
306
307 ss << index << "," << funcID->mSlot << ".";
308 }
309
#endif  // RS_COMPATIBILITY_LIB
311
312 } // anonymous namespace
313
compile(const char * cacheDir)314 void CpuScriptGroup2Impl::compile(const char* cacheDir) {
315 #ifndef RS_COMPATIBILITY_LIB
316 if (mGroup->mClosures.size() < 2) {
317 return;
318 }
319
320 auto comparator = [](const char* str1, const char* str2) -> bool {
321 return strcmp(str1, str2) < 0;
322 };
323 std::set<const char*, decltype(comparator)> inputSet(comparator);
324
325 for (Closure* closure : mGroup->mClosures) {
326 const Script* script = closure->mFunctionID.get()->mScript;
327
328 // If any script is an intrinsic, give up trying fusing the kernels.
329 if (script->isIntrinsic()) {
330 return;
331 }
332
333 const RsdCpuScriptImpl *cpuScript =
334 (const RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(script);
335
336 const char* bitcodeFilename = cpuScript->getBitcodeFilePath();
337 inputSet.insert(bitcodeFilename);
338 }
339
340 std::vector<const char*> inputs(inputSet.begin(), inputSet.end());
341
342 std::vector<string> kernelBatches;
343 std::vector<string> invokeBatches;
344
345 int i = 0;
346 for (const auto& batch : mBatches) {
347 rsAssert(batch->size() > 0);
348
349 std::stringstream ss;
350 ss << batch->mName << ":";
351
352 if (!batch->mClosures.front()->mClosure->mIsKernel) {
353 rsAssert(batch->size() == 1);
354 generateSourceSlot(mCpuRefImpl, *batch->mClosures.front()->mClosure, inputs, ss);
355 invokeBatches.push_back(ss.str());
356 } else {
357 for (const auto& cpuClosure : batch->mClosures) {
358 generateSourceSlot(mCpuRefImpl, *cpuClosure->mClosure, inputs, ss);
359 }
360 kernelBatches.push_back(ss.str());
361 }
362 }
363
364 rsAssert(cacheDir != nullptr);
365 string objFilePath(cacheDir);
366 objFilePath.append("/");
367 objFilePath.append(mGroup->mName);
368 objFilePath.append(".o");
369
370 const char* resName = mGroup->mName;
371 string coreLibRelaxedPath;
372 const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
373 &coreLibRelaxedPath);
374
375 vector<const char*> arguments;
376 bool emitGlobalInfo = getCpuRefImpl()->getEmbedGlobalInfo();
377 bool emitGlobalInfoSkipConstant = getCpuRefImpl()->getEmbedGlobalInfoSkipConstant();
378 setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir,
379 resName, coreLibPath.c_str(), coreLibRelaxedPath.c_str(),
380 emitGlobalInfo, emitGlobalInfoSkipConstant,
381 &arguments);
382
383 std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1,
384 arguments.data()));
385
386 inputs.push_back(coreLibPath.c_str());
387 inputs.push_back(coreLibRelaxedPath.c_str());
388
389 uint32_t checksum = constructBuildChecksum(nullptr, 0, cmdLine.get(),
390 inputs.data(), inputs.size());
391
392 if (checksum == 0) {
393 return;
394 }
395
396 std::stringstream ss;
397 ss << std::hex << checksum;
398 const char* checksumStr = ss.str().c_str();
399
400 //===--------------------------------------------------------------------===//
401 // Try to load a shared lib from code cache matching filename and checksum
402 //===--------------------------------------------------------------------===//
403
404 bool alreadyLoaded = false;
405 std::string cloneName;
406
407 mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nullptr,
408 &alreadyLoaded);
409 if (mScriptObj != nullptr) {
410 // A shared library named resName is found in code cache directory
411 // cacheDir, and loaded with the handle stored in mScriptObj.
412
413 mExecutable = ScriptExecutable::createFromSharedObject(
414 getCpuRefImpl()->getContext(), mScriptObj, checksum);
415
416 if (mExecutable != nullptr) {
417 // The loaded shared library in mScriptObj has a matching checksum.
418 // An executable object has been created.
419 return;
420 }
421
422 ALOGV("Failed to create an executable object from so file due to "
423 "mismatching checksum");
424
425 if (alreadyLoaded) {
426 // The shared object found in code cache has already been loaded.
427 // A different file name is needed for the new shared library, to
428 // avoid corrupting the currently loaded instance.
429
430 cloneName.append(resName);
431 cloneName.append("#");
432 cloneName.append(SharedLibraryUtils::getRandomString(6).string());
433
434 // The last element in arguments is the output filename.
435 arguments.pop_back();
436 arguments.push_back(cloneName.c_str());
437 }
438
439 dlclose(mScriptObj);
440 mScriptObj = nullptr;
441 }
442
443 //===--------------------------------------------------------------------===//
444 // Fuse the input kernels and generate native code in an object file
445 //===--------------------------------------------------------------------===//
446
447 arguments.push_back("-build-checksum");
448 arguments.push_back(checksumStr);
449 arguments.push_back(nullptr);
450
451 bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
452 arguments.size()-1,
453 arguments.data());
454 if (!compiled) {
455 return;
456 }
457
458 //===--------------------------------------------------------------------===//
459 // Create and load the shared lib
460 //===--------------------------------------------------------------------===//
461
462 if (!SharedLibraryUtils::createSharedLibrary(
463 getCpuRefImpl()->getContext()->getDriverName(), cacheDir, resName)) {
464 ALOGE("Failed to link object file '%s'", resName);
465 unlink(objFilePath.c_str());
466 return;
467 }
468
469 unlink(objFilePath.c_str());
470
471 mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
472 if (mScriptObj == nullptr) {
473 ALOGE("Unable to load '%s'", resName);
474 return;
475 }
476
477 if (alreadyLoaded) {
478 // Delete the temporary, random-named file that we created to avoid
479 // interfering with an already loaded shared library.
480 string cloneFilePath(cacheDir);
481 cloneFilePath.append("/");
482 cloneFilePath.append(cloneName.c_str());
483 cloneFilePath.append(".so");
484 unlink(cloneFilePath.c_str());
485 }
486
487 mExecutable = ScriptExecutable::createFromSharedObject(
488 getCpuRefImpl()->getContext(),
489 mScriptObj);
490
491 #endif // RS_COMPATIBILITY_LIB
492 }
493
execute()494 void CpuScriptGroup2Impl::execute() {
495 for (auto batch : mBatches) {
496 batch->setGlobalsForBatch();
497 batch->run();
498 }
499 }
500
setGlobalsForBatch()501 void Batch::setGlobalsForBatch() {
502 for (CPUClosure* cpuClosure : mClosures) {
503 const Closure* closure = cpuClosure->mClosure;
504 const IDBase* funcID = closure->mFunctionID.get();
505 Script* s = funcID->mScript;;
506 for (const auto& p : closure->mGlobals) {
507 const void* value = p.second.first;
508 int size = p.second.second;
509 if (value == nullptr && size == 0) {
510 // This indicates the current closure depends on another closure for a
511 // global in their shared module (script). In this case we don't need to
512 // copy the value. For example, an invoke intializes a global variable
513 // which a kernel later reads.
514 continue;
515 }
516 rsAssert(p.first != nullptr);
517 Script* script = p.first->mScript;
518 RsdCpuReferenceImpl* ctxt = mGroup->getCpuRefImpl();
519 const RsdCpuScriptImpl *cpuScript =
520 (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
521 int slot = p.first->mSlot;
522 ScriptExecutable* exec = mGroup->getExecutable();
523 if (exec != nullptr) {
524 const char* varName = cpuScript->getFieldName(slot);
525 void* addr = exec->getFieldAddress(varName);
526 if (size < 0) {
527 rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
528 (rs_object_base*)addr, (ObjectBase*)value);
529 } else {
530 memcpy(addr, (const void*)&value, size);
531 }
532 } else {
533 // We use -1 size to indicate an ObjectBase rather than a primitive type
534 if (size < 0) {
535 s->setVarObj(slot, (ObjectBase*)value);
536 } else {
537 s->setVar(slot, (const void*)&value, size);
538 }
539 }
540 }
541 }
542 }
543
run()544 void Batch::run() {
545 if (!mClosures.front()->mClosure->mIsKernel) {
546 rsAssert(mClosures.size() == 1);
547
548 // This batch contains a single closure for an invoke function
549 CPUClosure* cc = mClosures.front();
550 const Closure* c = cc->mClosure;
551
552 if (mFunc != nullptr) {
553 // TODO: Need align pointers for x86_64.
554 // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
555 ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
556 } else {
557 const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
558 rsAssert(invokeID != nullptr);
559 cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
560 }
561
562 return;
563 }
564
565 if (mFunc != nullptr) {
566 MTLaunchStruct mtls;
567 const CPUClosure* firstCpuClosure = mClosures.front();
568 const CPUClosure* lastCpuClosure = mClosures.back();
569
570 firstCpuClosure->mSi->forEachMtlsSetup(
571 (const Allocation**)firstCpuClosure->mClosure->mArgs,
572 firstCpuClosure->mClosure->mNumArg,
573 lastCpuClosure->mClosure->mReturnValue,
574 nullptr, 0, nullptr, &mtls);
575
576 mtls.script = nullptr;
577 mtls.fep.usr = nullptr;
578 mtls.kernel = (ForEachFunc_t)mFunc;
579
580 mGroup->getCpuRefImpl()->launchThreads(
581 (const Allocation**)firstCpuClosure->mClosure->mArgs,
582 firstCpuClosure->mClosure->mNumArg,
583 lastCpuClosure->mClosure->mReturnValue,
584 nullptr, &mtls);
585
586 return;
587 }
588
589 for (CPUClosure* cpuClosure : mClosures) {
590 const Closure* closure = cpuClosure->mClosure;
591 const ScriptKernelID* kernelID =
592 (const ScriptKernelID*)closure->mFunctionID.get();
593 cpuClosure->mSi->preLaunch(kernelID->mSlot,
594 (const Allocation**)closure->mArgs,
595 closure->mNumArg, closure->mReturnValue,
596 nullptr, 0, nullptr);
597 }
598
599 const CPUClosure* cpuClosure = mClosures.front();
600 const Closure* closure = cpuClosure->mClosure;
601 MTLaunchStruct mtls;
602
603 if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
604 closure->mNumArg,
605 closure->mReturnValue,
606 nullptr, 0, nullptr, &mtls)) {
607
608 mtls.script = nullptr;
609 mtls.kernel = (void (*)())&groupRoot;
610 mtls.fep.usr = &mClosures;
611
612 mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
613 }
614
615 for (CPUClosure* cpuClosure : mClosures) {
616 const Closure* closure = cpuClosure->mClosure;
617 const ScriptKernelID* kernelID =
618 (const ScriptKernelID*)closure->mFunctionID.get();
619 cpuClosure->mSi->postLaunch(kernelID->mSlot,
620 (const Allocation**)closure->mArgs,
621 closure->mNumArg, closure->mReturnValue,
622 nullptr, 0, nullptr);
623 }
624 }
625
626 } // namespace renderscript
627 } // namespace android
628