/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef RSD_CPU_CORE_H
18 #define RSD_CPU_CORE_H
19 
20 #include "rsd_cpu.h"
21 #include "rsSignal.h"
22 #include "rsContext.h"
23 #include "rsCppUtils.h"
24 #include "rsElement.h"
25 #include "rsScriptC.h"
26 #include "rsCpuCoreRuntime.h"
27 
28 namespace android {
29 namespace renderscript {
30 
// True when the CPU we are running on supports SIMD instructions.
// Declaration only; the definition lives outside this header.
extern bool gArchUseSIMD;
33 
34 // Function types found in RenderScript code
35 typedef void (*ReduceAccumulatorFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint8_t *accum);
// User combiner function of a reduce kernel: takes the accumulator to update
// and a second, read-only accumulator.
using ReduceCombinerFunc_t = void (*)(uint8_t *accum, const uint8_t *other);

// User initializer function of a reduce kernel: receives the accumulator
// storage to initialize.
using ReduceInitializerFunc_t = void (*)(uint8_t *accum);

// User outconverter function of a reduce kernel: writes to `out` from a
// read-only accumulator.
using ReduceOutConverterFunc_t = void (*)(uint8_t *out, const uint8_t *accum);
39 typedef void (*ForEachFunc_t)(const RsExpandKernelDriverInfo *info, uint32_t x1, uint32_t x2, uint32_t outStride);
// Invokable script function; receives an opaque pointer to its parameters.
using InvokeFunc_t = void (*)(void *params);

// Script initializer or destructor hook: no arguments, no result.
using InitOrDtorFunc_t = void (*)(void);

// Script root function: no arguments, returns an int.
using RootFunc_t = int (*)(void);
43 
44 struct ReduceDescription {
45     ReduceAccumulatorFunc_t  accumFunc;  // expanded accumulator function
46     ReduceInitializerFunc_t  initFunc;   // user initializer function
47     ReduceCombinerFunc_t     combFunc;   // user combiner function
48     ReduceOutConverterFunc_t outFunc;    // user outconverter function
49     size_t                   accumSize;  // accumulator datum size, in bytes
50 };
51 
// Internal driver callback used to execute a kernel.  `usr` is the opaque
// launch data handed back to the callback; `idx` is an index supplied by the
// caller (NOTE(review): presumably the worker thread index -- confirm).
using WorkerCallback_t = void (*)(void *usr, uint32_t idx);
54 
// Forward declarations: the structs below only hold pointers to these types.
class RsdCpuScriptImpl;
class RsdCpuReferenceImpl;
57 
58 struct ScriptTLSStruct {
59     android::renderscript::Context * mContext;
60     const android::renderscript::Script * mScript;
61     RsdCpuScriptImpl *mImpl;
62 };
63 
64 // MTLaunchStruct passes information about a multithreaded kernel launch.
65 struct MTLaunchStructCommon {
66     RsdCpuReferenceImpl *rs;
67     RsdCpuScriptImpl *script;
68 
69     uint32_t mSliceSize;
70     volatile int mSliceNum;
71     bool isThreadable;
72 
73     // Boundary information about the launch
74     RsLaunchDimensions start;
75     RsLaunchDimensions end;
76     // Points to MTLaunchStructForEach::fep::dim or
77     // MTLaunchStructReduce::redp::dim.
78     RsLaunchDimensions *dimPtr;
79 };
80 
81 struct MTLaunchStructForEach : public MTLaunchStructCommon {
82     // Driver info structure
83     RsExpandKernelDriverInfo fep;
84 
85     ForEachFunc_t kernel;
86     const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
87     Allocation *aout[RS_KERNEL_INPUT_LIMIT];
88 };
89 
90 struct MTLaunchStructReduce : public MTLaunchStructCommon {
91     // Driver info structure
92     RsExpandKernelDriverInfo redp;
93 
94     const Allocation *ains[RS_KERNEL_INPUT_LIMIT];
95 
96     ReduceAccumulatorFunc_t accumFunc;
97     ReduceInitializerFunc_t initFunc;
98     ReduceCombinerFunc_t combFunc;
99     ReduceOutConverterFunc_t outFunc;
100 
101     size_t accumSize;  // accumulator datum size in bytes
102 
103     size_t accumStride;  // stride between accumulators in accumAlloc (below)
104 
105     // These fields are used for managing accumulator data items in a
106     // multithreaded execution.
107     //
108     // Let the number of threads be N.
109     // Let Outc be true iff there is an outconverter.
110     //
111     // accumAlloc is a pointer to a single allocation of (N - !Outc)
112     // accumulators.  (If there is no outconverter, then the output
113     // allocation acts as an accumulator.)  It is created at kernel
114     // launch time.  Within that allocation, the distance between the
115     // start of adjacent accumulators is accumStride bytes -- this
116     // might be the same as accumSize, or it might be larger, if we
117     // are attempting to avoid false sharing.
118     //
119     // accumCount is an atomic counter of how many accumulators have
120     // been grabbed by threads.  It is initialized to zero at kernel
121     // launch time.  See accumPtr for further description.
122     //
123     // accumPtr is pointer to an array of N pointers to accumulators.
124     // The array is created at kernel launch time, and each element is
125     // initialized to nullptr.  When a particular thread goes to work,
126     // that thread obtains its accumulator from its entry in this
127     // array.  If the entry is nullptr, that thread needs to obtain an
128     // accumulator, and initialize its entry in the array accordingly.
129     // It does so via atomic access (fetch-and-add) to accumCount.
130     // - If Outc, then the fetched value is used as an index into
131     //   accumAlloc.
132     // - If !Outc, then
133     //   - If the fetched value is zero, then this thread gets the
134     //     output allocation for its accumulator.
135     //   - If the fetched value is nonzero, then (fetched value - 1)
136     //     is used as an index into accumAlloc.
137     uint8_t *accumAlloc;
138     uint8_t **accumPtr;
139     uint32_t accumCount;
140 
141     // Logging control
142     uint32_t logReduce;
143 };
144 
145 class RsdCpuReferenceImpl : public RsdCpuReference {
146 public:
147     ~RsdCpuReferenceImpl() override;
148     RsdCpuReferenceImpl(Context *);
149 
150     void lockMutex();
151     void unlockMutex();
152 
153     bool init(uint32_t version_major, uint32_t version_minor, sym_lookup_t, script_lookup_t);
154     void setPriority(int32_t priority) override;
155     virtual void launchThreads(WorkerCallback_t cbk, void *data);
156     static void * helperThreadProc(void *vrsc);
157     RsdCpuScriptImpl * setTLS(RsdCpuScriptImpl *sc);
158 
getContext()159     Context * getContext() {return mRSC;}
getThreadCount()160     uint32_t getThreadCount() const {
161         return mWorkers.mCount + 1;
162     }
163 
164     // Launch foreach kernel
165     void launchForEach(const Allocation **ains, uint32_t inLen, Allocation *aout,
166                        const RsScriptCall *sc, MTLaunchStructForEach *mtls);
167 
168     // Launch a general reduce kernel
169     void launchReduce(const Allocation ** ains, uint32_t inLen, Allocation *aout,
170                       MTLaunchStructReduce *mtls);
171 
172     CpuScript * createScript(const ScriptC *s, char const *resName, char const *cacheDir,
173                              uint8_t const *bitcode, size_t bitcodeSize, uint32_t flags) override;
174     CpuScript * createIntrinsic(const Script *s, RsScriptIntrinsicID iid, Element *e) override;
175     void* createScriptGroup(const ScriptGroupBase *sg) override;
176 
177     const RsdCpuReference::CpuSymbol *symLookup(const char *);
178 
lookupScript(const Script * s)179     RsdCpuReference::CpuScript *lookupScript(const Script *s) {
180         return mScriptLookupFn(mRSC, s);
181     }
182 
setSelectRTCallback(RSSelectRTCallback pSelectRTCallback)183     void setSelectRTCallback(RSSelectRTCallback pSelectRTCallback) {
184         mSelectRTCallback = pSelectRTCallback;
185     }
getSelectRTCallback()186     RSSelectRTCallback getSelectRTCallback() {
187         return mSelectRTCallback;
188     }
189 
setBccPluginName(const char * name)190     virtual void setBccPluginName(const char *name) {
191         mBccPluginName.setTo(name);
192     }
getBccPluginName()193     virtual const char *getBccPluginName() const {
194         return mBccPluginName.string();
195     }
getInKernel()196     bool getInKernel() override { return mInKernel; }
197 
198     // Set to true if we should embed global variable information in the code.
setEmbedGlobalInfo(bool v)199     void setEmbedGlobalInfo(bool v) override {
200         mEmbedGlobalInfo = v;
201     }
202 
203     // Returns true if we should embed global variable information in the code.
getEmbedGlobalInfo()204     bool getEmbedGlobalInfo() const override {
205         return mEmbedGlobalInfo;
206     }
207 
208     // Set to true if we should skip constant (immutable) global variables when
209     // potentially embedding information about globals.
setEmbedGlobalInfoSkipConstant(bool v)210     void setEmbedGlobalInfoSkipConstant(bool v) override {
211         mEmbedGlobalInfoSkipConstant = v;
212     }
213 
214     // Returns true if we should skip constant (immutable) global variables when
215     // potentially embedding information about globals.
getEmbedGlobalInfoSkipConstant()216     bool getEmbedGlobalInfoSkipConstant() const override {
217         return mEmbedGlobalInfoSkipConstant;
218     }
219 
220 protected:
221     Context *mRSC;
222     uint32_t version_major;
223     uint32_t version_minor;
224     //bool mHasGraphics;
225     bool mInKernel;  // Is a parallel kernel execution underway?
226 
227     struct Workers {
228         volatile int mRunningCount;
229         volatile int mLaunchCount;
230         uint32_t mCount;
231         pthread_t *mThreadId;
232         pid_t *mNativeThreadId;
233         Signal mCompleteSignal;
234         Signal *mLaunchSignals;
235         WorkerCallback_t mLaunchCallback;
236         void *mLaunchData;
237     };
238     Workers mWorkers;
239     bool mExit;
240     sym_lookup_t mSymLookupFn;
241     script_lookup_t mScriptLookupFn;
242 
243     ScriptTLSStruct mTlsStruct;
244 
245     RSSelectRTCallback mSelectRTCallback;
246     String8 mBccPluginName;
247 
248     // Specifies whether we should embed global variable information in the
249     // code via special RS variables that can be examined later by the driver.
250     // Defaults to true.
251     bool mEmbedGlobalInfo;
252 
253     // Specifies whether we should skip constant (immutable) global variables
254     // when potentially embedding information about globals.
255     // Defaults to true.
256     bool mEmbedGlobalInfoSkipConstant;
257 
258     long mPageSize;
259 
260     // Launch a general reduce kernel
261     void launchReduceSerial(const Allocation ** ains, uint32_t inLen, Allocation *aout,
262                             MTLaunchStructReduce *mtls);
263     void launchReduceParallel(const Allocation ** ains, uint32_t inLen, Allocation *aout,
264                               MTLaunchStructReduce *mtls);
265 };
266 
267 
268 }
269 }
270 
271 #endif
272