1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "rsCpuIntrinsic.h"
18 #include "rsCpuIntrinsicInlines.h"
19 
20 using namespace android;
21 using namespace android::renderscript;
22 
23 namespace android {
24 namespace renderscript {
25 
26 
// CPU implementation of the histogram intrinsic
// (RS_SCRIPT_INTRINSIC_ID_HISTOGRAM).
//
// Two exported slots are used: slot 0 receives four float coefficients via
// setGlobalVar(), slot 1 receives the output allocation via setGlobalObj().
// preLaunch() selects one of the static kernels below and clears the
// scratch bins; postLaunch() adds the per-thread bins together and writes
// the totals into the output allocation.
class RsdCpuScriptIntrinsicHistogram : public RsdCpuScriptIntrinsic {
public:
    virtual void populateScript(Script *);
    virtual void invokeFreeChildren();

    // Slot 0: 16 bytes that are copied into mDot (and converted into mDotI).
    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
    // Slot 1: the allocation that postLaunch() writes the histogram into.
    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);

    virtual ~RsdCpuScriptIntrinsicHistogram();
    RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

protected:
    // Picks the kernel for the launch (slot 0 -> kernelP1U*, slot 1 ->
    // kernelP1L*) and zeroes mSums.
    void preLaunch(uint32_t slot, const Allocation * ain,
                   Allocation * aout, const void * usr,
                   uint32_t usrLen, const RsScriptCall *sc);
    // Reduces the per-thread bins in mSums into mAllocOut.
    void postLaunch(uint32_t slot, const Allocation * ain,
                    Allocation * aout, const void * usr,
                    uint32_t usrLen, const RsScriptCall *sc);


    // Coefficients used by the kernelP1L* kernels; the constructor sets
    // them to 0.299, 0.587, 0.114, 0.
    float mDot[4];
    // mDot scaled by 256 and rounded to int, so the kernels can work in
    // integer arithmetic.
    int mDotI[4];
    // Scratch bins, allocated as 256 * 4 * threadCount ints. Each kernel
    // indexes its own region using p->lid.
    int *mSums;
    // Output allocation supplied through slot 1.
    ObjectBaseRef<Allocation> mAllocOut;

    // Per-channel kernels: each input channel is counted in its own set of
    // 256 interleaved bins. The digit is the vector size preLaunch() matches
    // against the output element.
    static void kernelP1U4(const RsForEachStubParamStruct *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
    static void kernelP1U3(const RsForEachStubParamStruct *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
    static void kernelP1U2(const RsForEachStubParamStruct *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);
    static void kernelP1U1(const RsForEachStubParamStruct *p,
                          uint32_t xstart, uint32_t xend,
                          uint32_t instep, uint32_t outstep);

    // Weighted-sum kernels: the channels are combined with mDotI into one
    // value that is counted in a single set of 256 bins. The digit is the
    // vector size preLaunch() matches against the input element.
    static void kernelP1L4(const RsForEachStubParamStruct *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
    static void kernelP1L3(const RsForEachStubParamStruct *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
    static void kernelP1L2(const RsForEachStubParamStruct *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);
    static void kernelP1L1(const RsForEachStubParamStruct *p,
                           uint32_t xstart, uint32_t xend,
                           uint32_t instep, uint32_t outstep);

};
79 
80 }
81 }
82 
setGlobalObj(uint32_t slot,ObjectBase * data)83 void RsdCpuScriptIntrinsicHistogram::setGlobalObj(uint32_t slot, ObjectBase *data) {
84     rsAssert(slot == 1);
85     mAllocOut.set(static_cast<Allocation *>(data));
86 }
87 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)88 void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
89     rsAssert(slot == 0);
90     rsAssert(dataLength == 16);
91     memcpy(mDot, data, 16);
92     mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
93     mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
94     mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
95     mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
96 }
97 
98 
99 
preLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)100 void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
101                                       Allocation * aout, const void * usr,
102                                       uint32_t usrLen, const RsScriptCall *sc) {
103 
104     const uint32_t threads = mCtx->getThreadCount();
105     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
106 
107     switch (slot) {
108     case 0:
109         switch(vSize) {
110         case 1:
111             mRootPtr = &kernelP1U1;
112             break;
113         case 2:
114             mRootPtr = &kernelP1U2;
115             break;
116         case 3:
117             mRootPtr = &kernelP1U3;
118             vSize = 4;
119             break;
120         case 4:
121             mRootPtr = &kernelP1U4;
122             break;
123         }
124         break;
125     case 1:
126         switch(ain->getType()->getElement()->getVectorSize()) {
127         case 1:
128             mRootPtr = &kernelP1L1;
129             break;
130         case 2:
131             mRootPtr = &kernelP1L2;
132             break;
133         case 3:
134             mRootPtr = &kernelP1L3;
135             break;
136         case 4:
137             mRootPtr = &kernelP1L4;
138             break;
139         }
140         break;
141     }
142     memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
143 }
144 
postLaunch(uint32_t slot,const Allocation * ain,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)145 void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
146                                        Allocation * aout, const void * usr,
147                                        uint32_t usrLen, const RsScriptCall *sc) {
148 
149     unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
150     uint32_t threads = mCtx->getThreadCount();
151     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
152 
153     if (vSize == 3) vSize = 4;
154 
155     for (uint32_t ct=0; ct < (256 * vSize); ct++) {
156         o[ct] = mSums[ct];
157         for (uint32_t t=1; t < threads; t++) {
158             o[ct] += mSums[ct + (256 * vSize * t)];
159         }
160     }
161 }
162 
kernelP1U4(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)163 void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsForEachStubParamStruct *p,
164                                                 uint32_t xstart, uint32_t xend,
165                                                 uint32_t instep, uint32_t outstep) {
166 
167     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
168     uchar *in = (uchar *)p->in;
169     int * sums = &cp->mSums[256 * 4 * p->lid];
170 
171     for (uint32_t x = xstart; x < xend; x++) {
172         sums[(in[0] << 2)    ] ++;
173         sums[(in[1] << 2) + 1] ++;
174         sums[(in[2] << 2) + 2] ++;
175         sums[(in[3] << 2) + 3] ++;
176         in += instep;
177     }
178 }
179 
kernelP1U3(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)180 void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsForEachStubParamStruct *p,
181                                                 uint32_t xstart, uint32_t xend,
182                                                 uint32_t instep, uint32_t outstep) {
183 
184     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
185     uchar *in = (uchar *)p->in;
186     int * sums = &cp->mSums[256 * 4 * p->lid];
187 
188     for (uint32_t x = xstart; x < xend; x++) {
189         sums[(in[0] << 2)    ] ++;
190         sums[(in[1] << 2) + 1] ++;
191         sums[(in[2] << 2) + 2] ++;
192         in += instep;
193     }
194 }
195 
kernelP1U2(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)196 void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsForEachStubParamStruct *p,
197                                                 uint32_t xstart, uint32_t xend,
198                                                 uint32_t instep, uint32_t outstep) {
199 
200     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
201     uchar *in = (uchar *)p->in;
202     int * sums = &cp->mSums[256 * 2 * p->lid];
203 
204     for (uint32_t x = xstart; x < xend; x++) {
205         sums[(in[0] << 1)    ] ++;
206         sums[(in[1] << 1) + 1] ++;
207         in += instep;
208     }
209 }
210 
kernelP1L4(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)211 void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsForEachStubParamStruct *p,
212                                                 uint32_t xstart, uint32_t xend,
213                                                 uint32_t instep, uint32_t outstep) {
214 
215     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
216     uchar *in = (uchar *)p->in;
217     int * sums = &cp->mSums[256 * p->lid];
218 
219     for (uint32_t x = xstart; x < xend; x++) {
220         int t = (cp->mDotI[0] * in[0]) +
221                 (cp->mDotI[1] * in[1]) +
222                 (cp->mDotI[2] * in[2]) +
223                 (cp->mDotI[3] * in[3]);
224         sums[(t + 0x7f) >> 8] ++;
225         in += instep;
226     }
227 }
228 
kernelP1L3(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)229 void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsForEachStubParamStruct *p,
230                                                 uint32_t xstart, uint32_t xend,
231                                                 uint32_t instep, uint32_t outstep) {
232 
233     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
234     uchar *in = (uchar *)p->in;
235     int * sums = &cp->mSums[256 * p->lid];
236 
237     for (uint32_t x = xstart; x < xend; x++) {
238         int t = (cp->mDotI[0] * in[0]) +
239                 (cp->mDotI[1] * in[1]) +
240                 (cp->mDotI[2] * in[2]);
241         sums[(t + 0x7f) >> 8] ++;
242         in += instep;
243     }
244 }
245 
kernelP1L2(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)246 void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsForEachStubParamStruct *p,
247                                                 uint32_t xstart, uint32_t xend,
248                                                 uint32_t instep, uint32_t outstep) {
249 
250     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
251     uchar *in = (uchar *)p->in;
252     int * sums = &cp->mSums[256 * p->lid];
253 
254     for (uint32_t x = xstart; x < xend; x++) {
255         int t = (cp->mDotI[0] * in[0]) +
256                 (cp->mDotI[1] * in[1]);
257         sums[(t + 0x7f) >> 8] ++;
258         in += instep;
259     }
260 }
261 
kernelP1L1(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)262 void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsForEachStubParamStruct *p,
263                                                 uint32_t xstart, uint32_t xend,
264                                                 uint32_t instep, uint32_t outstep) {
265 
266     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
267     uchar *in = (uchar *)p->in;
268     int * sums = &cp->mSums[256 * p->lid];
269 
270     for (uint32_t x = xstart; x < xend; x++) {
271         int t = (cp->mDotI[0] * in[0]);
272         sums[(t + 0x7f) >> 8] ++;
273         in += instep;
274     }
275 }
276 
kernelP1U1(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)277 void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsForEachStubParamStruct *p,
278                                                 uint32_t xstart, uint32_t xend,
279                                                 uint32_t instep, uint32_t outstep) {
280 
281     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
282     uchar *in = (uchar *)p->in;
283     int * sums = &cp->mSums[256 * p->lid];
284 
285     for (uint32_t x = xstart; x < xend; x++) {
286         sums[in[0]] ++;
287         in += instep;
288     }
289 }
290 
291 
RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)292 RsdCpuScriptIntrinsicHistogram::RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx,
293                                                      const Script *s, const Element *e)
294             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) {
295 
296     mRootPtr = NULL;
297     mSums = new int[256 * 4 * mCtx->getThreadCount()];
298     mDot[0] = 0.299f;
299     mDot[1] = 0.587f;
300     mDot[2] = 0.114f;
301     mDot[3] = 0;
302     mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
303     mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
304     mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
305     mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
306 }
307 
~RsdCpuScriptIntrinsicHistogram()308 RsdCpuScriptIntrinsicHistogram::~RsdCpuScriptIntrinsicHistogram() {
309     if (mSums) {
310         delete []mSums;
311     }
312 }
313 
populateScript(Script * s)314 void RsdCpuScriptIntrinsicHistogram::populateScript(Script *s) {
315     s->mHal.info.exportedVariableCount = 2;
316 }
317 
// Intentionally does nothing in this intrinsic.
void RsdCpuScriptIntrinsicHistogram::invokeFreeChildren() {
}
320 
321 
rsdIntrinsic_Histogram(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)322 RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
323 
324     return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
325 }
326 
327 
328