1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "rsCpuIntrinsic.h"
18 #include "rsCpuIntrinsicInlines.h"
19 
20 using namespace android;
21 using namespace android::renderscript;
22 
23 namespace android {
24 namespace renderscript {
25 
26 
27 class RsdCpuScriptIntrinsicHistogram : public RsdCpuScriptIntrinsic {
28 public:
29     void populateScript(Script *) override;
30     void invokeFreeChildren() override;
31 
32     void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
33     void setGlobalObj(uint32_t slot, ObjectBase *data) override;
34 
35     ~RsdCpuScriptIntrinsicHistogram() override;
36     RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
37 
38 protected:
39     void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
40                    Allocation * aout, const void * usr,
41                    uint32_t usrLen, const RsScriptCall *sc);
42     void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
43                     Allocation * aout, const void * usr,
44                     uint32_t usrLen, const RsScriptCall *sc);
45 
46 
47     float mDot[4];
48     int mDotI[4];
49     int *mSums;
50     ObjectBaseRef<Allocation> mAllocOut;
51 
52     static void kernelP1U4(const RsExpandKernelDriverInfo *info,
53                            uint32_t xstart, uint32_t xend,
54                            uint32_t outstep);
55     static void kernelP1U3(const RsExpandKernelDriverInfo *info,
56                            uint32_t xstart, uint32_t xend,
57                            uint32_t outstep);
58     static void kernelP1U2(const RsExpandKernelDriverInfo *info,
59                            uint32_t xstart, uint32_t xend,
60                            uint32_t outstep);
61     static void kernelP1U1(const RsExpandKernelDriverInfo *info,
62                            uint32_t xstart, uint32_t xend,
63                            uint32_t outstep);
64 
65     static void kernelP1L4(const RsExpandKernelDriverInfo *info,
66                            uint32_t xstart, uint32_t xend,
67                            uint32_t outstep);
68     static void kernelP1L3(const RsExpandKernelDriverInfo *info,
69                            uint32_t xstart, uint32_t xend,
70                            uint32_t outstep);
71     static void kernelP1L2(const RsExpandKernelDriverInfo *info,
72                            uint32_t xstart, uint32_t xend,
73                            uint32_t outstep);
74     static void kernelP1L1(const RsExpandKernelDriverInfo *info,
75                            uint32_t xstart, uint32_t xend,
76                            uint32_t outstep);
77 
78 };
79 
80 }
81 }
82 
setGlobalObj(uint32_t slot,ObjectBase * data)83 void RsdCpuScriptIntrinsicHistogram::setGlobalObj(uint32_t slot, ObjectBase *data) {
84     rsAssert(slot == 1);
85     mAllocOut.set(static_cast<Allocation *>(data));
86 }
87 
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)88 void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
89     rsAssert(slot == 0);
90     rsAssert(dataLength == 16);
91     memcpy(mDot, data, 16);
92     mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
93     mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
94     mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
95     mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
96 }
97 
98 
99 
100 void
preLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)101 RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
102                                           const Allocation ** ains,
103                                           uint32_t inLen, Allocation * aout,
104                                           const void * usr, uint32_t usrLen,
105                                           const RsScriptCall *sc) {
106 
107     const uint32_t threads = mCtx->getThreadCount();
108     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
109 
110     switch (slot) {
111     case 0:
112         switch(vSize) {
113         case 1:
114             mRootPtr = &kernelP1U1;
115             break;
116         case 2:
117             mRootPtr = &kernelP1U2;
118             break;
119         case 3:
120             mRootPtr = &kernelP1U3;
121             vSize = 4;
122             break;
123         case 4:
124             mRootPtr = &kernelP1U4;
125             break;
126         }
127         break;
128     case 1:
129         switch(ains[0]->getType()->getElement()->getVectorSize()) {
130         case 1:
131             mRootPtr = &kernelP1L1;
132             break;
133         case 2:
134             mRootPtr = &kernelP1L2;
135             break;
136         case 3:
137             mRootPtr = &kernelP1L3;
138             break;
139         case 4:
140             mRootPtr = &kernelP1L4;
141             break;
142         }
143         break;
144     }
145     memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
146 }
147 
148 void
postLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)149 RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
150                                            const Allocation ** ains,
151                                            uint32_t inLen,  Allocation * aout,
152                                            const void * usr, uint32_t usrLen,
153                                            const RsScriptCall *sc) {
154 
155     unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
156     uint32_t threads = mCtx->getThreadCount();
157     uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
158 
159     if (vSize == 3) vSize = 4;
160 
161     for (uint32_t ct=0; ct < (256 * vSize); ct++) {
162         o[ct] = mSums[ct];
163         for (uint32_t t=1; t < threads; t++) {
164             o[ct] += mSums[ct + (256 * vSize * t)];
165         }
166     }
167 }
168 
kernelP1U4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)169 void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelDriverInfo *info,
170                                                 uint32_t xstart, uint32_t xend,
171                                                 uint32_t outstep) {
172 
173     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
174     uchar *in = (uchar *)info->inPtr[0];
175     int * sums = &cp->mSums[256 * 4 * info->lid];
176 
177     for (uint32_t x = xstart; x < xend; x++) {
178         sums[(in[0] << 2)    ] ++;
179         sums[(in[1] << 2) + 1] ++;
180         sums[(in[2] << 2) + 2] ++;
181         sums[(in[3] << 2) + 3] ++;
182         in += info->inStride[0];
183     }
184 }
185 
kernelP1U3(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)186 void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelDriverInfo *info,
187                                                 uint32_t xstart, uint32_t xend,
188                                                 uint32_t outstep) {
189 
190     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
191     uchar *in = (uchar *)info->inPtr[0];
192     int * sums = &cp->mSums[256 * 4 * info->lid];
193 
194     for (uint32_t x = xstart; x < xend; x++) {
195         sums[(in[0] << 2)    ] ++;
196         sums[(in[1] << 2) + 1] ++;
197         sums[(in[2] << 2) + 2] ++;
198         in += info->inStride[0];
199     }
200 }
201 
kernelP1U2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)202 void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelDriverInfo *info,
203                                                 uint32_t xstart, uint32_t xend,
204                                                 uint32_t outstep) {
205 
206     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
207     uchar *in = (uchar *)info->inPtr[0];
208     int * sums = &cp->mSums[256 * 2 * info->lid];
209 
210     for (uint32_t x = xstart; x < xend; x++) {
211         sums[(in[0] << 1)    ] ++;
212         sums[(in[1] << 1) + 1] ++;
213         in += info->inStride[0];
214     }
215 }
216 
kernelP1L4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)217 void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelDriverInfo *info,
218                                                 uint32_t xstart, uint32_t xend,
219                                                 uint32_t outstep) {
220 
221     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
222     uchar *in = (uchar *)info->inPtr[0];
223     int * sums = &cp->mSums[256 * info->lid];
224 
225     for (uint32_t x = xstart; x < xend; x++) {
226         int t = (cp->mDotI[0] * in[0]) +
227                 (cp->mDotI[1] * in[1]) +
228                 (cp->mDotI[2] * in[2]) +
229                 (cp->mDotI[3] * in[3]);
230         sums[(t + 0x7f) >> 8] ++;
231         in += info->inStride[0];
232     }
233 }
234 
kernelP1L3(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)235 void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelDriverInfo *info,
236                                                 uint32_t xstart, uint32_t xend,
237                                                 uint32_t outstep) {
238 
239     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
240     uchar *in = (uchar *)info->inPtr[0];
241     int * sums = &cp->mSums[256 * info->lid];
242 
243     for (uint32_t x = xstart; x < xend; x++) {
244         int t = (cp->mDotI[0] * in[0]) +
245                 (cp->mDotI[1] * in[1]) +
246                 (cp->mDotI[2] * in[2]);
247         sums[(t + 0x7f) >> 8] ++;
248         in += info->inStride[0];
249     }
250 }
251 
kernelP1L2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)252 void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelDriverInfo *info,
253                                                 uint32_t xstart, uint32_t xend,
254                                                 uint32_t outstep) {
255 
256     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
257     uchar *in = (uchar *)info->inPtr[0];
258     int * sums = &cp->mSums[256 * info->lid];
259 
260     for (uint32_t x = xstart; x < xend; x++) {
261         int t = (cp->mDotI[0] * in[0]) +
262                 (cp->mDotI[1] * in[1]);
263         sums[(t + 0x7f) >> 8] ++;
264         in += info->inStride[0];
265     }
266 }
267 
kernelP1L1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)268 void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelDriverInfo *info,
269                                                 uint32_t xstart, uint32_t xend,
270                                                 uint32_t outstep) {
271 
272     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
273     uchar *in = (uchar *)info->inPtr[0];
274     int * sums = &cp->mSums[256 * info->lid];
275 
276     for (uint32_t x = xstart; x < xend; x++) {
277         int t = (cp->mDotI[0] * in[0]);
278         sums[(t + 0x7f) >> 8] ++;
279         in += info->inStride[0];
280     }
281 }
282 
kernelP1U1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)283 void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelDriverInfo *info,
284                                                 uint32_t xstart, uint32_t xend,
285                                                 uint32_t outstep) {
286 
287     RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
288     uchar *in = (uchar *)info->inPtr[0];
289     int * sums = &cp->mSums[256 * info->lid];
290 
291     for (uint32_t x = xstart; x < xend; x++) {
292         sums[in[0]] ++;
293         in += info->inStride[0];
294     }
295 }
296 
297 
RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)298 RsdCpuScriptIntrinsicHistogram::RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx,
299                                                      const Script *s, const Element *e)
300             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) {
301 
302     mRootPtr = nullptr;
303     mSums = new int[256 * 4 * mCtx->getThreadCount()];
304     mDot[0] = 0.299f;
305     mDot[1] = 0.587f;
306     mDot[2] = 0.114f;
307     mDot[3] = 0;
308     mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
309     mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
310     mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
311     mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
312 }
313 
~RsdCpuScriptIntrinsicHistogram()314 RsdCpuScriptIntrinsicHistogram::~RsdCpuScriptIntrinsicHistogram() {
315     if (mSums) {
316         delete []mSums;
317     }
318 }
319 
populateScript(Script * s)320 void RsdCpuScriptIntrinsicHistogram::populateScript(Script *s) {
321     s->mHal.info.exportedVariableCount = 2;
322 }
323 
invokeFreeChildren()324 void RsdCpuScriptIntrinsicHistogram::invokeFreeChildren() {
325 }
326 
327 
rsdIntrinsic_Histogram(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)328 RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
329 
330     return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
331 }
332