1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18 #include "rsCpuIntrinsic.h"
19 #include "rsCpuIntrinsicInlines.h"
20
21 using namespace android;
22 using namespace android::renderscript;
23
24 namespace android {
25 namespace renderscript {
26
27
28 class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
29 public:
30 virtual void populateScript(Script *);
31 virtual void invokeFreeChildren();
32
33 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34 virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35
36 virtual ~RsdCpuScriptIntrinsicConvolve5x5();
37 RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
38
39 protected:
40 float mFp[28];
41 short mIp[28];
42 ObjectBaseRef<Allocation> alloc;
43
44
45 static void kernelU1(const RsForEachStubParamStruct *p,
46 uint32_t xstart, uint32_t xend,
47 uint32_t instep, uint32_t outstep);
48 static void kernelU2(const RsForEachStubParamStruct *p,
49 uint32_t xstart, uint32_t xend,
50 uint32_t instep, uint32_t outstep);
51 static void kernelU4(const RsForEachStubParamStruct *p,
52 uint32_t xstart, uint32_t xend,
53 uint32_t instep, uint32_t outstep);
54 static void kernelF1(const RsForEachStubParamStruct *p,
55 uint32_t xstart, uint32_t xend,
56 uint32_t instep, uint32_t outstep);
57 static void kernelF2(const RsForEachStubParamStruct *p,
58 uint32_t xstart, uint32_t xend,
59 uint32_t instep, uint32_t outstep);
60 static void kernelF4(const RsForEachStubParamStruct *p,
61 uint32_t xstart, uint32_t xend,
62 uint32_t instep, uint32_t outstep);
63
64
65 };
66
67 }
68 }
69
setGlobalObj(uint32_t slot,ObjectBase * data)70 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
71 rsAssert(slot == 1);
72 alloc.set(static_cast<Allocation *>(data));
73 }
74
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)75 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
76 const void *data, size_t dataLength) {
77 rsAssert(slot == 0);
78 memcpy (&mFp, data, dataLength);
79 for(int ct=0; ct < 25; ct++) {
80 if (mFp[ct] >= 0) {
81 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
82 } else {
83 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
84 }
85 }
86 }
87
88
OneU4(const RsForEachStubParamStruct * p,uint32_t x,uchar4 * out,const uchar4 * py0,const uchar4 * py1,const uchar4 * py2,const uchar4 * py3,const uchar4 * py4,const float * coeff)89 static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
90 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
91 const float* coeff) {
92
93 uint32_t x0 = rsMax((int32_t)x-2, 0);
94 uint32_t x1 = rsMax((int32_t)x-1, 0);
95 uint32_t x2 = x;
96 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
97 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
98
99 float4 px = convert_float4(py0[x0]) * coeff[0] +
100 convert_float4(py0[x1]) * coeff[1] +
101 convert_float4(py0[x2]) * coeff[2] +
102 convert_float4(py0[x3]) * coeff[3] +
103 convert_float4(py0[x4]) * coeff[4] +
104
105 convert_float4(py1[x0]) * coeff[5] +
106 convert_float4(py1[x1]) * coeff[6] +
107 convert_float4(py1[x2]) * coeff[7] +
108 convert_float4(py1[x3]) * coeff[8] +
109 convert_float4(py1[x4]) * coeff[9] +
110
111 convert_float4(py2[x0]) * coeff[10] +
112 convert_float4(py2[x1]) * coeff[11] +
113 convert_float4(py2[x2]) * coeff[12] +
114 convert_float4(py2[x3]) * coeff[13] +
115 convert_float4(py2[x4]) * coeff[14] +
116
117 convert_float4(py3[x0]) * coeff[15] +
118 convert_float4(py3[x1]) * coeff[16] +
119 convert_float4(py3[x2]) * coeff[17] +
120 convert_float4(py3[x3]) * coeff[18] +
121 convert_float4(py3[x4]) * coeff[19] +
122
123 convert_float4(py4[x0]) * coeff[20] +
124 convert_float4(py4[x1]) * coeff[21] +
125 convert_float4(py4[x2]) * coeff[22] +
126 convert_float4(py4[x3]) * coeff[23] +
127 convert_float4(py4[x4]) * coeff[24];
128 px = clamp(px + 0.5f, 0.f, 255.f);
129 *out = convert_uchar4(px);
130 }
131
OneU2(const RsForEachStubParamStruct * p,uint32_t x,uchar2 * out,const uchar2 * py0,const uchar2 * py1,const uchar2 * py2,const uchar2 * py3,const uchar2 * py4,const float * coeff)132 static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
133 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
134 const float* coeff) {
135
136 uint32_t x0 = rsMax((int32_t)x-2, 0);
137 uint32_t x1 = rsMax((int32_t)x-1, 0);
138 uint32_t x2 = x;
139 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
140 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
141
142 float2 px = convert_float2(py0[x0]) * coeff[0] +
143 convert_float2(py0[x1]) * coeff[1] +
144 convert_float2(py0[x2]) * coeff[2] +
145 convert_float2(py0[x3]) * coeff[3] +
146 convert_float2(py0[x4]) * coeff[4] +
147
148 convert_float2(py1[x0]) * coeff[5] +
149 convert_float2(py1[x1]) * coeff[6] +
150 convert_float2(py1[x2]) * coeff[7] +
151 convert_float2(py1[x3]) * coeff[8] +
152 convert_float2(py1[x4]) * coeff[9] +
153
154 convert_float2(py2[x0]) * coeff[10] +
155 convert_float2(py2[x1]) * coeff[11] +
156 convert_float2(py2[x2]) * coeff[12] +
157 convert_float2(py2[x3]) * coeff[13] +
158 convert_float2(py2[x4]) * coeff[14] +
159
160 convert_float2(py3[x0]) * coeff[15] +
161 convert_float2(py3[x1]) * coeff[16] +
162 convert_float2(py3[x2]) * coeff[17] +
163 convert_float2(py3[x3]) * coeff[18] +
164 convert_float2(py3[x4]) * coeff[19] +
165
166 convert_float2(py4[x0]) * coeff[20] +
167 convert_float2(py4[x1]) * coeff[21] +
168 convert_float2(py4[x2]) * coeff[22] +
169 convert_float2(py4[x3]) * coeff[23] +
170 convert_float2(py4[x4]) * coeff[24];
171 px = clamp(px + 0.5f, 0.f, 255.f);
172 *out = convert_uchar2(px);
173 }
174
OneU1(const RsForEachStubParamStruct * p,uint32_t x,uchar * out,const uchar * py0,const uchar * py1,const uchar * py2,const uchar * py3,const uchar * py4,const float * coeff)175 static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
176 const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
177 const float* coeff) {
178
179 uint32_t x0 = rsMax((int32_t)x-2, 0);
180 uint32_t x1 = rsMax((int32_t)x-1, 0);
181 uint32_t x2 = x;
182 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
183 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
184
185 float px = (float)(py0[x0]) * coeff[0] +
186 (float)(py0[x1]) * coeff[1] +
187 (float)(py0[x2]) * coeff[2] +
188 (float)(py0[x3]) * coeff[3] +
189 (float)(py0[x4]) * coeff[4] +
190
191 (float)(py1[x0]) * coeff[5] +
192 (float)(py1[x1]) * coeff[6] +
193 (float)(py1[x2]) * coeff[7] +
194 (float)(py1[x3]) * coeff[8] +
195 (float)(py1[x4]) * coeff[9] +
196
197 (float)(py2[x0]) * coeff[10] +
198 (float)(py2[x1]) * coeff[11] +
199 (float)(py2[x2]) * coeff[12] +
200 (float)(py2[x3]) * coeff[13] +
201 (float)(py2[x4]) * coeff[14] +
202
203 (float)(py3[x0]) * coeff[15] +
204 (float)(py3[x1]) * coeff[16] +
205 (float)(py3[x2]) * coeff[17] +
206 (float)(py3[x3]) * coeff[18] +
207 (float)(py3[x4]) * coeff[19] +
208
209 (float)(py4[x0]) * coeff[20] +
210 (float)(py4[x1]) * coeff[21] +
211 (float)(py4[x2]) * coeff[22] +
212 (float)(py4[x3]) * coeff[23] +
213 (float)(py4[x4]) * coeff[24];
214 px = clamp(px + 0.5f, 0.f, 255.f);
215 *out = px;
216 }
217
OneF4(const RsForEachStubParamStruct * p,uint32_t x,float4 * out,const float4 * py0,const float4 * py1,const float4 * py2,const float4 * py3,const float4 * py4,const float * coeff)218 static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
219 const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
220 const float* coeff) {
221
222 uint32_t x0 = rsMax((int32_t)x-2, 0);
223 uint32_t x1 = rsMax((int32_t)x-1, 0);
224 uint32_t x2 = x;
225 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
226 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
227
228 float4 px = py0[x0] * coeff[0] +
229 py0[x1] * coeff[1] +
230 py0[x2] * coeff[2] +
231 py0[x3] * coeff[3] +
232 py0[x4] * coeff[4] +
233
234 py1[x0] * coeff[5] +
235 py1[x1] * coeff[6] +
236 py1[x2] * coeff[7] +
237 py1[x3] * coeff[8] +
238 py1[x4] * coeff[9] +
239
240 py2[x0] * coeff[10] +
241 py2[x1] * coeff[11] +
242 py2[x2] * coeff[12] +
243 py2[x3] * coeff[13] +
244 py2[x4] * coeff[14] +
245
246 py3[x0] * coeff[15] +
247 py3[x1] * coeff[16] +
248 py3[x2] * coeff[17] +
249 py3[x3] * coeff[18] +
250 py3[x4] * coeff[19] +
251
252 py4[x0] * coeff[20] +
253 py4[x1] * coeff[21] +
254 py4[x2] * coeff[22] +
255 py4[x3] * coeff[23] +
256 py4[x4] * coeff[24];
257 *out = px;
258 }
259
OneF2(const RsForEachStubParamStruct * p,uint32_t x,float2 * out,const float2 * py0,const float2 * py1,const float2 * py2,const float2 * py3,const float2 * py4,const float * coeff)260 static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
261 const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
262 const float* coeff) {
263
264 uint32_t x0 = rsMax((int32_t)x-2, 0);
265 uint32_t x1 = rsMax((int32_t)x-1, 0);
266 uint32_t x2 = x;
267 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
268 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
269
270 float2 px = py0[x0] * coeff[0] +
271 py0[x1] * coeff[1] +
272 py0[x2] * coeff[2] +
273 py0[x3] * coeff[3] +
274 py0[x4] * coeff[4] +
275
276 py1[x0] * coeff[5] +
277 py1[x1] * coeff[6] +
278 py1[x2] * coeff[7] +
279 py1[x3] * coeff[8] +
280 py1[x4] * coeff[9] +
281
282 py2[x0] * coeff[10] +
283 py2[x1] * coeff[11] +
284 py2[x2] * coeff[12] +
285 py2[x3] * coeff[13] +
286 py2[x4] * coeff[14] +
287
288 py3[x0] * coeff[15] +
289 py3[x1] * coeff[16] +
290 py3[x2] * coeff[17] +
291 py3[x3] * coeff[18] +
292 py3[x4] * coeff[19] +
293
294 py4[x0] * coeff[20] +
295 py4[x1] * coeff[21] +
296 py4[x2] * coeff[22] +
297 py4[x3] * coeff[23] +
298 py4[x4] * coeff[24];
299 *out = px;
300 }
301
OneF1(const RsForEachStubParamStruct * p,uint32_t x,float * out,const float * py0,const float * py1,const float * py2,const float * py3,const float * py4,const float * coeff)302 static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
303 const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
304 const float* coeff) {
305
306 uint32_t x0 = rsMax((int32_t)x-2, 0);
307 uint32_t x1 = rsMax((int32_t)x-1, 0);
308 uint32_t x2 = x;
309 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
310 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
311
312 float px = py0[x0] * coeff[0] +
313 py0[x1] * coeff[1] +
314 py0[x2] * coeff[2] +
315 py0[x3] * coeff[3] +
316 py0[x4] * coeff[4] +
317
318 py1[x0] * coeff[5] +
319 py1[x1] * coeff[6] +
320 py1[x2] * coeff[7] +
321 py1[x3] * coeff[8] +
322 py1[x4] * coeff[9] +
323
324 py2[x0] * coeff[10] +
325 py2[x1] * coeff[11] +
326 py2[x2] * coeff[12] +
327 py2[x3] * coeff[13] +
328 py2[x4] * coeff[14] +
329
330 py3[x0] * coeff[15] +
331 py3[x1] * coeff[16] +
332 py3[x2] * coeff[17] +
333 py3[x3] * coeff[18] +
334 py3[x4] * coeff[19] +
335
336 py4[x0] * coeff[20] +
337 py4[x1] * coeff[21] +
338 py4[x2] * coeff[22] +
339 py4[x3] * coeff[23] +
340 py4[x4] * coeff[24];
341 *out = px;
342 }
343
344
345 extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
346 const void *y2, const void *y3, const void *y4,
347 const short *coef, uint32_t count);
348
kernelU4(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)349 void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p,
350 uint32_t xstart, uint32_t xend,
351 uint32_t instep, uint32_t outstep) {
352 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
353 if (!cp->alloc.get()) {
354 ALOGE("Convolve5x5 executed without input, skipping");
355 return;
356 }
357 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
358 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
359
360 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
361 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
362 uint32_t y2 = p->y;
363 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
364 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
365
366 const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
367 const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
368 const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
369 const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
370 const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
371
372 uchar4 *out = (uchar4 *)p->out;
373 uint32_t x1 = xstart;
374 uint32_t x2 = xend;
375
376 while((x1 < x2) && (x1 < 2)) {
377 OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
378 out++;
379 x1++;
380 }
381 #if defined(ARCH_X86_HAVE_SSSE3)
382 // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
383 // 3 for end boundary where x may hit the end boundary)
384 if (gArchUseSIMD &&((x1 + 6) < x2)) {
385 // subtract 3 for end boundary
386 uint32_t len = (x2 - x1 - 3) >> 2;
387 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
388 out += len << 2;
389 x1 += len << 2;
390 }
391 #endif
392
393 #if defined(ARCH_ARM_USE_INTRINSICS)
394 if(gArchUseSIMD && ((x1 + 3) < x2)) {
395 uint32_t len = (x2 - x1 - 3) >> 1;
396 rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
397 out += len << 1;
398 x1 += len << 1;
399 }
400 #endif
401
402 while(x1 < x2) {
403 OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
404 out++;
405 x1++;
406 }
407 }
408
kernelU2(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)409 void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p,
410 uint32_t xstart, uint32_t xend,
411 uint32_t instep, uint32_t outstep) {
412 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
413 if (!cp->alloc.get()) {
414 ALOGE("Convolve5x5 executed without input, skipping");
415 return;
416 }
417 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
418 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
419
420 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
421 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
422 uint32_t y2 = p->y;
423 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
424 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
425
426 const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
427 const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
428 const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
429 const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
430 const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
431
432 uchar2 *out = (uchar2 *)p->out;
433 uint32_t x1 = xstart;
434 uint32_t x2 = xend;
435
436 while((x1 < x2) && (x1 < 2)) {
437 OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
438 out++;
439 x1++;
440 }
441
442 #if 0//defined(ARCH_ARM_HAVE_NEON)
443 if((x1 + 3) < x2) {
444 uint32_t len = (x2 - x1 - 3) >> 1;
445 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
446 out += len << 1;
447 x1 += len << 1;
448 }
449 #endif
450
451 while(x1 < x2) {
452 OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
453 out++;
454 x1++;
455 }
456 }
457
kernelU1(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)458 void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p,
459 uint32_t xstart, uint32_t xend,
460 uint32_t instep, uint32_t outstep) {
461 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
462 if (!cp->alloc.get()) {
463 ALOGE("Convolve5x5 executed without input, skipping");
464 return;
465 }
466 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
467 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
468
469 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
470 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
471 uint32_t y2 = p->y;
472 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
473 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
474
475 const uchar *py0 = (const uchar *)(pin + stride * y0);
476 const uchar *py1 = (const uchar *)(pin + stride * y1);
477 const uchar *py2 = (const uchar *)(pin + stride * y2);
478 const uchar *py3 = (const uchar *)(pin + stride * y3);
479 const uchar *py4 = (const uchar *)(pin + stride * y4);
480
481 uchar *out = (uchar *)p->out;
482 uint32_t x1 = xstart;
483 uint32_t x2 = xend;
484
485 while((x1 < x2) && (x1 < 2)) {
486 OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
487 out++;
488 x1++;
489 }
490
491 #if 0//defined(ARCH_ARM_HAVE_NEON)
492 if((x1 + 3) < x2) {
493 uint32_t len = (x2 - x1 - 3) >> 1;
494 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
495 out += len << 1;
496 x1 += len << 1;
497 }
498 #endif
499
500 while(x1 < x2) {
501 OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
502 out++;
503 x1++;
504 }
505 }
506
kernelF4(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)507 void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p,
508 uint32_t xstart, uint32_t xend,
509 uint32_t instep, uint32_t outstep) {
510 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
511 if (!cp->alloc.get()) {
512 ALOGE("Convolve5x5 executed without input, skipping");
513 return;
514 }
515 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
516 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
517
518 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
519 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
520 uint32_t y2 = p->y;
521 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
522 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
523
524 const float4 *py0 = (const float4 *)(pin + stride * y0);
525 const float4 *py1 = (const float4 *)(pin + stride * y1);
526 const float4 *py2 = (const float4 *)(pin + stride * y2);
527 const float4 *py3 = (const float4 *)(pin + stride * y3);
528 const float4 *py4 = (const float4 *)(pin + stride * y4);
529
530 float4 *out = (float4 *)p->out;
531 uint32_t x1 = xstart;
532 uint32_t x2 = xend;
533
534 while((x1 < x2) && (x1 < 2)) {
535 OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
536 out++;
537 x1++;
538 }
539
540 #if 0//defined(ARCH_ARM_HAVE_NEON)
541 if((x1 + 3) < x2) {
542 uint32_t len = (x2 - x1 - 3) >> 1;
543 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
544 out += len << 1;
545 x1 += len << 1;
546 }
547 #endif
548
549 while(x1 < x2) {
550 OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
551 out++;
552 x1++;
553 }
554 }
555
kernelF2(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)556 void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p,
557 uint32_t xstart, uint32_t xend,
558 uint32_t instep, uint32_t outstep) {
559 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
560 if (!cp->alloc.get()) {
561 ALOGE("Convolve5x5 executed without input, skipping");
562 return;
563 }
564 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
565 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
566
567 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
568 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
569 uint32_t y2 = p->y;
570 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
571 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
572
573 const float2 *py0 = (const float2 *)(pin + stride * y0);
574 const float2 *py1 = (const float2 *)(pin + stride * y1);
575 const float2 *py2 = (const float2 *)(pin + stride * y2);
576 const float2 *py3 = (const float2 *)(pin + stride * y3);
577 const float2 *py4 = (const float2 *)(pin + stride * y4);
578
579 float2 *out = (float2 *)p->out;
580 uint32_t x1 = xstart;
581 uint32_t x2 = xend;
582
583 while((x1 < x2) && (x1 < 2)) {
584 OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
585 out++;
586 x1++;
587 }
588
589 #if 0//defined(ARCH_ARM_HAVE_NEON)
590 if((x1 + 3) < x2) {
591 uint32_t len = (x2 - x1 - 3) >> 1;
592 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
593 out += len << 1;
594 x1 += len << 1;
595 }
596 #endif
597
598 while(x1 < x2) {
599 OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
600 out++;
601 x1++;
602 }
603 }
604
kernelF1(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)605 void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p,
606 uint32_t xstart, uint32_t xend,
607 uint32_t instep, uint32_t outstep) {
608 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
609 if (!cp->alloc.get()) {
610 ALOGE("Convolve5x5 executed without input, skipping");
611 return;
612 }
613 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
614 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
615
616 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
617 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
618 uint32_t y2 = p->y;
619 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
620 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
621
622 const float *py0 = (const float *)(pin + stride * y0);
623 const float *py1 = (const float *)(pin + stride * y1);
624 const float *py2 = (const float *)(pin + stride * y2);
625 const float *py3 = (const float *)(pin + stride * y3);
626 const float *py4 = (const float *)(pin + stride * y4);
627
628 float *out = (float *)p->out;
629 uint32_t x1 = xstart;
630 uint32_t x2 = xend;
631
632 while((x1 < x2) && (x1 < 2)) {
633 OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
634 out++;
635 x1++;
636 }
637
638 #if 0//defined(ARCH_ARM_HAVE_NEON)
639 if((x1 + 3) < x2) {
640 uint32_t len = (x2 - x1 - 3) >> 1;
641 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
642 out += len << 1;
643 x1 += len << 1;
644 }
645 #endif
646
647 while(x1 < x2) {
648 OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
649 out++;
650 x1++;
651 }
652 }
653
RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)654 RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
655 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
656 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
657
658 if (e->getType() == RS_TYPE_FLOAT_32) {
659 switch(e->getVectorSize()) {
660 case 1:
661 mRootPtr = &kernelF1;
662 break;
663 case 2:
664 mRootPtr = &kernelF2;
665 break;
666 case 3:
667 case 4:
668 mRootPtr = &kernelF4;
669 break;
670 }
671 } else {
672 switch(e->getVectorSize()) {
673 case 1:
674 mRootPtr = &kernelU1;
675 break;
676 case 2:
677 mRootPtr = &kernelU2;
678 break;
679 case 3:
680 case 4:
681 mRootPtr = &kernelU4;
682 break;
683 }
684 }
685 for(int ct=0; ct < 25; ct++) {
686 mFp[ct] = 1.f / 25.f;
687 mIp[ct] = (short)(mFp[ct] * 256.f);
688 }
689 }
690
~RsdCpuScriptIntrinsicConvolve5x5()691 RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
692 }
693
populateScript(Script * s)694 void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
695 s->mHal.info.exportedVariableCount = 2;
696 }
697
invokeFreeChildren()698 void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
699 alloc.clear();
700 }
701
702
rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)703 RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
704 const Script *s, const Element *e) {
705
706 return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
707 }
708
709
710
711