1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 
18 #include "rsCpuIntrinsic.h"
19 #include "rsCpuIntrinsicInlines.h"
20 
21 using namespace android;
22 using namespace android::renderscript;
23 
24 namespace android {
25 namespace renderscript {
26 
27 
28 class RsdCpuScriptIntrinsicResize : public RsdCpuScriptIntrinsic {
29 public:
30     void populateScript(Script *) override;
31     void invokeFreeChildren() override;
32 
33     void setGlobalObj(uint32_t slot, ObjectBase *data) override;
34 
35     ~RsdCpuScriptIntrinsicResize() override;
36     RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
37 
38     void preLaunch(uint32_t slot, const Allocation ** ains,
39                    uint32_t inLen, Allocation * aout, const void * usr,
40                    uint32_t usrLen, const RsScriptCall *sc) override;
41 
42     float scaleX;
43     float scaleY;
44 
45 protected:
46     ObjectBaseRef<const Allocation> mAlloc;
47     ObjectBaseRef<const Element> mElement;
48 
49     static void kernelU1(const RsExpandKernelDriverInfo *info,
50                          uint32_t xstart, uint32_t xend,
51                          uint32_t outstep);
52     static void kernelU2(const RsExpandKernelDriverInfo *info,
53                          uint32_t xstart, uint32_t xend,
54                          uint32_t outstep);
55     static void kernelU4(const RsExpandKernelDriverInfo *info,
56                          uint32_t xstart, uint32_t xend,
57                          uint32_t outstep);
58     static void kernelF1(const RsExpandKernelDriverInfo *info,
59                          uint32_t xstart, uint32_t xend,
60                          uint32_t outstep);
61     static void kernelF2(const RsExpandKernelDriverInfo *info,
62                          uint32_t xstart, uint32_t xend,
63                          uint32_t outstep);
64     static void kernelF4(const RsExpandKernelDriverInfo *info,
65                          uint32_t xstart, uint32_t xend,
66                          uint32_t outstep);
67 };
68 
69 }
70 }
71 
72 
setGlobalObj(uint32_t slot,ObjectBase * data)73 void RsdCpuScriptIntrinsicResize::setGlobalObj(uint32_t slot, ObjectBase *data) {
74     rsAssert(slot == 0);
75     mAlloc.set(static_cast<Allocation *>(data));
76 }
77 
/**
 * Evaluates the cubic interpolant through four equally spaced samples.
 *
 * One template replaces the former float4 / float2 / float overloads, whose
 * bodies were identical. The expression is kept exactly as it was so results
 * do not change.
 *
 * @tparam T  Sample type (float, or a float vector type supporting the same
 *            arithmetic with scalar floats).
 * @param p0  Sample before the interval.
 * @param p1  Sample at the start of the interval (returned when x == 0).
 * @param p2  Sample at the end of the interval (returned when x == 1).
 * @param p3  Sample after the interval.
 * @param x   Position inside the interval.
 * @return    Interpolated value.
 */
template <typename T>
static T cubicInterpolate(T p0, T p1, T p2, T p3, float x) {
    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
            + x * (3.f * (p1 - p2) + p3 - p0)));
}
92 
/**
 * Computes one bicubic-filtered uchar4 pixel.
 *
 * @param yp0..yp3  The four source rows to sample, top to bottom.
 * @param xf        Horizontal sample position in source pixel coordinates.
 * @param yf        Vertical interpolation position between yp1 and yp2.
 * @param width     Source row width in pixels.
 * @return          Filtered pixel, offset by 0.5 and clamped to [0, 255]
 *                  before conversion back to uchar4.
 */
static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
                         float xf, float yf, int width) {
    const int left = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);
    const int lastCol = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int c0 = rsMax(0, left + 0);
    const int c1 = rsMax(0, left + 1);
    const int c2 = rsMin(lastCol, left + 2);
    const int c3 = rsMin(lastCol, left + 3);

    // Horizontal pass over each of the four rows.
    const uchar4 *rows[4] = {yp0, yp1, yp2, yp3};
    float4 horiz[4];
    for (int r = 0; r < 4; r++) {
        const uchar4 *row = rows[r];
        horiz[r] = cubicInterpolate(convert_float4(row[c0]),
                                    convert_float4(row[c1]),
                                    convert_float4(row[c2]),
                                    convert_float4(row[c3]), fracX);
    }

    // Vertical pass across the four row results.
    float4 p = cubicInterpolate(horiz[0], horiz[1], horiz[2], horiz[3], yf);
    p = clamp(p + 0.5f, 0.f, 255.f);
    return convert_uchar4(p);
}
127 
/**
 * Computes one bicubic-filtered uchar2 pixel.
 *
 * @param yp0..yp3  The four source rows to sample, top to bottom.
 * @param xf        Horizontal sample position in source pixel coordinates.
 * @param yf        Vertical interpolation position between yp1 and yp2.
 * @param width     Source row width in pixels.
 * @return          Filtered pixel, offset by 0.5 and clamped to [0, 255]
 *                  before conversion back to uchar2.
 */
static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
                         float xf, float yf, int width) {
    const int left = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);
    const int lastCol = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int c0 = rsMax(0, left + 0);
    const int c1 = rsMax(0, left + 1);
    const int c2 = rsMin(lastCol, left + 2);
    const int c3 = rsMin(lastCol, left + 3);

    // Horizontal pass over each of the four rows.
    const uchar2 *rows[4] = {yp0, yp1, yp2, yp3};
    float2 horiz[4];
    for (int r = 0; r < 4; r++) {
        const uchar2 *row = rows[r];
        horiz[r] = cubicInterpolate(convert_float2(row[c0]),
                                    convert_float2(row[c1]),
                                    convert_float2(row[c2]),
                                    convert_float2(row[c3]), fracX);
    }

    // Vertical pass across the four row results.
    float2 p = cubicInterpolate(horiz[0], horiz[1], horiz[2], horiz[3], yf);
    p = clamp(p + 0.5f, 0.f, 255.f);
    return convert_uchar2(p);
}
162 
/**
 * Computes one bicubic-filtered single-channel uchar pixel.
 *
 * @param yp0..yp3  The four source rows to sample, top to bottom.
 * @param xf        Horizontal sample position in source pixel coordinates.
 * @param yf        Vertical interpolation position between yp1 and yp2.
 * @param width     Source row width in pixels.
 * @return          Filtered value, offset by 0.5 and clamped to [0, 255]
 *                  before the cast back to uchar.
 */
static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
                        float xf, float yf, int width) {
    const int left = (int) floor(xf - 1);
    const float fracX = xf - floor(xf);
    const int lastCol = width - 1;

    // The two left taps are clamped at column 0, the two right taps at the last column.
    const int c0 = rsMax(0, left + 0);
    const int c1 = rsMax(0, left + 1);
    const int c2 = rsMin(lastCol, left + 2);
    const int c3 = rsMin(lastCol, left + 3);

    // Horizontal pass over each of the four rows.
    const uchar *rows[4] = {yp0, yp1, yp2, yp3};
    float horiz[4];
    for (int r = 0; r < 4; r++) {
        const uchar *row = rows[r];
        horiz[r] = cubicInterpolate((float)row[c0], (float)row[c1],
                                    (float)row[c2], (float)row[c3], fracX);
    }

    // Vertical pass across the four row results.
    float p = cubicInterpolate(horiz[0], horiz[1], horiz[2], horiz[3], yf);
    p = clamp(p + 0.5f, 0.f, 255.f);
    return (uchar)p;
}
186 
187 extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc);
188 
189 extern "C" void rsdIntrinsicResizeB4_K(
190             uchar4 *dst,
191             size_t count,
192             uint32_t xf,
193             uint32_t xinc,
194             uchar4 const *srcn,
195             uchar4 const *src0,
196             uchar4 const *src1,
197             uchar4 const *src2,
198             size_t xclip,
199             size_t avail,
200             uint64_t osc_ctl,
201             int32_t const *yr);
202 
203 extern "C" void rsdIntrinsicResizeB2_K(
204             uchar2 *dst,
205             size_t count,
206             uint32_t xf,
207             uint32_t xinc,
208             uchar2 const *srcn,
209             uchar2 const *src0,
210             uchar2 const *src1,
211             uchar2 const *src2,
212             size_t xclip,
213             size_t avail,
214             uint64_t osc_ctl,
215             int32_t const *yr);
216 
217 extern "C" void rsdIntrinsicResizeB1_K(
218             uchar *dst,
219             size_t count,
220             uint32_t xf,
221             uint32_t xinc,
222             uchar const *srcn,
223             uchar const *src0,
224             uchar const *src1,
225             uchar const *src2,
226             size_t xclip,
227             size_t avail,
228             uint64_t osc_ctl,
229             int32_t const *yr);
230 
231 #if defined(ARCH_ARM_USE_INTRINSICS)
/**
 * Builds the four vertical filter coefficients in 16.16 fixed point.
 *
 * yr[1] and yr[2] equal the weights cubicInterpolate() applies to p1 and p2;
 * yr[0] and yr[3] are the weights of p0 and p3 with the sign flipped.
 *
 * @param yr  Output array of four coefficients.
 * @param yf  Vertical interpolation position (callers pass a fractional part).
 */
static void mkYCoeff(int32_t *yr, float yf) {
    // yf, yf^2 and yf^3 scaled by 2^16.
    const int32_t lin = rint(yf * 0x10000);
    const int32_t sq = rint(yf * yf * 0x10000);
    const int32_t cu = rint(yf * yf * yf * 0x10000);

    yr[0] = (lin + cu - 2 * sq) >> 1;
    yr[1] = (3 * cu - 5 * sq + 0x20000) >> 1;
    yr[2] = (lin + 4 * sq - 3 * cu) >> 1;
    yr[3] = (sq - cu) >> 1;
}
242 #endif
243 
OneBiCubic(const float4 * yp0,const float4 * yp1,const float4 * yp2,const float4 * yp3,float xf,float yf,int width)244 static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3,
245                          float xf, float yf, int width) {
246     int startx = (int) floor(xf - 1);
247     xf = xf - floor(xf);
248     int maxx = width - 1;
249     int xs0 = rsMax(0, startx + 0);
250     int xs1 = rsMax(0, startx + 1);
251     int xs2 = rsMin(maxx, startx + 2);
252     int xs3 = rsMin(maxx, startx + 3);
253 
254     float4 p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
255                                   yp0[xs2], yp0[xs3], xf);
256     float4 p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
257                                   yp1[xs2], yp1[xs3], xf);
258     float4 p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
259                                   yp2[xs2], yp2[xs3], xf);
260     float4 p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
261                                   yp3[xs2], yp3[xs3], xf);
262 
263     float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
264     return p;
265 }
266 
OneBiCubic(const float2 * yp0,const float2 * yp1,const float2 * yp2,const float2 * yp3,float xf,float yf,int width)267 static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3,
268                          float xf, float yf, int width) {
269     int startx = (int) floor(xf - 1);
270     xf = xf - floor(xf);
271     int maxx = width - 1;
272     int xs0 = rsMax(0, startx + 0);
273     int xs1 = rsMax(0, startx + 1);
274     int xs2 = rsMin(maxx, startx + 2);
275     int xs3 = rsMin(maxx, startx + 3);
276 
277     float2 p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
278                                   yp0[xs2], yp0[xs3], xf);
279     float2 p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
280                                   yp1[xs2], yp1[xs3], xf);
281     float2 p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
282                                   yp2[xs2], yp2[xs3], xf);
283     float2 p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
284                                   yp3[xs2], yp3[xs3], xf);
285 
286     float2 p  = cubicInterpolate(p0, p1, p2, p3, yf);
287     return p;
288 }
289 
OneBiCubic(const float * yp0,const float * yp1,const float * yp2,const float * yp3,float xf,float yf,int width)290 static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3,
291                         float xf, float yf, int width) {
292     int startx = (int) floor(xf - 1);
293     xf = xf - floor(xf);
294     int maxx = width - 1;
295     int xs0 = rsMax(0, startx + 0);
296     int xs1 = rsMax(0, startx + 1);
297     int xs2 = rsMin(maxx, startx + 2);
298     int xs3 = rsMin(maxx, startx + 3);
299 
300     float p0  = cubicInterpolate(yp0[xs0], yp0[xs1],
301                                  yp0[xs2], yp0[xs3], xf);
302     float p1  = cubicInterpolate(yp1[xs0], yp1[xs1],
303                                  yp1[xs2], yp1[xs3], xf);
304     float p2  = cubicInterpolate(yp2[xs0], yp2[xs1],
305                                  yp2[xs2], yp2[xs3], xf);
306     float p3  = cubicInterpolate(yp3[xs0], yp3[xs1],
307                                  yp3[xs2], yp3[xs3], xf);
308 
309     float p  = cubicInterpolate(p0, p1, p2, p3, yf);
310     return p;
311 }
312 
kernelU4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)313 void RsdCpuScriptIntrinsicResize::kernelU4(const RsExpandKernelDriverInfo *info,
314                                                 uint32_t xstart, uint32_t xend,
315                                                 uint32_t outstep) {
316     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
317 
318     if (!cp->mAlloc.get()) {
319         ALOGE("Resize executed without input, skipping");
320         return;
321     }
322     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
323     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
324     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
325     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
326 
327     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
328     int starty = (int) floor(yf - 1);
329     yf = yf - floor(yf);
330     int maxy = srcHeight - 1;
331     int ys0 = rsMax(0, starty + 0);
332     int ys1 = rsMax(0, starty + 1);
333     int ys2 = rsMin(maxy, starty + 2);
334     int ys3 = rsMin(maxy, starty + 3);
335 
336     const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0);
337     const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1);
338     const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2);
339     const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3);
340 
341     uchar4 *out = ((uchar4 *)info->outPtr[0]) + xstart;
342     uint32_t x1 = xstart;
343     uint32_t x2 = xend;
344 
345 #if defined(ARCH_ARM_USE_INTRINSICS)
346     if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
347         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
348         long xf16 = rint(xf * 0x10000);
349         uint32_t xinc16 = rint(cp->scaleX * 0x10000);
350 
351         int xoff = (xf16 >> 16) - 1;
352         int xclip = rsMax(0, xoff) - xoff;
353         int len = x2 - x1;
354 
355         int32_t yr[4];
356         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
357         mkYCoeff(yr, yf);
358 
359         xoff += xclip;
360 
361         rsdIntrinsicResizeB4_K(
362                 out, len,
363                 xf16 & 0xffff, xinc16,
364                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
365                 xclip, srcWidth - xoff + xclip,
366                 osc_ctl, yr);
367         out += len;
368         x1 += len;
369     }
370 #endif
371 
372     while(x1 < x2) {
373         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
374         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
375         out++;
376         x1++;
377     }
378 }
379 
kernelU2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)380 void RsdCpuScriptIntrinsicResize::kernelU2(const RsExpandKernelDriverInfo *info,
381                                                 uint32_t xstart, uint32_t xend,
382                                                 uint32_t outstep) {
383     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
384 
385     if (!cp->mAlloc.get()) {
386         ALOGE("Resize executed without input, skipping");
387         return;
388     }
389     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
390     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
391     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
392     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
393 
394     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
395     int starty = (int) floor(yf - 1);
396     yf = yf - floor(yf);
397     int maxy = srcHeight - 1;
398     int ys0 = rsMax(0, starty + 0);
399     int ys1 = rsMax(0, starty + 1);
400     int ys2 = rsMin(maxy, starty + 2);
401     int ys3 = rsMin(maxy, starty + 3);
402 
403     const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0);
404     const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1);
405     const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2);
406     const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3);
407 
408     uchar2 *out = ((uchar2 *)info->outPtr[0]) + xstart;
409     uint32_t x1 = xstart;
410     uint32_t x2 = xend;
411 
412 #if defined(ARCH_ARM_USE_INTRINSICS)
413     if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
414         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
415         long xf16 = rint(xf * 0x10000);
416         uint32_t xinc16 = rint(cp->scaleX * 0x10000);
417 
418         int xoff = (xf16 >> 16) - 1;
419         int xclip = rsMax(0, xoff) - xoff;
420         int len = x2 - x1;
421 
422         int32_t yr[4];
423         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
424         mkYCoeff(yr, yf);
425 
426         xoff += xclip;
427 
428         rsdIntrinsicResizeB2_K(
429                 out, len,
430                 xf16 & 0xffff, xinc16,
431                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
432                 xclip, srcWidth - xoff + xclip,
433                 osc_ctl, yr);
434         out += len;
435         x1 += len;
436     }
437 #endif
438 
439     while(x1 < x2) {
440         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
441         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
442         out++;
443         x1++;
444     }
445 }
446 
kernelU1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)447 void RsdCpuScriptIntrinsicResize::kernelU1(const RsExpandKernelDriverInfo *info,
448                                                 uint32_t xstart, uint32_t xend,
449                                                 uint32_t outstep) {
450     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
451 
452     if (!cp->mAlloc.get()) {
453         ALOGE("Resize executed without input, skipping");
454         return;
455     }
456     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
457     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
458     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
459     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
460 
461     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
462     int starty = (int) floor(yf - 1);
463     yf = yf - floor(yf);
464     int maxy = srcHeight - 1;
465     int ys0 = rsMax(0, starty + 0);
466     int ys1 = rsMax(0, starty + 1);
467     int ys2 = rsMin(maxy, starty + 2);
468     int ys3 = rsMin(maxy, starty + 3);
469 
470     const uchar *yp0 = pin + stride * ys0;
471     const uchar *yp1 = pin + stride * ys1;
472     const uchar *yp2 = pin + stride * ys2;
473     const uchar *yp3 = pin + stride * ys3;
474 
475     uchar *out = ((uchar *)info->outPtr[0]) + xstart;
476     uint32_t x1 = xstart;
477     uint32_t x2 = xend;
478 
479 #if defined(ARCH_ARM_USE_INTRINSICS)
480     if (gArchUseSIMD && x2 > x1 && cp->scaleX < 4.0f) {
481         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
482         long xf16 = rint(xf * 0x10000);
483         uint32_t xinc16 = rint(cp->scaleX * 0x10000);
484 
485         int xoff = (xf16 >> 16) - 1;
486         int xclip = rsMax(0, xoff) - xoff;
487         int len = x2 - x1;
488 
489         int32_t yr[4];
490         uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16);
491         mkYCoeff(yr, yf);
492 
493         xoff += xclip;
494 
495         rsdIntrinsicResizeB1_K(
496                 out, len,
497                 xf16 & 0xffff, xinc16,
498                 yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff,
499                 xclip, srcWidth - xoff + xclip,
500                 osc_ctl, yr);
501         out += len;
502         x1 += len;
503     }
504 #endif
505 
506     while(x1 < x2) {
507         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
508         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
509         out++;
510         x1++;
511     }
512 }
513 
kernelF4(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)514 void RsdCpuScriptIntrinsicResize::kernelF4(const RsExpandKernelDriverInfo *info,
515                                                 uint32_t xstart, uint32_t xend,
516                                                 uint32_t outstep) {
517     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
518 
519     if (!cp->mAlloc.get()) {
520         ALOGE("Resize executed without input, skipping");
521         return;
522     }
523     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
524     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
525     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
526     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
527 
528     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
529     int starty = (int) floor(yf - 1);
530     yf = yf - floor(yf);
531     int maxy = srcHeight - 1;
532     int ys0 = rsMax(0, starty + 0);
533     int ys1 = rsMax(0, starty + 1);
534     int ys2 = rsMin(maxy, starty + 2);
535     int ys3 = rsMin(maxy, starty + 3);
536 
537     const float4 *yp0 = (const float4 *)(pin + stride * ys0);
538     const float4 *yp1 = (const float4 *)(pin + stride * ys1);
539     const float4 *yp2 = (const float4 *)(pin + stride * ys2);
540     const float4 *yp3 = (const float4 *)(pin + stride * ys3);
541 
542     float4 *out = ((float4 *)info->outPtr[0]) + xstart;
543     uint32_t x1 = xstart;
544     uint32_t x2 = xend;
545 
546     while(x1 < x2) {
547         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
548         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
549         out++;
550         x1++;
551     }
552 }
553 
kernelF2(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)554 void RsdCpuScriptIntrinsicResize::kernelF2(const RsExpandKernelDriverInfo *info,
555                                                 uint32_t xstart, uint32_t xend,
556                                                 uint32_t outstep) {
557     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
558 
559     if (!cp->mAlloc.get()) {
560         ALOGE("Resize executed without input, skipping");
561         return;
562     }
563     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
564     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
565     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
566     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
567 
568     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
569     int starty = (int) floor(yf - 1);
570     yf = yf - floor(yf);
571     int maxy = srcHeight - 1;
572     int ys0 = rsMax(0, starty + 0);
573     int ys1 = rsMax(0, starty + 1);
574     int ys2 = rsMin(maxy, starty + 2);
575     int ys3 = rsMin(maxy, starty + 3);
576 
577     const float2 *yp0 = (const float2 *)(pin + stride * ys0);
578     const float2 *yp1 = (const float2 *)(pin + stride * ys1);
579     const float2 *yp2 = (const float2 *)(pin + stride * ys2);
580     const float2 *yp3 = (const float2 *)(pin + stride * ys3);
581 
582     float2 *out = ((float2 *)info->outPtr[0]) + xstart;
583     uint32_t x1 = xstart;
584     uint32_t x2 = xend;
585 
586     while(x1 < x2) {
587         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
588         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
589         out++;
590         x1++;
591     }
592 }
593 
kernelF1(const RsExpandKernelDriverInfo * info,uint32_t xstart,uint32_t xend,uint32_t outstep)594 void RsdCpuScriptIntrinsicResize::kernelF1(const RsExpandKernelDriverInfo *info,
595                                                 uint32_t xstart, uint32_t xend,
596                                                 uint32_t outstep) {
597     RsdCpuScriptIntrinsicResize *cp = (RsdCpuScriptIntrinsicResize *)info->usr;
598 
599     if (!cp->mAlloc.get()) {
600         ALOGE("Resize executed without input, skipping");
601         return;
602     }
603     const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
604     const int srcHeight = cp->mAlloc->mHal.drvState.lod[0].dimY;
605     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
606     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
607 
608     float yf = (info->current.y + 0.5f) * cp->scaleY - 0.5f;
609     int starty = (int) floor(yf - 1);
610     yf = yf - floor(yf);
611     int maxy = srcHeight - 1;
612     int ys0 = rsMax(0, starty + 0);
613     int ys1 = rsMax(0, starty + 1);
614     int ys2 = rsMin(maxy, starty + 2);
615     int ys3 = rsMin(maxy, starty + 3);
616 
617     const float *yp0 = (const float *)(pin + stride * ys0);
618     const float *yp1 = (const float *)(pin + stride * ys1);
619     const float *yp2 = (const float *)(pin + stride * ys2);
620     const float *yp3 = (const float *)(pin + stride * ys3);
621 
622     float *out = ((float *)info->outPtr[0]) + xstart;
623     uint32_t x1 = xstart;
624     uint32_t x2 = xend;
625 
626     while(x1 < x2) {
627         float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
628         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
629         out++;
630         x1++;
631     }
632 }
633 
RsdCpuScriptIntrinsicResize(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)634 RsdCpuScriptIntrinsicResize::RsdCpuScriptIntrinsicResize (
635             RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
636             : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_RESIZE) {
637 
638 }
639 
~RsdCpuScriptIntrinsicResize()640 RsdCpuScriptIntrinsicResize::~RsdCpuScriptIntrinsicResize() {
641 }
642 
preLaunch(uint32_t slot,const Allocation ** ains,uint32_t inLen,Allocation * aout,const void * usr,uint32_t usrLen,const RsScriptCall * sc)643 void RsdCpuScriptIntrinsicResize::preLaunch(uint32_t slot,
644                                             const Allocation ** ains,
645                                             uint32_t inLen, Allocation * aout,
646                                             const void * usr, uint32_t usrLen,
647                                             const RsScriptCall *sc)
648 {
649     if (!mAlloc.get()) {
650         ALOGE("Resize executed without input, skipping");
651         return;
652     }
653     const uint32_t srcHeight = mAlloc->mHal.drvState.lod[0].dimY;
654     const uint32_t srcWidth = mAlloc->mHal.drvState.lod[0].dimX;
655     const size_t stride = mAlloc->mHal.drvState.lod[0].stride;
656 
657     //check the data type to determine F or U.
658     if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) {
659         switch(mAlloc->getType()->getElement()->getVectorSize()) {
660         case 1:
661             mRootPtr = &kernelU1;
662             break;
663         case 2:
664             mRootPtr = &kernelU2;
665             break;
666         case 3:
667         case 4:
668             mRootPtr = &kernelU4;
669             break;
670         }
671     } else {
672         switch(mAlloc->getType()->getElement()->getVectorSize()) {
673         case 1:
674             mRootPtr = &kernelF1;
675             break;
676         case 2:
677             mRootPtr = &kernelF2;
678             break;
679         case 3:
680         case 4:
681             mRootPtr = &kernelF4;
682             break;
683         }
684     }
685 
686     scaleX = (float)srcWidth / aout->mHal.drvState.lod[0].dimX;
687     scaleY = (float)srcHeight / aout->mHal.drvState.lod[0].dimY;
688 
689 }
690 
populateScript(Script * s)691 void RsdCpuScriptIntrinsicResize::populateScript(Script *s) {
692     s->mHal.info.exportedVariableCount = 1;
693 }
694 
invokeFreeChildren()695 void RsdCpuScriptIntrinsicResize::invokeFreeChildren() {
696     mAlloc.clear();
697 }
698 
699 
rsdIntrinsic_Resize(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)700 RsdCpuScriptImpl * rsdIntrinsic_Resize(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
701 
702     return new RsdCpuScriptIntrinsicResize(ctx, s, e);
703 }
704