1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #if !defined CUDA_DISABLER
44 
45 #include <vector>
46 #include <cuda_runtime.h>
47 
48 #include "opencv2/core/cuda/warp.hpp"
49 #include "opencv2/core/cuda/warp_shuffle.hpp"
50 
51 #include "opencv2/cudalegacy/NPP_staging.hpp"
52 
53 
54 texture<Ncv8u,  1, cudaReadModeElementType> tex8u;
55 texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
56 texture<uint2,  1, cudaReadModeElementType> tex64u;
57 
58 
59 //==============================================================================
60 //
61 // CUDA streams handling
62 //
63 //==============================================================================
64 
65 
66 static cudaStream_t nppStream = 0;
67 
68 
69 cudaStream_t nppStGetActiveCUDAstream(void)
70 {
71     return nppStream;
72 }
73 
74 
75 
76 cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
77 {
78     cudaStream_t tmp = nppStream;
79     nppStream = cudaStream;
80     return tmp;
81 }
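// Every kernel launch in this file picks up the stream stored above via
// nppStGetActiveCUDAstream(). A minimal usage sketch (illustrative comment only, not part
// of this translation unit; the stream variable is a hypothetical caller-side name):
//
//     cudaStream_t userStream;
//     cudaStreamCreate(&userStream);
//     cudaStream_t prev = nppStSetActiveCUDAstream(userStream);  // returns the previous stream
//     // ... enqueue nppiSt* work here; it runs asynchronously on userStream ...
//     cudaStreamSynchronize(userStream);
//     nppStSetActiveCUDAstream(prev);                            // restore the old stream
//     cudaStreamDestroy(userStream);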
82 
83 
84 //==============================================================================
85 //
86 // BlockScan.cuh
87 //
88 //==============================================================================
89 
90 
91 NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
92 
93 
94 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
95 //assuming size <= WARP_SIZE and size is power of 2
96 template <class T>
97 inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
98 {
99 #if __CUDA_ARCH__ >= 300
100     const unsigned int laneId = cv::cuda::device::Warp::laneId();
101 
102     // scan using warp shuffle functions
103     #pragma unroll
104     for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
105     {
106         const T n = cv::cuda::device::shfl_up(idata, i);
107         if (laneId >= i)
108               idata += n;
109     }
110 
111     return idata;
112 #else
113     Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
114     s_Data[pos] = 0;
115     pos += K_WARP_SIZE;
116     s_Data[pos] = idata;
117 
118     s_Data[pos] += s_Data[pos - 1];
119     s_Data[pos] += s_Data[pos - 2];
120     s_Data[pos] += s_Data[pos - 4];
121     s_Data[pos] += s_Data[pos - 8];
122     s_Data[pos] += s_Data[pos - 16];
123 
124     return s_Data[pos];
125 #endif
126 }
127 inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data)
128 {
129     Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
130     s_Data[pos] = 0;
131     pos += K_WARP_SIZE;
132     s_Data[pos] = idata;
133 
134     s_Data[pos] += s_Data[pos - 1];
135     s_Data[pos] += s_Data[pos - 2];
136     s_Data[pos] += s_Data[pos - 4];
137     s_Data[pos] += s_Data[pos - 8];
138     s_Data[pos] += s_Data[pos - 16];
139 
140     return s_Data[pos];
141 }
142 
143 
144 template <class T>
145 inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
146 {
147     return warpScanInclusive(idata, s_Data) - idata;
148 }
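// Reference semantics (an illustrative, sequential host-side sketch of what the two
// warp-level primitives above compute for one 32-lane warp; not part of the build):
//
//     for (int i = 0; i < 32; ++i)
//     {
//         inclusive[i] = (i > 0 ? inclusive[i - 1] : 0) + in[i];  // warpScanInclusive
//         exclusive[i] = inclusive[i] - in[i];                    // warpScanExclusive
//     }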
149 
150 
151 template <class T, Ncv32u tiNumScanThreads>
152 inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
153 {
154     if (tiNumScanThreads > K_WARP_SIZE)
155     {
156         //Bottom-level inclusive warp scan
157         T warpResult = warpScanInclusive(idata, s_Data);
158 
159         //Save top elements of each warp for exclusive warp scan
160         //sync to wait for warp scans to complete (because s_Data is being overwritten)
161         __syncthreads();
162         if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
163         {
164             s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
165         }
166 
167         //wait for warp scans to complete
168         __syncthreads();
169 
170         if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
171         {
172             //grab top warp elements
173             T val = s_Data[threadIdx.x];
174             //calculate exclusive scan and write back to shared memory
175             s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
176         }
177 
178         //return updated warp scans with exclusive scan results
179         __syncthreads();
180         return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
181     }
182     else
183     {
184         return warpScanInclusive(idata, s_Data);
185     }
186 }
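// A usage sketch for the block-level scan (illustrative comment only; it mirrors how the
// scanRows kernel below calls it, with NUM_SCAN_THREADS threads per block and a shared
// buffer of twice that many elements for the non-shuffle warp-scan path):
//
//     __shared__ Ncv32u shmem[NUM_SCAN_THREADS * 2];
//     Ncv32u runningSum = blockScanInclusive<Ncv32u, NUM_SCAN_THREADS>(myElem, shmem);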
187 
188 
189 //==============================================================================
190 //
191 // IntegralImage.cu
192 //
193 //==============================================================================
194 
195 
196 const Ncv32u NUM_SCAN_THREADS = 256;
197 const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
198 
199 
200 template<class T_in, class T_out>
201 struct _scanElemOp
202 {
203     template<bool tbDoSqr>
204     static inline __host__ __device__ T_out scanElemOp(T_in elem)
205     {
206         return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
207     }
208 
209 private:
210 
211     template <int v> struct Int2Type { enum { value = v }; };
212 
213     static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>)
214     {
215         return (T_out)elem;
216     }
217 
218     static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>)
219     {
220         return (T_out)(elem*elem);
221     }
222 };
223 
224 
225 template<class T>
226 inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
227 
228 
229 template<>
230 inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
231 {
232     return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
233 }
234 
235 
236 template<>
237 inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
238 {
239     return d_src[curElemOffs];
240 }
241 
242 
243 template<>
244 inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
245 {
246     return d_src[curElemOffs];
247 }
248 
249 
250 /**
251 * \brief Segmented scan kernel
252 *
253 * Calculates per-row prefix scans of the input image.
254 * Out-of-bounds safe: reads 'size' elements, writes 'size+1' elements
255 *
256 * \tparam T_in      Type of input image elements
257 * \tparam T_out     Type of output image elements
258 * \tparam T_op      Defines an operation to be performed on the input image pixels
259 *
260 * \param d_src      [IN] Source image pointer
261 * \param srcWidth   [IN] Source image width
262 * \param srcStride  [IN] Source image stride
263 * \param d_II       [OUT] Output image pointer
264 * \param IIstride   [IN] Output image stride
265 *
266 * \return None
267 */
268 template <class T_in, class T_out, bool tbDoSqr>
269 __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
270                          T_out *d_II, Ncv32u IIstride)
271 {
272     //advance pointers to the current line
273     if (sizeof(T_in) != 1)
274     {
275         d_src += srcStride * blockIdx.x;
276     }
277     //an 8-bit source is read through the texture reference tex8u, so d_src is not advanced
278     d_II += IIstride * blockIdx.x;
279 
280     Ncv32u numBuckets = (srcWidth + NUM_SCAN_THREADS - 1) >> LOG2_NUM_SCAN_THREADS;
281     Ncv32u offsetX = 0;
282 
283     __shared__ T_out shmem[NUM_SCAN_THREADS * 2];
284     __shared__ T_out carryElem;
285     carryElem = 0;
286     __syncthreads();
287 
288     while (numBuckets--)
289     {
290         Ncv32u curElemOffs = offsetX + threadIdx.x;
291         T_out curScanElem;
292 
293         T_in curElem;
294         T_out curElemMod;
295 
296         if (curElemOffs < srcWidth)
297         {
298             //load elements
299             curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
300         }
301         curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
302 
303         //inclusive scan
304         curScanElem = blockScanInclusive<T_out, NUM_SCAN_THREADS>(curElemMod, shmem);
305 
306         if (curElemOffs <= srcWidth)
307         {
308             //make scan exclusive and write the bucket to the output buffer
309             d_II[curElemOffs] = carryElem + curScanElem - curElemMod;
310             offsetX += NUM_SCAN_THREADS;
311         }
312 
313         //remember last element for subsequent buckets adjustment
314         __syncthreads();
315         if (threadIdx.x == NUM_SCAN_THREADS-1)
316         {
317             carryElem += curScanElem;
318         }
319         __syncthreads();
320     }
321 
322     if (offsetX == srcWidth && !threadIdx.x)
323     {
324         d_II[offsetX] = carryElem;
325     }
326 }
327 
328 
329 template <bool tbDoSqr, class T_in, class T_out>
330 NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
331                                 T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
332 {
333     cudaChannelFormatDesc cfdTex;
334     size_t alignmentOffset = 0;
335     if (sizeof(T_in) == 1)
336     {
337         cfdTex = cudaCreateChannelDesc<Ncv8u>();
338         ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
339         if (alignmentOffset > 0)
340         {
341             ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
342             ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
343         }
344     }
345     scanRows
346         <T_in, T_out, tbDoSqr>
347         <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
348         (d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
349 
350     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
351 
352     return NPPST_SUCCESS;
353 }
354 
355 
356 static Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
357 {
358     Ncv32u alignMask = allocatorAlignment-1;
359     Ncv32u inverseAlignMask = ~alignMask;
360     Ncv32u dimBytes = dim * elemTypeSize;
361     Ncv32u pitch = (dimBytes + alignMask) & inverseAlignMask;
362     Ncv32u PaddedDim = pitch / elemTypeSize;
363     return PaddedDim;
364 }
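// Worked example (illustrative comment): with a 128-byte allocator alignment and 32-bit
// elements, a row of 1001 elements occupies 4004 bytes; the pitch rounds up to
// (4004 + 127) & ~127 = 4096 bytes, so getPaddedDimension returns 4096 / 4 = 1024
// elements per padded row.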
365 
366 
367 template <class T_in, class T_out>
368 NCVStatus ncvIntegralImage_device(T_in *d_src, Ncv32u srcStep,
369                                   T_out *d_dst, Ncv32u dstStep, NcvSize32u roi,
370                                   INCVMemAllocator &gpuAllocator)
371 {
372     ncvAssertReturn(sizeof(T_out) == sizeof(Ncv32u), NPPST_MEM_INTERNAL_ERROR);
373     ncvAssertReturn(gpuAllocator.memType() == NCVMemoryTypeDevice ||
374                       gpuAllocator.memType() == NCVMemoryTypeNone, NPPST_MEM_RESIDENCE_ERROR);
375     ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
376     ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
377     ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
378     ncvAssertReturn(srcStep >= roi.width * sizeof(T_in) &&
379                       dstStep >= (roi.width + 1) * sizeof(T_out) &&
380                       srcStep % sizeof(T_in) == 0 &&
381                       dstStep % sizeof(T_out) == 0, NPPST_INVALID_STEP);
382     srcStep /= sizeof(T_in);
383     dstStep /= sizeof(T_out);
384 
385     Ncv32u WidthII = roi.width + 1;
386     Ncv32u HeightII = roi.height + 1;
387     Ncv32u PaddedWidthII32 = getPaddedDimension(WidthII, sizeof(Ncv32u), gpuAllocator.alignment());
388     Ncv32u PaddedHeightII32 = getPaddedDimension(HeightII, sizeof(Ncv32u), gpuAllocator.alignment());
389 
390     NCVMatrixAlloc<T_out> Tmp32_1(gpuAllocator, PaddedWidthII32, PaddedHeightII32);
391     ncvAssertReturn(gpuAllocator.isCounting() || Tmp32_1.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
392     NCVMatrixAlloc<T_out> Tmp32_2(gpuAllocator, PaddedHeightII32, PaddedWidthII32);
393     ncvAssertReturn(gpuAllocator.isCounting() || Tmp32_2.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
394     ncvAssertReturn(Tmp32_1.pitch() * Tmp32_1.height() == Tmp32_2.pitch() * Tmp32_2.height(), NPPST_MEM_INTERNAL_ERROR);
395 
396     NCVStatus ncvStat;
397     NCV_SET_SKIP_COND(gpuAllocator.isCounting());
398 
399     NCV_SKIP_COND_BEGIN
400 
401     ncvStat = scanRowsWrapperDevice
402         <false>
403         (d_src, srcStep, Tmp32_1.ptr(), PaddedWidthII32, roi);
404     ncvAssertReturnNcvStat(ncvStat);
405 
406     ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)Tmp32_1.ptr(), PaddedWidthII32*sizeof(Ncv32u),
407                                       (Ncv32u *)Tmp32_2.ptr(), PaddedHeightII32*sizeof(Ncv32u), NcvSize32u(WidthII, roi.height));
408     ncvAssertReturnNcvStat(ncvStat);
409 
410     ncvStat = scanRowsWrapperDevice
411         <false>
412         (Tmp32_2.ptr(), PaddedHeightII32, Tmp32_1.ptr(), PaddedHeightII32, NcvSize32u(roi.height, WidthII));
413     ncvAssertReturnNcvStat(ncvStat);
414 
415     ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)Tmp32_1.ptr(), PaddedHeightII32*sizeof(Ncv32u),
416                                       (Ncv32u *)d_dst, dstStep*sizeof(Ncv32u), NcvSize32u(HeightII, WidthII));
417     ncvAssertReturnNcvStat(ncvStat);
418 
419     NCV_SKIP_COND_END
420 
421     return NPPST_SUCCESS;
422 }
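// The routine above builds an output with (roi.width + 1) columns and (roi.height + 1)
// rows in four passes: a row-wise prefix scan, a transpose, a second row-wise scan over
// the former columns, and a transpose back into d_dst. The result satisfies the usual
// integral-image definition (illustrative comment, 0-based indices into the padded output):
//
//     II(y, x) = sum of src(i, j) over all 0 <= i < y, 0 <= j < x,  with II(0, *) = II(*, 0) = 0
//
// so any rectangle sum can later be recovered with four lookups:
//
//     sum = II(y1, x1) - II(y0, x1) - II(y1, x0) + II(y0, x0)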
423 
424 
425 NCVStatus ncvSquaredIntegralImage_device(Ncv8u *d_src, Ncv32u srcStep,
426                                          Ncv64u *d_dst, Ncv32u dstStep, NcvSize32u roi,
427                                          INCVMemAllocator &gpuAllocator)
428 {
429     ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
430     ncvAssertReturn(gpuAllocator.memType() == NCVMemoryTypeDevice ||
431                       gpuAllocator.memType() == NCVMemoryTypeNone, NPPST_MEM_RESIDENCE_ERROR);
432     ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
433     ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
434     ncvAssertReturn(srcStep >= roi.width &&
435                       dstStep >= (roi.width + 1) * sizeof(Ncv64u) &&
436                       dstStep % sizeof(Ncv64u) == 0, NPPST_INVALID_STEP);
437     dstStep /= sizeof(Ncv64u);
438 
439     Ncv32u WidthII = roi.width + 1;
440     Ncv32u HeightII = roi.height + 1;
441     Ncv32u PaddedWidthII32 = getPaddedDimension(WidthII, sizeof(Ncv32u), gpuAllocator.alignment());
442     Ncv32u PaddedHeightII32 = getPaddedDimension(HeightII, sizeof(Ncv32u), gpuAllocator.alignment());
443     Ncv32u PaddedWidthII64 = getPaddedDimension(WidthII, sizeof(Ncv64u), gpuAllocator.alignment());
444     Ncv32u PaddedHeightII64 = getPaddedDimension(HeightII, sizeof(Ncv64u), gpuAllocator.alignment());
445     Ncv32u PaddedWidthMax = PaddedWidthII32 > PaddedWidthII64 ? PaddedWidthII32 : PaddedWidthII64;
446     Ncv32u PaddedHeightMax = PaddedHeightII32 > PaddedHeightII64 ? PaddedHeightII32 : PaddedHeightII64;
447 
448     NCVMatrixAlloc<Ncv32u> Tmp32_1(gpuAllocator, PaddedWidthII32, PaddedHeightII32);
449     ncvAssertReturn(Tmp32_1.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
450     NCVMatrixAlloc<Ncv64u> Tmp64(gpuAllocator, PaddedWidthMax, PaddedHeightMax);
451     ncvAssertReturn(Tmp64.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
452 
453     NCVMatrixReuse<Ncv32u> Tmp32_2(Tmp64.getSegment(), gpuAllocator.alignment(), PaddedWidthII32, PaddedHeightII32);
454     ncvAssertReturn(Tmp32_2.isMemReused(), NPPST_MEM_INTERNAL_ERROR);
455     NCVMatrixReuse<Ncv64u> Tmp64_2(Tmp64.getSegment(), gpuAllocator.alignment(), PaddedWidthII64, PaddedHeightII64);
456     ncvAssertReturn(Tmp64_2.isMemReused(), NPPST_MEM_INTERNAL_ERROR);
457 
458     NCVStatus ncvStat;
459     NCV_SET_SKIP_COND(gpuAllocator.isCounting());
460 
461     NCV_SKIP_COND_BEGIN
462 
463     ncvStat = scanRowsWrapperDevice
464         <true, Ncv8u, Ncv32u>
465         (d_src, srcStep, Tmp32_2.ptr(), PaddedWidthII32, roi);
466     ncvAssertReturnNcvStat(ncvStat);
467 
468     ncvStat = nppiStTranspose_32u_C1R(Tmp32_2.ptr(), PaddedWidthII32*sizeof(Ncv32u),
469                                       Tmp32_1.ptr(), PaddedHeightII32*sizeof(Ncv32u), NcvSize32u(WidthII, roi.height));
470     ncvAssertReturnNcvStat(ncvStat);
471 
472     ncvStat = scanRowsWrapperDevice
473         <false, Ncv32u, Ncv64u>
474         (Tmp32_1.ptr(), PaddedHeightII32, Tmp64_2.ptr(), PaddedHeightII64, NcvSize32u(roi.height, WidthII));
475     ncvAssertReturnNcvStat(ncvStat);
476 
477     ncvStat = nppiStTranspose_64u_C1R(Tmp64_2.ptr(), PaddedHeightII64*sizeof(Ncv64u),
478                                       d_dst, dstStep*sizeof(Ncv64u), NcvSize32u(HeightII, WidthII));
479     ncvAssertReturnNcvStat(ncvStat);
480 
481     NCV_SKIP_COND_END
482 
483     return NPPST_SUCCESS;
484 }
485 
486 
487 NCVStatus nppiStIntegralGetSize_8u32u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
488 {
489     ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
490     ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
491 
492     NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
493     ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
494 
495     NCVStatus ncvStat = ncvIntegralImage_device((Ncv8u*)NULL, roiSize.width,
496                                                   (Ncv32u*)NULL, (roiSize.width+1) * sizeof(Ncv32u),
497                                                   roiSize, gpuCounter);
498     ncvAssertReturnNcvStat(ncvStat);
499 
500     *pBufsize = (Ncv32u)gpuCounter.maxSize();
501     return NPPST_SUCCESS;
502 }
503 
504 
505 NCVStatus nppiStIntegralGetSize_32f32f(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
506 {
507     ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
508     ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
509 
510     NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
511     ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
512 
513     NCVStatus ncvStat = ncvIntegralImage_device((Ncv32f*)NULL, roiSize.width * sizeof(Ncv32f),
514                                                   (Ncv32f*)NULL, (roiSize.width+1) * sizeof(Ncv32f),
515                                                   roiSize, gpuCounter);
516     ncvAssertReturnNcvStat(ncvStat);
517 
518     *pBufsize = (Ncv32u)gpuCounter.maxSize();
519     return NPPST_SUCCESS;
520 }
521 
522 
523 NCVStatus nppiStSqrIntegralGetSize_8u64u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
524 {
525     ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
526     ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
527 
528     NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
529     ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
530 
531     NCVStatus ncvStat = ncvSquaredIntegralImage_device(NULL, roiSize.width,
532                                                          NULL, (roiSize.width+1) * sizeof(Ncv64u),
533                                                          roiSize, gpuCounter);
534     ncvAssertReturnNcvStat(ncvStat);
535 
536     *pBufsize = (Ncv32u)gpuCounter.maxSize();
537     return NPPST_SUCCESS;
538 }
539 
540 
541 NCVStatus nppiStIntegral_8u32u_C1R(Ncv8u *d_src, Ncv32u srcStep,
542                                    Ncv32u *d_dst, Ncv32u dstStep,
543                                    NcvSize32u roiSize, Ncv8u *pBuffer,
544                                    Ncv32u bufSize, cudaDeviceProp &devProp)
545 {
546     NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
547     ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
548 
549     NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
550     ncvAssertReturnNcvStat(ncvStat);
551 
552     return NPPST_SUCCESS;
553 }
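// Typical call sequence (illustrative comment only; the device pointers, pitches, width
// and height below are hypothetical placeholders supplied by the caller):
//
//     cudaDeviceProp devProp;
//     cudaGetDeviceProperties(&devProp, 0);
//     NcvSize32u roi(width, height);
//     Ncv32u bufSize = 0;
//     nppiStIntegralGetSize_8u32u(roi, &bufSize, devProp);          // counting pass, no work done
//     Ncv8u *pBuffer = NULL;
//     cudaMalloc((void **)&pBuffer, bufSize);                       // scratch for the temporaries
//     nppiStIntegral_8u32u_C1R(d_src, srcPitch, d_sum, sumPitch,    // sumPitch >= (width + 1) * 4
//                              roi, pBuffer, bufSize, devProp);
//     cudaFree(pBuffer);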
554 
555 
556 NCVStatus nppiStIntegral_32f32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
557                                     Ncv32f *d_dst, Ncv32u dstStep,
558                                     NcvSize32u roiSize, Ncv8u *pBuffer,
559                                     Ncv32u bufSize, cudaDeviceProp &devProp)
560 {
561     NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
562     ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
563 
564     NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
565     ncvAssertReturnNcvStat(ncvStat);
566 
567     return NPPST_SUCCESS;
568 }
569 
570 
571 NCVStatus nppiStSqrIntegral_8u64u_C1R(Ncv8u *d_src, Ncv32u srcStep,
572                                       Ncv64u *d_dst, Ncv32u dstStep,
573                                       NcvSize32u roiSize, Ncv8u *pBuffer,
574                                       Ncv32u bufSize, cudaDeviceProp &devProp)
575 {
576     NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
577     ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
578 
579     NCVStatus ncvStat = ncvSquaredIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
580     ncvAssertReturnNcvStat(ncvStat);
581 
582     return NPPST_SUCCESS;
583 }
584 
585 
586 NCVStatus nppiStIntegral_8u32u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
587                                         Ncv32u *h_dst, Ncv32u dstStep,
588                                         NcvSize32u roiSize)
589 {
590     ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
591     ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
592     ncvAssertReturn(srcStep >= roiSize.width &&
593                       dstStep >= (roiSize.width + 1) * sizeof(Ncv32u) &&
594                       dstStep % sizeof(Ncv32u) == 0, NPPST_INVALID_STEP);
595     dstStep /= sizeof(Ncv32u);
596 
597     Ncv32u WidthII = roiSize.width + 1;
598     Ncv32u HeightII = roiSize.height + 1;
599 
600     memset(h_dst, 0, WidthII * sizeof(Ncv32u));
601     for (Ncv32u i=1; i<HeightII; i++)
602     {
603         h_dst[i * dstStep] = 0;
604         for (Ncv32u j=1; j<WidthII; j++)
605         {
606             Ncv32u top = h_dst[(i-1) * dstStep + j];
607             Ncv32u left = h_dst[i * dstStep + (j - 1)];
608             Ncv32u topleft = h_dst[(i - 1) * dstStep + (j - 1)];
609             Ncv32u elem = h_src[(i - 1) * srcStep + (j - 1)];
610             h_dst[i * dstStep + j] = elem + left - topleft + top;
611         }
612     }
613 
614     return NPPST_SUCCESS;
615 }
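// Worked example (illustrative comment): for the 2x2 source {{1, 2}, {3, 4}} the
// recurrence above yields the 3x3 integral image
//
//     0  0  0
//     0  1  3
//     0  4 10
//
// e.g. the bottom-right entry is elem(4) + left(4) - topleft(1) + top(3) = 10, the sum of
// all four source pixels.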
616 
617 
618 NCVStatus nppiStIntegral_32f32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
619                                          Ncv32f *h_dst, Ncv32u dstStep,
620                                          NcvSize32u roiSize)
621 {
622     ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
623     ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
624     ncvAssertReturn(srcStep >= roiSize.width * sizeof(Ncv32f) &&
625                       dstStep >= (roiSize.width + 1) * sizeof(Ncv32f) &&
626                       srcStep % sizeof(Ncv32f) == 0 &&
627                       dstStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
628     srcStep /= sizeof(Ncv32f);
629     dstStep /= sizeof(Ncv32f);
630 
631     Ncv32u WidthII = roiSize.width + 1;
632     Ncv32u HeightII = roiSize.height + 1;
633 
634     memset(h_dst, 0, WidthII * sizeof(Ncv32u));
635     for (Ncv32u i=1; i<HeightII; i++)
636     {
637         h_dst[i * dstStep] = 0.0f;
638         for (Ncv32u j=1; j<WidthII; j++)
639         {
640             Ncv32f top = h_dst[(i-1) * dstStep + j];
641             Ncv32f left = h_dst[i * dstStep + (j - 1)];
642             Ncv32f topleft = h_dst[(i - 1) * dstStep + (j - 1)];
643             Ncv32f elem = h_src[(i - 1) * srcStep + (j - 1)];
644             h_dst[i * dstStep + j] = elem + left - topleft + top;
645         }
646     }
647 
648     return NPPST_SUCCESS;
649 }
650 
651 
652 NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
653                                            Ncv64u *h_dst, Ncv32u dstStep,
654                                            NcvSize32u roiSize)
655 {
656     ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
657     ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
658     ncvAssertReturn(srcStep >= roiSize.width &&
659                       dstStep >= (roiSize.width + 1) * sizeof(Ncv64u) &&
660                       dstStep % sizeof(Ncv64u) == 0, NPPST_INVALID_STEP);
661     dstStep /= sizeof(Ncv64u);
662 
663     Ncv32u WidthII = roiSize.width + 1;
664     Ncv32u HeightII = roiSize.height + 1;
665 
666     memset(h_dst, 0, WidthII * sizeof(Ncv64u));
667     for (Ncv32u i=1; i<HeightII; i++)
668     {
669         h_dst[i * dstStep] = 0;
670         for (Ncv32u j=1; j<WidthII; j++)
671         {
672             Ncv64u top = h_dst[(i-1) * dstStep + j];
673             Ncv64u left = h_dst[i * dstStep + (j - 1)];
674             Ncv64u topleft = h_dst[(i - 1) * dstStep + (j - 1)];
675             Ncv64u elem = h_src[(i - 1) * srcStep + (j - 1)];
676             h_dst[i * dstStep + j] = elem*elem + left - topleft + top;
677         }
678     }
679 
680     return NPPST_SUCCESS;
681 }
682 
683 
684 //==============================================================================
685 //
686 // Decimate.cu
687 //
688 //==============================================================================
689 
690 
691 const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
692 const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
693 
694 
695 template<class T, NcvBool tbCacheTexture>
696 __device__ T getElem_Decimate(Ncv32u x, T *d_src);
697 
698 
699 template<>
700 __device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
701 {
702     return tex1Dfetch(tex32u, x);
703 }
704 
705 
706 template<>
707 __device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
708 {
709     return d_src[x];
710 }
711 
712 
713 template<>
714 __device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
715 {
716     uint2 tmp = tex1Dfetch(tex64u, x);
717     Ncv64u res = (Ncv64u)tmp.y;
718     res <<= 32;
719     res |= tmp.x;
720     return res;
721 }
722 
723 
724 template<>
725 __device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
726 {
727     return d_src[x];
728 }
729 
730 
731 template <class T, NcvBool tbCacheTexture>
732 __global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
733                                       NcvSize32u dstRoi, Ncv32u scale)
734 {
735     int curX = blockIdx.x * blockDim.x + threadIdx.x;
736     int curY = blockIdx.y * blockDim.y + threadIdx.y;
737 
738     if (curX >= dstRoi.width || curY >= dstRoi.height)
739     {
740         return;
741     }
742 
743     d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
744 }
745 
746 
747 template <class T>
748 static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
749                                                 T *d_dst, Ncv32u dstStep,
750                                                 NcvSize32u srcRoi, Ncv32u scale,
751                                                 NcvBool readThruTexture)
752 {
753     ncvAssertReturn(d_src != NULL && d_dst != NULL, NPPST_NULL_POINTER_ERROR);
754     ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
755     ncvAssertReturn(scale != 0, NPPST_INVALID_SCALE);
756     ncvAssertReturn(srcStep >= (Ncv32u)(srcRoi.width) * sizeof(T) &&
757                       dstStep >= (Ncv32u)(srcRoi.width * sizeof(T) / scale), NPPST_INVALID_STEP);
758     srcStep /= sizeof(T);
759     dstStep /= sizeof(T);
760 
761     NcvSize32u dstRoi;
762     dstRoi.width = srcRoi.width / scale;
763     dstRoi.height = srcRoi.height / scale;
764 
765     dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
766               (dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
767     dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
768 
769     if (!readThruTexture)
770     {
771         decimate_C1R
772             <T, false>
773             <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
774             (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
775     }
776     else
777     {
778         cudaChannelFormatDesc cfdTexSrc;
779 
780         if (sizeof(T) == sizeof(Ncv32u))
781         {
782             cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
783 
784             size_t alignmentOffset;
785             ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
786             ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
787         }
788         else
789         {
790             cfdTexSrc = cudaCreateChannelDesc<uint2>();
791 
792             size_t alignmentOffset;
793             ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
794             ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
795         }
796 
797         decimate_C1R
798             <T, true>
799             <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
800             (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
801     }
802 
803     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
804 
805     return NPPST_SUCCESS;
806 }
807 
808 
809 template <class T>
810 static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
811                                               T *h_dst, Ncv32u dstStep,
812                                               NcvSize32u srcRoi, Ncv32u scale)
813 {
814     ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
815     ncvAssertReturn(srcRoi.width != 0 && srcRoi.height != 0, NPPST_INVALID_ROI);
816     ncvAssertReturn(scale != 0, NPPST_INVALID_SCALE);
817     ncvAssertReturn(srcStep >= (Ncv32u)(srcRoi.width) * sizeof(T) &&
818                       dstStep >= (Ncv32u)(srcRoi.width * sizeof(T) / scale) &&
819                       srcStep % sizeof(T) == 0 && dstStep % sizeof(T) == 0, NPPST_INVALID_STEP);
820     srcStep /= sizeof(T);
821     dstStep /= sizeof(T);
822 
823     NcvSize32u dstRoi;
824     dstRoi.width = srcRoi.width / scale;
825     dstRoi.height = srcRoi.height / scale;
826 
827     for (Ncv32u i=0; i<dstRoi.height; i++)
828     {
829         for (Ncv32u j=0; j<dstRoi.width; j++)
830         {
831             h_dst[i*dstStep+j] = h_src[i*scale*srcStep + j*scale];
832         }
833     }
834 
835     return NPPST_SUCCESS;
836 }
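// Worked example (illustrative comment): with scale = 2 the wrappers above keep every
// second pixel of every second row, so an 8 x 6 ROI yields a 4 x 3 output with
// dst(i, j) = src(2 * i, 2 * j); a 7 x 7 ROI with scale = 3 yields 2 x 2 (integer
// division of the ROI by the scale).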
837 
838 
839 #define implementNppDecimate(bit, typ) \
840     NCVStatus nppiStDecimate_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
841                                                      Ncv##bit##typ *d_dst, Ncv32u dstStep, \
842                                                      NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture) \
843     { \
844         return decimateWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
845                                                            (Ncv##bit##u *)d_dst, dstStep, \
846                                                            srcRoi, scale, readThruTexture); \
847     }
848 
849 
850 #define implementNppDecimateHost(bit, typ) \
851     NCVStatus nppiStDecimate_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
852                                                           Ncv##bit##typ *h_dst, Ncv32u dstStep, \
853                                                           NcvSize32u srcRoi, Ncv32u scale) \
854     { \
855         return decimateWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
856                                                          (Ncv##bit##u *)h_dst, dstStep, \
857                                                          srcRoi, scale); \
858     }
859 
860 
861 implementNppDecimate(32, u)
862 implementNppDecimate(32, s)
863 implementNppDecimate(32, f)
864 implementNppDecimate(64, u)
865 implementNppDecimate(64, s)
866 implementNppDecimate(64, f)
867 implementNppDecimateHost(32, u)
868 implementNppDecimateHost(32, s)
869 implementNppDecimateHost(32, f)
870 implementNppDecimateHost(64, u)
871 implementNppDecimateHost(64, s)
872 implementNppDecimateHost(64, f)
873 
874 
875 //==============================================================================
876 //
877 // RectStdDev.cu
878 //
879 //==============================================================================
880 
881 
882 const Ncv32u NUM_RECTSTDDEV_THREADS = 128;
883 
884 
885 template <NcvBool tbCacheTexture>
886 __device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum)
887 {
888     if (tbCacheTexture)
889     {
890         return tex1Dfetch(tex32u, x);
891     }
892     else
893     {
894         return d_sum[x];
895     }
896 }
897 
898 
899 template <NcvBool tbCacheTexture>
900 __device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum)
901 {
902     if (tbCacheTexture)
903     {
904         uint2 tmp = tex1Dfetch(tex64u, x);
905         Ncv64u res = (Ncv64u)tmp.y;
906         res <<= 32;
907         res |= tmp.x;
908         return res;
909     }
910     else
911     {
912         return d_sqsum[x];
913     }
914 }
915 
916 
917 template <NcvBool tbCacheTexture>
918 __global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
919                                    Ncv64u *d_sqsum, Ncv32u sqsumStep,
920                                    Ncv32f *d_norm, Ncv32u normStep,
921                                    NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
922 {
923     Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x;
924     if (x_offs >= roi.width)
925     {
926         return;
927     }
928 
929     Ncv32u sum_offset = blockIdx.y * sumStep + x_offs;
930     Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs;
931 
932     //OPT: try swapping order (could change cache hit/miss ratio)
933     Ncv32u sum_tl = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x, d_sum);
934     Ncv32u sum_bl = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
935     Ncv32u sum_tr = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
936     Ncv32u sum_br = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
937     Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl;
938 
939     Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br;
940     sqsum_tl = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
941     sqsum_bl = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
942     sqsum_tr = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
943     sqsum_br = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
944     Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl;
945 
946     Ncv32f mean = sum_val * invRectArea;
947 
948     //////////////////////////////////////////////////////////////////////////
949     // sqsum_val_res = sqsum_val / rectArea
950     //////////////////////////////////////////////////////////////////////////
951 
952     Ncv32f sqsum_val_1 = __ull2float_rz(sqsum_val);
953     Ncv64u sqsum_val_2 = __float2ull_rz(sqsum_val_1);
954     Ncv64u sqsum_val_3 = sqsum_val - sqsum_val_2;
955     Ncv32f sqsum_val_4 = __ull2float_rn(sqsum_val_3);
956     sqsum_val_1 *= invRectArea;
957     sqsum_val_4 *= invRectArea;
958     Ncv32f sqsum_val_res = sqsum_val_1 + sqsum_val_4;
959 
960     //////////////////////////////////////////////////////////////////////////
961     // variance = sqsum_val_res - mean * mean
962     //////////////////////////////////////////////////////////////////////////
963 
964 #if defined DISABLE_MAD_SELECTIVELY
965     Ncv32f variance = sqsum_val_2 - __fmul_rn(mean, mean);
966 #else
967     Ncv32f variance = sqsum_val_res - mean * mean;
968 #endif
969 
970     //////////////////////////////////////////////////////////////////////////
971     // stddev = sqrtf(variance)
972     //////////////////////////////////////////////////////////////////////////
973 
974     //Ncv32f stddev = sqrtf(variance);
975     Ncv32f stddev = __fsqrt_rn(variance);
976 
977     d_norm[blockIdx.y * normStep + x_offs] = stddev;
978 }
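// The 64-bit squared sum is divided by the rectangle area in two float pieces to limit
// the precision loss of a single 64-bit -> 32-bit conversion (illustrative comment
// restating the arithmetic above; no new behaviour):
//
//     head     = float(sqsum_val), rounded toward zero
//     residual = float(sqsum_val - (Ncv64u)head)
//     E[x^2]  ~= head * invRectArea + residual * invRectArea
//     variance = E[x^2] - mean * mean,    stddev = __fsqrt_rn(variance)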
979 
980 
981 NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
982                                    Ncv64u *d_sqsum, Ncv32u sqsumStep,
983                                    Ncv32f *d_norm, Ncv32u normStep,
984                                    NcvSize32u roi, NcvRect32u rect,
985                                    Ncv32f scaleArea, NcvBool readThruTexture)
986 {
987     ncvAssertReturn(d_sum != NULL && d_sqsum != NULL && d_norm != NULL, NPPST_NULL_POINTER_ERROR);
988     ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
989     ncvAssertReturn(sumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv32u) &&
990                       sqsumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv64u) &&
991                       normStep >= (Ncv32u)roi.width * sizeof(Ncv32f) &&
992                       sumStep % sizeof(Ncv32u) == 0 &&
993                       sqsumStep % sizeof(Ncv64u) == 0 &&
994                       normStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
995     ncvAssertReturn(scaleArea >= 1.0f, NPPST_INVALID_SCALE);
996     sumStep /= sizeof(Ncv32u);
997     sqsumStep /= sizeof(Ncv64u);
998     normStep /= sizeof(Ncv32f);
999 
1000     Ncv32f rectArea = rect.width * rect.height * scaleArea;
1001     Ncv32f invRectArea = 1.0f / rectArea;
1002 
1003     dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height);
1004     dim3 block(NUM_RECTSTDDEV_THREADS);
1005 
1006     if (!readThruTexture)
1007     {
1008         rectStdDev_32f_C1R
1009             <false>
1010             <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1011             (d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
1012     }
1013     else
1014     {
1015         cudaChannelFormatDesc cfdTexSrc;
1016         cudaChannelFormatDesc cfdTexSqr;
1017         cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
1018         cfdTexSqr = cudaCreateChannelDesc<uint2>();
1019 
1020         size_t alignmentOffset;
1021         ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_sum, cfdTexSrc, (roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u)), NPPST_TEXTURE_BIND_ERROR);
1022         ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
1023         ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_sqsum, cfdTexSqr, (roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u)), NPPST_TEXTURE_BIND_ERROR);
1024         ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
1025 
1026         rectStdDev_32f_C1R
1027             <true>
1028             <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1029             (NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
1030     }
1031 
1032     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1033 
1034     return NPPST_SUCCESS;
1035 }
1036 
1037 
1038 NCVStatus nppiStRectStdDev_32f_C1R_host(Ncv32u *h_sum, Ncv32u sumStep,
1039                                         Ncv64u *h_sqsum, Ncv32u sqsumStep,
1040                                         Ncv32f *h_norm, Ncv32u normStep,
1041                                         NcvSize32u roi, NcvRect32u rect,
1042                                         Ncv32f scaleArea)
1043 {
1044     ncvAssertReturn(h_sum != NULL && h_sqsum != NULL && h_norm != NULL, NPPST_NULL_POINTER_ERROR);
1045     ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
1046     ncvAssertReturn(sumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv32u) &&
1047                       sqsumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv64u) &&
1048                       normStep >= (Ncv32u)roi.width * sizeof(Ncv32f) &&
1049                       sumStep % sizeof(Ncv32u) == 0 &&
1050                       sqsumStep % sizeof(Ncv64u) == 0 &&
1051                       normStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
1052     ncvAssertReturn(scaleArea >= 1.0f, NPPST_INVALID_SCALE);
1053     sumStep /= sizeof(Ncv32u);
1054     sqsumStep /= sizeof(Ncv64u);
1055     normStep /= sizeof(Ncv32f);
1056 
1057     Ncv32f rectArea = rect.width * rect.height * scaleArea;
1058     Ncv32f invRectArea = 1.0f / rectArea;
1059 
1060     for (Ncv32u i=0; i<roi.height; i++)
1061     {
1062         for (Ncv32u j=0; j<roi.width; j++)
1063         {
1064             Ncv32u sum_offset = i * sumStep + j;
1065             Ncv32u sqsum_offset = i * sqsumStep + j;
1066 
1067             Ncv32u sum_tl = h_sum[sum_offset + rect.y * sumStep + rect.x];
1068             Ncv32u sum_bl = h_sum[sum_offset + (rect.y + rect.height) * sumStep + rect.x];
1069             Ncv32u sum_tr = h_sum[sum_offset + rect.y * sumStep + rect.x + rect.width];
1070             Ncv32u sum_br = h_sum[sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width];
1071             Ncv64f sum_val = sum_br + sum_tl - sum_tr - sum_bl;
1072 
1073             Ncv64u sqsum_tl = h_sqsum[sqsum_offset + rect.y * sqsumStep + rect.x];
1074             Ncv64u sqsum_bl = h_sqsum[sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x];
1075             Ncv64u sqsum_tr = h_sqsum[sqsum_offset + rect.y * sqsumStep + rect.x + rect.width];
1076             Ncv64u sqsum_br = h_sqsum[sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width];
1077             Ncv64f sqsum_val = (Ncv64f)(sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl);
1078 
1079             Ncv64f mean = sum_val * invRectArea;
1080             Ncv64f sqsum_val_2 = sqsum_val / rectArea;
1081             Ncv64f variance = sqsum_val_2 - mean * mean;
1082 
1083             h_norm[i * normStep + j] = (Ncv32f)sqrt(variance);
1084         }
1085     }
1086 
1087     return NPPST_SUCCESS;
1088 }
1089 
1090 
1091 //==============================================================================
1092 //
1093 // Transpose.cu
1094 //
1095 //==============================================================================
1096 
1097 
1098 const Ncv32u TRANSPOSE_TILE_DIM   = 16;
1099 const Ncv32u TRANSPOSE_BLOCK_ROWS = 16;
1100 
1101 
1102 /**
1103 * \brief Matrix transpose kernel
1104 *
1105 * Calculates transpose of the input image
1106 * \see TRANSPOSE_TILE_DIM
1107 *
1108 * \tparam T         Type of image elements
1110 *
1111 * \param d_src      [IN] Source image pointer
1112 * \param srcStride  [IN] Source image stride
1113 * \param d_dst      [OUT] Output image pointer
1114 * \param dstStride  [IN] Output image stride
1115 *
1116 * \return None
1117 */
1118 template <class T>
1119 __global__ void transpose(T *d_src, Ncv32u srcStride,
1120                           T *d_dst, Ncv32u dstStride, NcvSize32u srcRoi)
1121 {
1122     __shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM+1];
1123 
1124     Ncv32u blockIdx_x, blockIdx_y;
1125 
1126     // do diagonal reordering
1127     if (gridDim.x == gridDim.y)
1128     {
1129         blockIdx_y = blockIdx.x;
1130         blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
1131     }
1132     else
1133     {
1134         Ncv32u bid = blockIdx.x + gridDim.x * blockIdx.y;
1135         blockIdx_y = bid % gridDim.y;
1136         blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
1137     }
1138 
1139     Ncv32u xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
1140     Ncv32u yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
1141     Ncv32u index_gmem = xIndex + yIndex * srcStride;
1142 
1143     if (xIndex < srcRoi.width)
1144     {
1145         for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
1146         {
1147             if (yIndex + i < srcRoi.height)
1148             {
1149                 tile[threadIdx.y+i][threadIdx.x] = d_src[index_gmem+i*srcStride];
1150             }
1151         }
1152     }
1153 
1154     __syncthreads();
1155 
1156     xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
1157     yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
1158     index_gmem = xIndex + yIndex * dstStride;
1159 
1160     if (xIndex < srcRoi.height)
1161     {
1162         for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
1163         {
1164             if (yIndex + i < srcRoi.width)
1165             {
1166                 d_dst[index_gmem+i*dstStride] = tile[threadIdx.x][threadIdx.y+i];
1167             }
1168         }
1169     }
1170 }
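// Notes on the kernel above (illustrative comment): the "+1" column of the shared-memory
// tile staggers rows across banks so that the transposed, column-wise reads avoid
// shared-memory bank conflicts, and the diagonal block reordering spreads concurrently
// running blocks over different global-memory partitions (the classic partition-camping
// mitigation). A 16 x 16 tile of 32-bit elements uses 16 * 17 * 4 = 1088 bytes of shared
// memory per block.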
1171 
1172 
1173 template <class T>
1174 NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
1175                                    T *d_dst, Ncv32u dstStride, NcvSize32u srcRoi)
1176 {
1177     ncvAssertReturn(d_src != NULL && d_dst != NULL, NPPST_NULL_POINTER_ERROR);
1178     ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
1179     ncvAssertReturn(srcStride >= srcRoi.width * sizeof(T) &&
1180                       dstStride >= srcRoi.height * sizeof(T) &&
1181                       srcStride % sizeof(T) == 0 && dstStride % sizeof(T) == 0, NPPST_INVALID_STEP);
1182     srcStride /= sizeof(T);
1183     dstStride /= sizeof(T);
1184 
1185     dim3 grid((srcRoi.width + TRANSPOSE_TILE_DIM - 1) / TRANSPOSE_TILE_DIM,
1186               (srcRoi.height + TRANSPOSE_TILE_DIM - 1) / TRANSPOSE_TILE_DIM);
1187     dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
1188     transpose
1189         <T>
1190         <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1191         (d_src, srcStride, d_dst, dstStride, srcRoi);
1192     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1193 
1194     return NPPST_SUCCESS;
1195 }
1196 
1197 
1198 template <class T>
1199 static NCVStatus transposeWrapperHost(T *h_src, Ncv32u srcStride,
1200                                         T *h_dst, Ncv32u dstStride, NcvSize32u srcRoi)
1201 {
1202     ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
1203     ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
1204     ncvAssertReturn(srcStride >= srcRoi.width * sizeof(T) &&
1205                       dstStride >= srcRoi.height * sizeof(T) &&
1206                       srcStride % sizeof(T) == 0 && dstStride % sizeof(T) == 0, NPPST_INVALID_STEP);
1207     srcStride /= sizeof(T);
1208     dstStride /= sizeof(T);
1209 
1210     for (Ncv32u i=0; i<srcRoi.height; i++)
1211     {
1212         for (Ncv32u j=0; j<srcRoi.width; j++)
1213         {
1214             h_dst[j*dstStride+i] = h_src[i*srcStride + j];
1215         }
1216     }
1217 
1218     return NPPST_SUCCESS;
1219 }
1220 
1221 
1222 #define implementNppTranspose(bit, typ) \
1223     NCVStatus nppiStTranspose_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
1224                                              Ncv##bit##typ *d_dst, Ncv32u dstStep, NcvSize32u srcRoi) \
1225     { \
1226         return transposeWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
1227                                                    (Ncv##bit##u *)d_dst, dstStep, srcRoi); \
1228     }
1229 
1230 
1231 #define implementNppTransposeHost(bit, typ) \
1232     NCVStatus nppiStTranspose_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
1233                                                   Ncv##bit##typ *h_dst, Ncv32u dstStep, \
1234                                                   NcvSize32u srcRoi) \
1235     { \
1236         return transposeWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
1237                                                  (Ncv##bit##u *)h_dst, dstStep, srcRoi); \
1238     }
1239 
1240 
1241 implementNppTranspose(32,u)
1242 implementNppTranspose(32,s)
1243 implementNppTranspose(32,f)
1244 implementNppTranspose(64,u)
1245 implementNppTranspose(64,s)
1246 implementNppTranspose(64,f)
1247 
1248 implementNppTransposeHost(32,u)
1249 implementNppTransposeHost(32,s)
1250 implementNppTransposeHost(32,f)
1251 implementNppTransposeHost(64,u)
1252 implementNppTransposeHost(64,s)
1253 implementNppTransposeHost(64,f)
1254 
1255 
1256 NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
1257                                   void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
1258 {
1259     return transposeWrapperDevice<uint4>((uint4 *)d_src, srcStep, (uint4 *)d_dst, dstStep, srcRoi);
1260 }
1261 
1262 
1263 NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
1264                                        void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
1265 {
1266     return transposeWrapperHost<uint4>((uint4 *)d_src, srcStep, (uint4 *)d_dst, dstStep, srcRoi);
1267 }
1268 
1269 
1270 //==============================================================================
1271 //
1272 // Compact.cu
1273 //
1274 //==============================================================================
1275 
1276 
1277 const Ncv32u NUM_REMOVE_THREADS = 256;
1278 
1279 
1280 template <bool bRemove, bool bWritePartial>
1281 __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
1282                                 Ncv32u *d_offsets, Ncv32u *d_blockSums,
1283                                 Ncv32u elemRemove)
1284 {
1285     Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
1286     Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
1287 
1288     if (elemAddrIn > srcLen + blockDim.x)
1289     {
1290         return;
1291     }
1292 
1293     __shared__ Ncv32u shmem[NUM_REMOVE_THREADS * 2];
1294 
1295     Ncv32u scanElem = 0;
1296     if (elemAddrIn < srcLen)
1297     {
1298         if (bRemove)
1299         {
1300             scanElem = (d_src[elemAddrIn] != elemRemove) ? 1 : 0;
1301         }
1302         else
1303         {
1304             scanElem = d_src[elemAddrIn];
1305         }
1306     }
1307 
1308     Ncv32u localScanInc = blockScanInclusive<Ncv32u, NUM_REMOVE_THREADS>(scanElem, shmem);
1309     __syncthreads();
1310 
1311     if (elemAddrIn < srcLen)
1312     {
1313         if (threadIdx.x == NUM_REMOVE_THREADS-1 && bWritePartial)
1314         {
1315             d_blockSums[blockId] = localScanInc;
1316         }
1317 
1318         if (bRemove)
1319         {
1320             d_offsets[elemAddrIn] = localScanInc - scanElem;
1321         }
1322         else
1323         {
1324             d_src[elemAddrIn] = localScanInc - scanElem;
1325         }
1326     }
1327 }
1328 
1329 
1330 __global__ void removePass2Adjust(Ncv32u *d_offsets, Ncv32u srcLen, Ncv32u *d_blockSums)
1331 {
1332     Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
1333     Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
1334     if (elemAddrIn >= srcLen)
1335     {
1336         return;
1337     }
1338 
1339     __shared__ Ncv32u valOffs;
1340     valOffs = d_blockSums[blockId];
1341     __syncthreads();
1342 
1343     d_offsets[elemAddrIn] += valOffs;
1344 }
1345 
1346 
1347 __global__ void removePass3Compact(Ncv32u *d_src, Ncv32u srcLen,
1348                                    Ncv32u *d_offsets, Ncv32u *d_dst,
1349                                    Ncv32u elemRemove, Ncv32u *dstLenValue)
1350 {
1351     Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
1352     Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
1353     if (elemAddrIn >= srcLen)
1354     {
1355         return;
1356     }
1357 
1358     Ncv32u elem = d_src[elemAddrIn];
1359     Ncv32u elemAddrOut = d_offsets[elemAddrIn];
1360     if (elem != elemRemove)
1361     {
1362         d_dst[elemAddrOut] = elem;
1363     }
1364 
1365     if (elemAddrIn == srcLen-1)
1366     {
1367         if (elem != elemRemove)
1368         {
1369             *dstLenValue = elemAddrOut + 1;
1370         }
1371         else
1372         {
1373             *dstLenValue = elemAddrOut;
1374         }
1375     }
1376 }
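
// Together the three kernels above implement stream compaction via a hierarchical scan:
//  * removePass1Scan turns each element into a keep flag (element != elemRemove), runs an
//    inclusive scan inside each block, stores the element's exclusive prefix as its tentative
//    output offset and (optionally) the block total; the same kernel is then re-run on the
//    arrays of block totals until a single block remains.
//  * removePass2Adjust walks back down the hierarchy, adding each block's scanned total to
//    the offsets of the next finer level, so every element receives its global output position.
//  * removePass3Compact scatters the kept elements to those positions and writes the final
//    element count to dstLenValue.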
1377 
1378 
1379 NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
1380                                    Ncv32u *d_dst, Ncv32u *dstLenPinned,
1381                                    Ncv32u elemRemove,
1382                                    INCVMemAllocator &gpuAllocator)
1383 {
1384     ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
1385     ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
1386 
1387     if (srcLen == 0)
1388     {
1389         if (dstLenPinned != NULL)
1390         {
1391             *dstLenPinned = 0;
1392         }
1393         return NPPST_SUCCESS;
1394     }
1395 
1396     std::vector<Ncv32u> partSumNums;
1397     std::vector<Ncv32u> partSumOffsets;
1398     Ncv32u partSumLastNum = srcLen;
1399     Ncv32u partSumLastOffs = 0;
1400     do
1401     {
1402         partSumNums.push_back(partSumLastNum);
1403         partSumOffsets.push_back(partSumLastOffs);
1404 
1405         Ncv32u curPartSumAlignedLength = alignUp(partSumLastNum * sizeof(Ncv32u),
1406                                                  gpuAllocator.alignment()) / sizeof(Ncv32u);
1407         partSumLastOffs += curPartSumAlignedLength;
1408 
1409         partSumLastNum = (partSumLastNum + NUM_REMOVE_THREADS - 1) / NUM_REMOVE_THREADS;
1410     }
1411     while (partSumLastNum>1);
1412     partSumNums.push_back(partSumLastNum);
1413     partSumOffsets.push_back(partSumLastOffs);
1414 
1415     NCVVectorAlloc<Ncv32u> d_hierSums(gpuAllocator, partSumLastOffs+1);
1416     ncvAssertReturn(gpuAllocator.isCounting() || d_hierSums.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
1417     NCVVectorAlloc<Ncv32u> d_numDstElements(gpuAllocator, 1);
1418     ncvAssertReturn(gpuAllocator.isCounting() || d_numDstElements.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
1419 
1420     NCV_SET_SKIP_COND(gpuAllocator.isCounting());
1421     NCV_SKIP_COND_BEGIN
1422 
1423     dim3 block(NUM_REMOVE_THREADS);
1424 
1425     //calculate zero-level partial sums used for output index calculation
1426     if (partSumNums.size() > 2)
1427     {
1428         dim3 grid(partSumNums[1]);
1429 
1430         if (grid.x > 65535)
1431         {
1432             grid.y = (grid.x + 65534) / 65535;
1433             grid.x = 65535;
1434         }
1435         removePass1Scan
1436             <true, true>
1437             <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1438             (d_src, srcLen,
1439              d_hierSums.ptr(),
1440              d_hierSums.ptr() + partSumOffsets[1],
1441              elemRemove);
1442 
1443         ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1444 
1445         //calculate hierarchical partial sums
1446         for (Ncv32u i=1; i<partSumNums.size()-1; i++)
1447         {
1448             dim3 grid_partial(partSumNums[i+1]);
1449             if (grid_partial.x > 65535)
1450             {
1451                 grid_partial.y = (grid_partial.x + 65534) / 65535;
1452                 grid_partial.x = 65535;
1453             }
1454             if (grid_partial.x != 1)
1455             {
1456                 removePass1Scan
1457                     <false, true>
1458                     <<<grid_partial, block, 0, nppStGetActiveCUDAstream()>>>
1459                     (d_hierSums.ptr() + partSumOffsets[i],
1460                      partSumNums[i], NULL,
1461                      d_hierSums.ptr() + partSumOffsets[i+1],
1462                      0);
1463             }
1464             else
1465             {
1466                 removePass1Scan
1467                     <false, false>
1468                     <<<grid_partial, block, 0, nppStGetActiveCUDAstream()>>>
1469                     (d_hierSums.ptr() + partSumOffsets[i],
1470                      partSumNums[i], NULL,
1471                      NULL,
1472                      0);
1473             }
1474 
1475             ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1476         }
1477 
1478         //adjust hierarchical partial sums
1479         for (Ncv32s i=(Ncv32s)partSumNums.size()-3; i>=0; i--)
1480         {
1481             dim3 grid_local(partSumNums[i+1]);
1482             if (grid_local.x > 65535)
1483             {
1484                 grid_local.y = (grid_local.x + 65534) / 65535;
1485                 grid_local.x = 65535;
1486             }
1487             removePass2Adjust
1488                 <<<grid_local, block, 0, nppStGetActiveCUDAstream()>>>
1489                 (d_hierSums.ptr() + partSumOffsets[i], partSumNums[i],
1490                  d_hierSums.ptr() + partSumOffsets[i+1]);
1491 
1492             ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1493         }
1494     }
1495     else
1496     {
1497         dim3 grid_local(partSumNums[1]);
1498         removePass1Scan
1499             <true, false>
1500             <<<grid_local, block, 0, nppStGetActiveCUDAstream()>>>
1501             (d_src, srcLen,
1502              d_hierSums.ptr(),
1503              NULL, elemRemove);
1504 
1505         ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1506     }
1507 
1508     //compact source vector using indices
1509     dim3 grid(partSumNums[1]);
1510     if (grid.x > 65535)
1511     {
1512         grid.y = (grid.x + 65534) / 65535;
1513         grid.x = 65535;
1514     }
1515     removePass3Compact
1516         <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1517         (d_src, srcLen, d_hierSums.ptr(), d_dst,
1518          elemRemove, d_numDstElements.ptr());
1519 
1520     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1521 
1522     //get number of dst elements
1523     if (dstLenPinned != NULL)
1524     {
1525         ncvAssertCUDAReturn(cudaMemcpyAsync(dstLenPinned, d_numDstElements.ptr(), sizeof(Ncv32u),
1526                                               cudaMemcpyDeviceToHost, nppStGetActiveCUDAstream()), NPPST_MEM_RESIDENCE_ERROR);
1527         ncvAssertCUDAReturn(cudaStreamSynchronize(nppStGetActiveCUDAstream()), NPPST_MEM_RESIDENCE_ERROR);
1528     }
1529 
1530     NCV_SKIP_COND_END
1531 
1532     return NPPST_SUCCESS;
1533 }
1534 
1535 
1536 NCVStatus nppsStCompactGetSize_32u(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
1537 {
1538     ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
1539 
1540     if (srcLen == 0)
1541     {
1542         *pBufsize = 0;
1543         return NPPST_SUCCESS;
1544     }
1545 
1546     NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
1547     ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
1548 
1549     NCVStatus ncvStat = compactVector_32u_device(NULL, srcLen, NULL, NULL, 0xC001C0DE,
1550                                                  gpuCounter);
1551     ncvAssertReturnNcvStat(ncvStat);
1552 
1553     *pBufsize = (Ncv32u)gpuCounter.maxSize();
1554     return NPPST_SUCCESS;
1555 }
1556 
1557 
1558 NCVStatus nppsStCompactGetSize_32s(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
1559 {
1560     return nppsStCompactGetSize_32u(srcLen, pBufsize, devProp);
1561 }
1562 
1563 
1564 NCVStatus nppsStCompactGetSize_32f(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
1565 {
1566     return nppsStCompactGetSize_32u(srcLen, pBufsize, devProp);
1567 }
1568 
1569 
1570 NCVStatus nppsStCompact_32u(Ncv32u *d_src, Ncv32u srcLen,
1571                             Ncv32u *d_dst, Ncv32u *p_dstLen,
1572                             Ncv32u elemRemove, Ncv8u *pBuffer,
1573                             Ncv32u bufSize, cudaDeviceProp &devProp)
1574 {
1575     NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
1576     ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
1577 
1578     NCVStatus ncvStat = compactVector_32u_device(d_src, srcLen, d_dst, p_dstLen, elemRemove,
1579                                                  gpuAllocator);
1580     ncvAssertReturnNcvStat(ncvStat);
1581 
1582     return NPPST_SUCCESS;
1583 }
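
// Usage sketch (illustrative only, not part of the library): a caller typically queries the
// scratch size first, allocates the scratch buffer plus a destination of up to srcLen elements,
// and then runs the compaction. d_src/d_dst are assumed to be device pointers prepared by the
// caller, and 0xFFFFFFFF is just an example sentinel value to remove; since the result length is
// copied back with cudaMemcpyAsync followed by a stream sync, a page-locked host pointer is
// preferable for p_dstLen.
//
//     cudaDeviceProp devProp;
//     cudaGetDeviceProperties(&devProp, 0);
//     Ncv32u bufSize = 0;
//     nppsStCompactGetSize_32u(srcLen, &bufSize, devProp);
//     Ncv8u *pBuffer = NULL;
//     cudaMalloc((void **)&pBuffer, bufSize);
//     Ncv32u h_dstLen = 0;
//     nppsStCompact_32u(d_src, srcLen, d_dst, &h_dstLen, 0xFFFFFFFF,
//                       pBuffer, bufSize, devProp);
//     cudaFree(pBuffer);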
1584 
1585 
1586 NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u srcLen,
1587                             Ncv32s *d_dst, Ncv32u *p_dstLen,
1588                             Ncv32s elemRemove, Ncv8u *pBuffer,
1589                             Ncv32u bufSize, cudaDeviceProp &devProp)
1590 {
1591     return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
1592                              *(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp);
1593 }
1594 
1595 
1596 #if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 4))
1597 typedef Ncv32u __attribute__((__may_alias__)) Ncv32u_a;
1598 #else
1599 typedef Ncv32u Ncv32u_a;
1600 #endif
1601 
1602 NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen,
1603                             Ncv32f *d_dst, Ncv32u *p_dstLen,
1604                             Ncv32f elemRemove, Ncv8u *pBuffer,
1605                             Ncv32u bufSize, cudaDeviceProp &devProp)
1606 {
1607     return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
1608                              *(Ncv32u_a *)&elemRemove, pBuffer, bufSize, devProp);
1609 }
1610 
1611 NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
1612                                  Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove)
1613 {
1614     ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
1615 
1616     if (srcLen == 0)
1617     {
1618         if (dstLen != NULL)
1619         {
1620             *dstLen = 0;
1621         }
1622         return NPPST_SUCCESS;
1623     }
1624 
1625     Ncv32u dstIndex = 0;
1626     for (Ncv32u srcIndex=0; srcIndex<srcLen; srcIndex++)
1627     {
1628         if (h_src[srcIndex] != elemRemove)
1629         {
1630             h_dst[dstIndex++] = h_src[srcIndex];
1631         }
1632     }
1633 
1634     if (dstLen != NULL)
1635     {
1636         *dstLen = dstIndex;
1637     }
1638 
1639     return NPPST_SUCCESS;
1640 }
1641 
1642 
1643 NCVStatus nppsStCompact_32s_host(Ncv32s *h_src, Ncv32u srcLen,
1644                                  Ncv32s *h_dst, Ncv32u *dstLen, Ncv32s elemRemove)
1645 {
1646     return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
1647 }
1648 
1649 
1650 NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
1651                                  Ncv32f *h_dst, Ncv32u *dstLen, Ncv32f elemRemove)
1652 {
1653     return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
1654 }
1655 
1656 //==============================================================================
1657 //
1658 // Filter.cu
1659 //
1660 //==============================================================================
1661 
1662 
1663 texture <float, 1, cudaReadModeElementType> texSrc;
1664 texture <float, 1, cudaReadModeElementType> texKernel;
1665 
1666 
1667 __forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
1668                                                    int i,
1669                                                    int w)
1670 {
1671     if (i < 0) i = 1 - i;
1672     if (i >= w) i = w + w - i - 1;
1673     return tex1Dfetch (texSrc, rowOffset + i);
1674 }
1675 
1676 
1677 __forceinline__ __device__ float getValueMirrorColumn(const int offset,
1678                                                       const int rowStep,
1679                                                       int j,
1680                                                       int h)
1681 {
1682     if (j < 0) j = 1 - j;
1683     if (j >= h) j = h + h - j - 1;
1684     return tex1Dfetch (texSrc, offset + j * rowStep);
1685 }
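
// Both helpers reflect out-of-range indices back into the valid range before the texture
// fetch: an index i < 0 becomes 1 - i and an index i >= w (resp. j >= h) becomes 2*w - i - 1,
// so the row/column filters below can read a full kernel footprint near the image border
// without stepping outside the source data.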
1686 
1687 
1688 __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
1689                                               Ncv32f *pDst,
1690                                               NcvSize32u dstSize,
1691                                               Ncv32u dstStep,
1692                                               NcvRect32u roi,
1693                                               Ncv32s nKernelSize,
1694                                               Ncv32s nAnchor,
1695                                               Ncv32f multiplier)
1696 {
1697     // position within ROI
1698     const int ix = blockDim.x * blockIdx.x + threadIdx.x;
1699     const int iy = blockDim.y * blockIdx.y + threadIdx.y;
1700 
1701     if (ix >= roi.width || iy >= roi.height)
1702     {
1703         return;
1704     }
1705 
1706     const int p = nKernelSize - nAnchor - 1;
1707 
1708     const int j = roi.y + iy;
1709 
1710     const int rowOffset = j * srcStep + roi.x;
1711 
1712     float sum = 0.0f;
1713     for (int m = 0; m < nKernelSize; ++m)
1714     {
1715         sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
1716             * tex1Dfetch (texKernel, m);
1717     }
1718 
1719     pDst[iy * dstStep + ix] = sum * multiplier;
1720 }
1721 
1722 
1723 __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
1724                                                  Ncv32f *pDst,
1725                                                  NcvSize32u dstSize,
1726                                                  Ncv32u dstStep,
1727                                                  NcvRect32u roi,
1728                                                  Ncv32s nKernelSize,
1729                                                  Ncv32s nAnchor,
1730                                                  Ncv32f multiplier)
1731 {
1732     const int ix = blockDim.x * blockIdx.x + threadIdx.x;
1733     const int iy = blockDim.y * blockIdx.y + threadIdx.y;
1734 
1735     if (ix >= roi.width || iy >= roi.height)
1736     {
1737         return;
1738     }
1739 
1740     const int p = nKernelSize - nAnchor - 1;
1741     const int i = roi.x + ix;
1742     const int offset = i + roi.y * srcStep;
1743 
1744     float sum = 0.0f;
1745     for (int m = 0; m < nKernelSize; ++m)
1746     {
1747         sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height)
1748             * tex1Dfetch (texKernel, m);
1749     }
1750 
1751     pDst[ix + iy * dstStep] = sum * multiplier;
1752 }
1753 
1754 
1755 NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
1756                                         NcvSize32u srcSize,
1757                                         Ncv32u nSrcStep,
1758                                         Ncv32f *pDst,
1759                                         NcvSize32u dstSize,
1760                                         Ncv32u nDstStep,
1761                                         NcvRect32u oROI,
1762                                         NppStBorderType borderType,
1763                                         const Ncv32f *pKernel,
1764                                         Ncv32s nKernelSize,
1765                                         Ncv32s nAnchor,
1766                                         Ncv32f multiplier)
1767 {
1768     ncvAssertReturn (pSrc != NULL &&
1769         pDst != NULL &&
1770         pKernel != NULL, NCV_NULL_PTR);
1771 
1772     ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI);
1773 
1774     ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
1775         dstSize.width * sizeof (Ncv32f) <= nDstStep &&
1776         oROI.width * sizeof (Ncv32f) <= nSrcStep &&
1777         oROI.width * sizeof (Ncv32f) <= nDstStep &&
1778         nSrcStep % sizeof (Ncv32f) == 0 &&
1779         nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP);
1780 
1781     Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
1782     Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
1783 
1784     // adjust ROI size to be within source image
1785     if (oROI.x + oROI.width > srcSize.width)
1786     {
1787         oROI.width = srcSize.width - oROI.x;
1788     }
1789 
1790     if (oROI.y + oROI.height > srcSize.height)
1791     {
1792         oROI.height = srcSize.height - oROI.y;
1793     }
1794 
1795     cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
1796     texSrc.normalized    = false;
1797     texKernel.normalized = false;
1798 
1799     cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
1800     cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
1801 
1802     dim3 ctaSize (32, 6);
1803     dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
1804         (oROI.height + ctaSize.y - 1) / ctaSize.y);
1805 
1806     switch (borderType)
1807     {
1808     case nppStBorderNone:
1809         return NPPST_ERROR;
1810     case nppStBorderClamp:
1811         return NPPST_ERROR;
1812     case nppStBorderWrap:
1813         return NPPST_ERROR;
1814     case nppStBorderMirror:
1815         FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
1816             (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
1817         ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1818         break;
1819     default:
1820         return NPPST_ERROR;
1821     }
1822 
1823     return NPPST_SUCCESS;
1824 }
1825 
1826 
1827 NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
1828                                            NcvSize32u srcSize,
1829                                            Ncv32u nSrcStep,
1830                                            Ncv32f *pDst,
1831                                            NcvSize32u dstSize,
1832                                            Ncv32u nDstStep,
1833                                            NcvRect32u oROI,
1834                                            NppStBorderType borderType,
1835                                            const Ncv32f *pKernel,
1836                                            Ncv32s nKernelSize,
1837                                            Ncv32s nAnchor,
1838                                            Ncv32f multiplier)
1839 {
1840     ncvAssertReturn (pSrc != NULL &&
1841         pDst != NULL &&
1842         pKernel != NULL, NCV_NULL_PTR);
1843 
1844     ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI);
1845 
1846     ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
1847         dstSize.width * sizeof (Ncv32f) <= nDstStep &&
1848         oROI.width * sizeof (Ncv32f) <= nSrcStep &&
1849         oROI.width * sizeof (Ncv32f) <= nDstStep &&
1850         nSrcStep % sizeof (Ncv32f) == 0 &&
1851         nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP);
1852 
1853     Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
1854     Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
1855 
1856     // adjust ROI size to be within source image
1857     if (oROI.x + oROI.width > srcSize.width)
1858     {
1859         oROI.width = srcSize.width - oROI.x;
1860     }
1861 
1862     if (oROI.y + oROI.height > srcSize.height)
1863     {
1864         oROI.height = srcSize.height - oROI.y;
1865     }
1866 
1867     cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
1868     texSrc.normalized    = false;
1869     texKernel.normalized = false;
1870 
1871     cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
1872     cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
1873 
1874     dim3 ctaSize (32, 6);
1875     dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
1876         (oROI.height + ctaSize.y - 1) / ctaSize.y);
1877 
1878     switch (borderType)
1879     {
1880     case nppStBorderClamp:
1881         return NPPST_ERROR;
1882     case nppStBorderWrap:
1883         return NPPST_ERROR;
1884     case nppStBorderMirror:
1885         FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
1886             (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
1887         ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1888         break;
1889     default:
1890         return NPPST_ERROR;
1891     }
1892 
1893     return NPPST_SUCCESS;
1894 }
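
// Usage sketch (illustrative only, not part of the library): a separable 2D convolution is
// obtained by running the row filter into a temporary image and then the column filter on that
// temporary. pSrc/pTmp/pDst are assumed to be device images of identical size and pitch, and
// pKernel a device array of nKernelSize taps (it is bound to a texture internally).
//
//     NcvRect32u roi;
//     roi.x = 0; roi.y = 0; roi.width = size.width; roi.height = size.height;
//     Ncv32s anchor = nKernelSize / 2;
//     nppiStFilterRowBorder_32f_C1R(pSrc, size, pitch, pTmp, size, pitch, roi,
//                                   nppStBorderMirror, pKernel, nKernelSize, anchor, 1.0f);
//     nppiStFilterColumnBorder_32f_C1R(pTmp, size, pitch, pDst, size, pitch, roi,
//                                      nppStBorderMirror, pKernel, nKernelSize, anchor, 1.0f);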
1895 
1896 
1897 //==============================================================================
1898 //
1899 // FrameInterpolate.cu
1900 //
1901 //==============================================================================
1902 
1903 
1904 inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
1905 {
1906     return (num + denom - 1)/denom;
1907 }
1908 
1909 
1910 texture<float, 2, cudaReadModeElementType> tex_src1;
1911 texture<float, 2, cudaReadModeElementType> tex_src0;
1912 
1913 
1914 __global__ void BlendFramesKernel(const float *u, const float *v,   // forward flow
1915                                   const float *ur, const float *vr, // backward flow
1916                                   const float *o0, const float *o1, // coverage masks
1917                                   int w, int h, int s,
1918                                   float theta, float *out)
1919 {
1920     const int ix = threadIdx.x + blockDim.x * blockIdx.x;
1921     const int iy = threadIdx.y + blockDim.y * blockIdx.y;
1922 
1923     const int pos = ix + s * iy;
1924 
1925     if (ix >= w || iy >= h) return;
1926 
1927     float _u = u[pos];
1928     float _v = v[pos];
1929 
1930     float _ur = ur[pos];
1931     float _vr = vr[pos];
1932 
1933     float x = (float)ix + 0.5f;
1934     float y = (float)iy + 0.5f;
1935     bool b0 = o0[pos] > 1e-4f;
1936     bool b1 = o1[pos] > 1e-4f;
1937 
1938     if (b0 && b1)
1939     {
1940         // pixel is visible on both frames
1941         out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) +
1942             tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta;
1943     }
1944     else if (b0)
1945     {
1946         // visible on the first frame only
1947         out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
1948     }
1949     else
1950     {
1951         // not visible on the first frame: sample frame 1 along the backward flow
1952         out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
1953     }
1954 }
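
// For a pixel visible in both frames the kernel blends the two motion-compensated samples:
//     out = (1 - theta) * I0(x - theta*u,     y - theta*v) +
//                theta  * I1(x + (1-theta)*u, y + (1-theta)*v)
// If the pixel is covered only in frame 0, the first term alone is used; otherwise it is
// taken from frame 1 along the backward flow (ur, vr).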
1955 
1956 
1957 NCVStatus BlendFrames(const Ncv32f *src0,
1958                       const Ncv32f *src1,
1959                       const Ncv32f *ufi,
1960                       const Ncv32f *vfi,
1961                       const Ncv32f *ubi,
1962                       const Ncv32f *vbi,
1963                       const Ncv32f *o1,
1964                       const Ncv32f *o2,
1965                       Ncv32u width,
1966                       Ncv32u height,
1967                       Ncv32u stride,
1968                       Ncv32f theta,
1969                       Ncv32f *out)
1970 {
1971     tex_src1.addressMode[0] = cudaAddressModeClamp;
1972     tex_src1.addressMode[1] = cudaAddressModeClamp;
1973     tex_src1.filterMode = cudaFilterModeLinear;
1974     tex_src1.normalized = false;
1975 
1976     tex_src0.addressMode[0] = cudaAddressModeClamp;
1977     tex_src0.addressMode[1] = cudaAddressModeClamp;
1978     tex_src0.filterMode = cudaFilterModeLinear;
1979     tex_src0.normalized = false;
1980 
1981     cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
1982     const Ncv32u pitch = stride * sizeof (float);
1983     ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
1984     ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
1985 
1986     dim3 threads (32, 4);
1987     dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
1988 
1989     BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
1990         (ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
1991 
1992     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1993 
1994     return NPPST_SUCCESS;
1995 }
1996 
1997 
1998 NCVStatus nppiStGetInterpolationBufferSize(NcvSize32u srcSize,
1999                                            Ncv32u nStep,
2000                                            Ncv32u *hpSize)
2001 {
2002     NCVStatus status = NPPST_ERROR;
2003     status = nppiStVectorWarpGetBufferSize(srcSize, nStep, hpSize);
2004     return status;
2005 }
2006 
2007 
2008 NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
2009 {
2010     // check state validity
2011     ncvAssertReturn (pState->pSrcFrame0 != 0 &&
2012         pState->pSrcFrame1 != 0 &&
2013         pState->pFU != 0 &&
2014         pState->pFV != 0 &&
2015         pState->pBU != 0 &&
2016         pState->pBV != 0 &&
2017         pState->pNewFrame != 0 &&
2018         pState->ppBuffers[0] != 0 &&
2019         pState->ppBuffers[1] != 0 &&
2020         pState->ppBuffers[2] != 0 &&
2021         pState->ppBuffers[3] != 0 &&
2022         pState->ppBuffers[4] != 0 &&
2023         pState->ppBuffers[5] != 0, NPPST_NULL_POINTER_ERROR);
2024 
2025     ncvAssertReturn (pState->size.width  > 0 &&
2026         pState->size.height > 0, NPPST_ERROR);
2027 
2028     ncvAssertReturn (pState->nStep >= pState->size.width * sizeof (Ncv32f) &&
2029         pState->nStep > 0 &&
2030         pState->nStep % sizeof (Ncv32f) == 0,
2031         NPPST_INVALID_STEP);
2032 
2033     // change notation
2034     Ncv32f *cov0 = pState->ppBuffers[0];
2035     Ncv32f *cov1 = pState->ppBuffers[1];
2036     Ncv32f *fwdU = pState->ppBuffers[2]; // forward u
2037     Ncv32f *fwdV = pState->ppBuffers[3]; // forward v
2038     Ncv32f *bwdU = pState->ppBuffers[4]; // backward u
2039     Ncv32f *bwdV = pState->ppBuffers[5]; // backward v
2040     // warp forward flow
2041     ncvAssertReturnNcvStat (
2042         nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFU,
2043         pState->size,
2044         pState->nStep,
2045         pState->pFU,
2046         pState->pFV,
2047         pState->nStep,
2048         cov0,
2049         pState->pos,
2050         fwdU) );
2051     ncvAssertReturnNcvStat (
2052         nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFV,
2053         pState->size,
2054         pState->nStep,
2055         pState->pFU,
2056         pState->pFV,
2057         pState->nStep,
2058         cov0,
2059         pState->pos,
2060         fwdV) );
2061     // warp backward flow
2062     ncvAssertReturnNcvStat (
2063         nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBU,
2064         pState->size,
2065         pState->nStep,
2066         pState->pBU,
2067         pState->pBV,
2068         pState->nStep,
2069         cov1,
2070         1.0f - pState->pos,
2071         bwdU) );
2072     ncvAssertReturnNcvStat (
2073         nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBV,
2074         pState->size,
2075         pState->nStep,
2076         pState->pBU,
2077         pState->pBV,
2078         pState->nStep,
2079         cov1,
2080         1.0f - pState->pos,
2081         bwdV) );
2082     // interpolate frame
2083     ncvAssertReturnNcvStat (
2084         BlendFrames (pState->pSrcFrame0,
2085         pState->pSrcFrame1,
2086         fwdU,
2087         fwdV,
2088         bwdU,
2089         bwdV,
2090         cov0,
2091         cov1,
2092         pState->size.width,
2093         pState->size.height,
2094         pState->nStep / sizeof (Ncv32f),
2095         pState->pos,
2096         pState->pNewFrame) );
2097 
2098     return NPPST_SUCCESS;
2099 }
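
// Usage sketch (illustrative only, not part of the library): the caller fills an
// NppStInterpolationState with device pointers and scratch buffers before the call. The field
// names are taken from the checks above; the exact struct layout and the convention of sizing
// each of the six ppBuffers scratch images with nppiStGetInterpolationBufferSize are assumptions
// of this sketch, not something this file enforces.
//
//     NppStInterpolationState state = {};
//     state.size       = frameSize;             // NcvSize32u
//     state.nStep      = framePitchBytes;       // common pitch of all images, in bytes
//     state.pos        = 0.5f;                  // interpolation position in [0, 1]
//     state.pSrcFrame0 = d_frame0;  state.pSrcFrame1 = d_frame1;
//     state.pFU = d_fwdU;  state.pFV = d_fwdV;  // forward flow  (frame0 -> frame1)
//     state.pBU = d_bwdU;  state.pBV = d_bwdV;  // backward flow (frame1 -> frame0)
//     state.pNewFrame  = d_result;
//     Ncv32u scratchBytes = 0;
//     nppiStGetInterpolationBufferSize(state.size, state.nStep, &scratchBytes);
//     for (int i = 0; i < 6; ++i)
//         cudaMalloc((void **)&state.ppBuffers[i], scratchBytes);
//     nppiStInterpolateFrames(&state);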
2100 
2101 
2102 //==============================================================================
2103 //
2104 // VectorWarpFrame.cu
2105 //
2106 //==============================================================================
2107 
2108 
2109 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
2110 
2111 // FP32 atomic add
2112 static __forceinline__ __device__ float _atomicAdd(float *addr, float val)
2113 {
2114     float old = *addr, assumed;
2115 
2116     do {
2117         assumed = old;
2118         old = int_as_float(__iAtomicCAS((int*)addr,
2119               float_as_int(assumed),
2120               float_as_int(val+assumed)));
2121     } while( assumed!=old );
2122 
2123     return old;
2124 }
2125 #else
2126 #define _atomicAdd atomicAdd
2127 #endif
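
// Note: devices of compute capability < 2.0 have no hardware atomicAdd for 32-bit floats, so
// _atomicAdd above emulates it with a compare-and-swap loop on the value's integer
// representation; on newer devices the native atomicAdd intrinsic is used directly.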
2128 
2129 
2130 __global__ void ForwardWarpKernel_PSF2x2(const float *u,
2131                                          const float *v,
2132                                          const float *src,
2133                                          const int w,
2134                                          const int h,
2135                                          const int flow_stride,
2136                                          const int image_stride,
2137                                          const float time_scale,
2138                                          float *normalization_factor,
2139                                          float *dst)
2140 {
2141     int j = threadIdx.x + blockDim.x * blockIdx.x;
2142     int i = threadIdx.y + blockDim.y * blockIdx.y;
2143 
2144     if (i >= h || j >= w) return;
2145 
2146     int flow_row_offset  = i * flow_stride;
2147     int image_row_offset = i * image_stride;
2148 
2149     //bottom left corner of a target pixel
2150     float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
2151     float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
2152     // pixel containing bottom left corner
2153     float px;
2154     float py;
2155     float dx = modff (cx, &px);
2156     float dy = modff (cy, &py);
2157     // target pixel integer coords
2158     int tx;
2159     int ty;
2160     tx = (int) px;
2161     ty = (int) py;
2162     float value = src[image_row_offset + j];
2163     float weight;
2164     // fill pixel containing bottom right corner
2165     if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2166     {
2167         weight = dx * dy;
2168         _atomicAdd (dst + ty * image_stride + tx, value * weight);
2169         _atomicAdd (normalization_factor + ty * image_stride + tx, weight);
2170     }
2171 
2172     // fill pixel containing bottom left corner
2173     tx -= 1;
2174     if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2175     {
2176         weight = (1.0f - dx) * dy;
2177         _atomicAdd (dst + ty * image_stride + tx, value * weight);
2178         _atomicAdd (normalization_factor + ty * image_stride + tx, weight);
2179     }
2180 
2181     // fill pixel containing upper left corner
2182     ty -= 1;
2183     if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2184     {
2185         weight = (1.0f - dx) * (1.0f - dy);
2186         _atomicAdd (dst + ty * image_stride + tx, value * weight);
2187         _atomicAdd (normalization_factor + ty * image_stride + tx, weight);
2188     }
2189 
2190     // fill pixel containing upper right corner
2191     tx += 1;
2192     if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2193     {
2194         weight = dx * (1.0f - dy);
2195         _atomicAdd (dst + ty * image_stride + tx, value * weight);
2196         _atomicAdd (normalization_factor + ty * image_stride + tx, weight);
2197     }
2198 }
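
// ForwardWarpKernel_PSF2x2 scatters ("splats") each source pixel along its scaled flow vector
// and distributes its value bilinearly over the four destination pixels around the landing
// position, using the weights dx*dy, (1-dx)*dy, (1-dx)*(1-dy) and dx*(1-dy). The same weights
// are accumulated into normalization_factor so that NormalizeKernel can later divide every
// destination pixel by the total weight it received.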
2199 
2200 
2201 __global__ void ForwardWarpKernel_PSF1x1(const float *u,
2202                                          const float *v,
2203                                          const float *src,
2204                                          const int w,
2205                                          const int h,
2206                                          const int flow_stride,
2207                                          const int image_stride,
2208                                          const float time_scale,
2209                                          float *dst)
2210 {
2211     int j = threadIdx.x + blockDim.x * blockIdx.x;
2212     int i = threadIdx.y + blockDim.y * blockIdx.y;
2213 
2214     if (i >= h || j >= w) return;
2215 
2216     int flow_row_offset = i * flow_stride;
2217     int image_row_offset = i * image_stride;
2218 
2219     float u_ = u[flow_row_offset + j];
2220     float v_ = v[flow_row_offset + j];
2221 
2222     //bottom left corner of target pixel
2223     float cx = u_ * time_scale + (float)j + 1.0f;
2224     float cy = v_ * time_scale + (float)i + 1.0f;
2225     // pixel containing bottom left corner
2226     int tx = __float2int_rn (cx);
2227     int ty = __float2int_rn (cy);
2228 
2229     float value = src[image_row_offset + j];
2230     // fill pixel
2231     if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2232     {
2233         _atomicAdd (dst + ty * image_stride + tx, value);
2234     }
2235 }
2236 
2237 
2238 __global__ void NormalizeKernel(const float *normalization_factor, int w, int h, int s, float *image)
2239 {
2240     int i = threadIdx.y + blockDim.y * blockIdx.y;
2241     int j = threadIdx.x + blockDim.x * blockIdx.x;
2242 
2243     if (i >= h || j >= w) return;
2244 
2245     const int pos = i * s + j;
2246 
2247     float scale = normalization_factor[pos];
2248 
2249     float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
2250 
2251     image[pos] *= invScale;
2252 }
2253 
2254 
2255 __global__ void MemsetKernel(const float value, int w, int h, float *image)
2256 {
2257     int i = threadIdx.y + blockDim.y * blockIdx.y;
2258     int j = threadIdx.x + blockDim.x * blockIdx.x;
2259 
2260     if (i >= h || j >= w) return;
2261 
2262     const int pos = i * w + j;
2263 
2264     image[pos] = value;
2265 }
2266 
2267 
2268 NCVStatus nppiStVectorWarpGetBufferSize (NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32u *hpSize)
2269 {
2270     ncvAssertReturn (hpSize != NULL, NPPST_NULL_POINTER_ERROR);
2271     ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep,
2272         NPPST_INVALID_STEP);
2273 
2274     *hpSize = nSrcStep * srcSize.height;
2275 
2276     return NPPST_SUCCESS;
2277 }
2278 
2279 
2280 // does not require normalization
2281 NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
2282                                          NcvSize32u srcSize,
2283                                          Ncv32u nSrcStep,
2284                                          const Ncv32f *pU,
2285                                          const Ncv32f *pV,
2286                                          Ncv32u nVFStep,
2287                                          Ncv32f timeScale,
2288                                          Ncv32f *pDst)
2289 {
2290     ncvAssertReturn (pSrc != NULL &&
2291         pU   != NULL &&
2292         pV   != NULL &&
2293         pDst != NULL, NPPST_NULL_POINTER_ERROR);
2294 
2295     ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
2296         srcSize.width * sizeof (Ncv32f) <= nVFStep,
2297         NPPST_INVALID_STEP);
2298 
2299     Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
2300     Ncv32u vfStep  = nVFStep / sizeof (Ncv32f);
2301 
2302     dim3 ctaSize (32, 6);
2303     dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
2304 
2305     ForwardWarpKernel_PSF1x1 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
2306         (pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pDst);
2307 
2308     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2309 
2310     return NPPST_SUCCESS;
2311 }
2312 
2313 
2314 NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
2315                                          NcvSize32u srcSize,
2316                                          Ncv32u nSrcStep,
2317                                          const Ncv32f *pU,
2318                                          const Ncv32f *pV,
2319                                          Ncv32u nVFStep,
2320                                          Ncv32f *pBuffer,
2321                                          Ncv32f timeScale,
2322                                          Ncv32f *pDst)
2323 {
2324     ncvAssertReturn (pSrc != NULL &&
2325         pU   != NULL &&
2326         pV   != NULL &&
2327         pDst != NULL &&
2328         pBuffer != NULL, NPPST_NULL_POINTER_ERROR);
2329 
2330     ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
2331         srcSize.width * sizeof (Ncv32f) <= nVFStep, NPPST_INVALID_STEP);
2332 
2333     Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
2334     Ncv32u vfStep = nVFStep / sizeof(Ncv32f);
2335 
2336     dim3 ctaSize(32, 6);
2337     dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
2338 
2339     MemsetKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
2340         (0, srcSize.width, srcSize.height, pBuffer);
2341 
2342     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2343 
2344     ForwardWarpKernel_PSF2x2 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
2345         (pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pBuffer, pDst);
2346 
2347     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2348 
2349     NormalizeKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
2350         (pBuffer, srcSize.width, srcSize.height, srcStep, pDst);
2351 
2352     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2353 
2354     return NPPST_SUCCESS;
2355 }
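
// Usage sketch (illustrative only, not part of the library): d_src, d_u, d_v and d_dst are
// assumed to be device images/flow fields sharing the pitch nSrcStep, and a timeScale of 1.0f
// warps by the full flow vector. The PSF2x2 variant needs the extra normalization buffer; the
// PSF1x1 variant does not.
//
//     Ncv32u bufBytes = 0;
//     nppiStVectorWarpGetBufferSize(srcSize, nSrcStep, &bufBytes);
//     Ncv32f *d_norm = NULL;
//     cudaMalloc((void **)&d_norm, bufBytes);
//     nppiStVectorWarp_PSF2x2_32f_C1(d_src, srcSize, nSrcStep,
//                                    d_u, d_v, nSrcStep,
//                                    d_norm, 1.0f, d_dst);
//     cudaFree(d_norm);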
2356 
2357 
2358 //==============================================================================
2359 //
2360 // Resize.cu
2361 //
2362 //==============================================================================
2363 
2364 
2365 texture <float, 2, cudaReadModeElementType> texSrc2D;
2366 
2367 
2368 __forceinline__
2369 __device__ float processLine(int spos,
2370                              float xmin,
2371                              float xmax,
2372                              int ixmin,
2373                              int ixmax,
2374                              float fxmin,
2375                              float cxmax)
2376 {
2377     // first element
2378     float wsum = 1.0f - xmin + fxmin;
2379     float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin);
2380     spos++;
2381     for (int ix = ixmin + 1; ix < ixmax; ++ix)
2382     {
2383         sum += tex1Dfetch(texSrc, spos);
2384         spos++;
2385         wsum += 1.0f;
2386     }
2387     sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax);
2388     wsum += cxmax - xmax;
2389     return sum / wsum;
2390 }
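
// processLine integrates the already-bound texSrc row over the horizontal span [xmin, xmax],
// treating every texel as a box of width 1: the first and last texels contribute only the
// fraction of the span they cover, interior texels contribute fully, and the accumulated sum
// is divided by the total covered width to give an area average.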
2391 
2392 
2393 __global__ void resizeSuperSample_32f(NcvSize32u srcSize,
2394                                       Ncv32u srcStep,
2395                                       NcvRect32u srcROI,
2396                                       Ncv32f *dst,
2397                                       NcvSize32u dstSize,
2398                                       Ncv32u dstStep,
2399                                       NcvRect32u dstROI,
2400                                       Ncv32f scaleX,
2401                                       Ncv32f scaleY)
2402 {
2403     // position within dst ROI
2404     const int ix = blockIdx.x * blockDim.x + threadIdx.x;
2405     const int iy = blockIdx.y * blockDim.y + threadIdx.y;
2406 
2407     if (ix >= dstROI.width || iy >= dstROI.height)
2408     {
2409         return;
2410     }
2411 
2412     float rw = (float) srcROI.width;
2413     float rh = (float) srcROI.height;
2414 
2415     // source position
2416     float x = scaleX * (float) ix;
2417     float y = scaleY * (float) iy;
2418 
2419     // x sampling range
2420     float xBegin = fmax (x - scaleX, 0.0f);
2421     float xEnd   = fmin (x + scaleX, rw - 1.0f);
2422     // y sampling range
2423     float yBegin = fmax (y - scaleY, 0.0f);
2424     float yEnd   = fmin (y + scaleY, rh - 1.0f);
2425     // x range of source samples
2426     float floorXBegin = floorf (xBegin);
2427     float ceilXEnd    = ceilf (xEnd);
2428     int iXBegin = srcROI.x + (int) floorXBegin;
2429     int iXEnd   = srcROI.x + (int) ceilXEnd;
2430     // y range of source samples
2431     float floorYBegin = floorf (yBegin);
2432     float ceilYEnd    = ceilf (yEnd);
2433     int iYBegin = srcROI.y + (int) floorYBegin;
2434     int iYEnd   = srcROI.y + (int) ceilYEnd;
2435 
2436     // first row
2437     int pos = iYBegin * srcStep + iXBegin;
2438 
2439     float wsum = 1.0f - yBegin + floorYBegin;
2440 
2441     float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
2442         ceilXEnd) * (1.0f - yBegin + floorYBegin);
2443     pos += srcStep;
2444     for (int sy = iYBegin + 1; sy < iYEnd; ++sy)
2445     {
2446         sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
2447             ceilXEnd);
2448         pos += srcStep;
2449         wsum += 1.0f;
2450     }
2451 
2452     sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
2453         ceilXEnd) * (ceilYEnd - yEnd);
2454     wsum += ceilYEnd - yEnd;
2455     sum /= wsum;
2456 
2457     dst[(ix + dstROI.x) + (iy + dstROI.y) * dstStep] = sum;
2458 }
2459 
2460 
2461 // bicubic interpolation
2462 __forceinline__
2463 __device__ float bicubicCoeff(float x_)
2464 {
2465     float x = fabsf(x_);
2466     if (x <= 1.0f)
2467     {
2468         return x * x * (1.5f * x - 2.5f) + 1.0f;
2469     }
2470     else if (x < 2.0f)
2471     {
2472         return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
2473     }
2474     else
2475     {
2476         return 0.0f;
2477     }
2478 }
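
// bicubicCoeff evaluates the Catmull-Rom cubic (Keys kernel with a = -0.5):
//     w(x) =  1.5*|x|^3 - 2.5*|x|^2 + 1            for |x| <= 1
//     w(x) = -0.5*|x|^3 + 2.5*|x|^2 - 4*|x| + 2    for 1 < |x| < 2
//     w(x) =  0                                    otherwise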
2479 
2480 
2481 __global__ void resizeBicubic(NcvSize32u srcSize,
2482                               NcvRect32u srcROI,
2483                               NcvSize32u dstSize,
2484                               Ncv32u dstStep,
2485                               Ncv32f *dst,
2486                               NcvRect32u dstROI,
2487                               Ncv32f scaleX,
2488                               Ncv32f scaleY)
2489 {
2490     const int ix = blockIdx.x * blockDim.x + threadIdx.x;
2491     const int iy = blockIdx.y * blockDim.y + threadIdx.y;
2492 
2493     if (ix >= dstROI.width || iy >= dstROI.height)
2494     {
2495         return;
2496     }
2497 
2498     const float dx = 1.0f / srcROI.width;
2499     const float dy = 1.0f / srcROI.height;
2500 
2501     float rx = (float) srcROI.x;
2502     float ry = (float) srcROI.y;
2503 
2504     float rw = (float) srcROI.width;
2505     float rh = (float) srcROI.height;
2506 
2507     float x = scaleX * (float) ix;
2508     float y = scaleY * (float) iy;
2509 
2510     // sampling range
2511     // border mode is clamp
2512     float xmin = fmax (ceilf (x - 2.0f), 0.0f);
2513     float xmax = fmin (floorf (x + 2.0f), rw - 1.0f);
2514 
2515     float ymin = fmax (ceilf (y - 2.0f), 0.0f);
2516     float ymax = fmin (floorf (y + 2.0f), rh - 1.0f);
2517 
2518     // shift data window to match ROI
2519     rx += 0.5f;
2520     ry += 0.5f;
2521 
2522     x += rx;
2523     y += ry;
2524 
2525     xmin += rx;
2526     xmax += rx;
2527     ymin += ry;
2528     ymax += ry;
2529 
2530     float sum  = 0.0f;
2531     float wsum = 0.0f;
2532 
2533     for (float cy = ymin; cy <= ymax; cy += 1.0f)
2534     {
2535         for (float cx = xmin; cx <= xmax; cx += 1.0f)
2536         {
2537             float xDist = x - cx;
2538             float yDist = y - cy;
2539             float wx = bicubicCoeff (xDist);
2540             float wy = bicubicCoeff (yDist);
2541             wx *= wy;
2542             sum += wx * tex2D (texSrc2D, cx * dx, cy * dy);
2543             wsum += wx;
2544         }
2545     }
2546     dst[(ix + dstROI.x)+ (iy + dstROI.y) * dstStep] = (!wsum)? 0 : sum / wsum;
2547 }
2548 
2549 
2550 NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
2551                                NcvSize32u srcSize,
2552                                Ncv32u nSrcStep,
2553                                NcvRect32u srcROI,
2554                                Ncv32f *pDst,
2555                                NcvSize32u dstSize,
2556                                Ncv32u nDstStep,
2557                                NcvRect32u dstROI,
2558                                Ncv32f xFactor,
2559                                Ncv32f yFactor,
2560                                NppStInterpMode interpolation)
2561 {
2562     NCVStatus status = NPPST_SUCCESS;
2563 
2564     ncvAssertReturn (pSrc != NULL && pDst != NULL, NPPST_NULL_POINTER_ERROR);
2565     ncvAssertReturn (xFactor != 0.0 && yFactor != 0.0, NPPST_INVALID_SCALE);
2566 
2567     ncvAssertReturn (nSrcStep >= sizeof (Ncv32f) * (Ncv32u) srcSize.width &&
2568         nDstStep >= sizeof (Ncv32f) * (Ncv32u) dstSize.width,
2569         NPPST_INVALID_STEP);
2570 
2571     Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
2572     Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
2573 
2574     // TODO: preprocess ROI to prevent out of bounds access
2575 
2576     if (interpolation == nppStSupersample)
2577     {
2578         // bind texture
2579         cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
2580         // invoke kernel
2581         dim3 ctaSize (32, 6);
2582         dim3 gridSize ((dstROI.width  + ctaSize.x - 1) / ctaSize.x,
2583             (dstROI.height + ctaSize.y - 1) / ctaSize.y);
2584 
2585         resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
2586             (srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
2587     }
2588     else if (interpolation == nppStBicubic)
2589     {
2590         texSrc2D.addressMode[0] = cudaAddressModeMirror;
2591         texSrc2D.addressMode[1] = cudaAddressModeMirror;
2592         texSrc2D.normalized = true;
2593 
2594         cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
2595 
2596         cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
2597             nSrcStep);
2598 
2599         dim3 ctaSize (32, 6);
2600         dim3 gridSize ((dstSize.width  + ctaSize.x - 1) / ctaSize.x,
2601             (dstSize.height + ctaSize.y - 1) / ctaSize.y);
2602 
2603         resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
2604             (srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
2605     }
2606     else
2607     {
2608         status = NPPST_ERROR;
2609     }
2610 
2611     ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2612 
2613     return status;
2614 }
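
// Usage sketch (illustrative only, not part of the library): xFactor/yFactor are the
// destination-to-source scale factors, so values below 1 shrink the image. Supersampling is the
// mode intended for downscaling and bicubic for upscaling; d_src/d_dst are assumed to be device
// images prepared by the caller.
//
//     NcvRect32u srcROI;  srcROI.x = 0;  srcROI.y = 0;
//     srcROI.width = srcSize.width;      srcROI.height = srcSize.height;
//     NcvRect32u dstROI;  dstROI.x = 0;  dstROI.y = 0;
//     dstROI.width = dstSize.width;      dstROI.height = dstSize.height;
//     nppiStResize_32f_C1R(d_src, srcSize, srcPitchBytes, srcROI,
//                          d_dst, dstSize, dstPitchBytes, dstROI,
//                          (Ncv32f)dstSize.width  / srcSize.width,
//                          (Ncv32f)dstSize.height / srcSize.height,
//                          nppStSupersample);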
2615 
2616 #endif /* CUDA_DISABLER */
2617