1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #if !defined CUDA_DISABLER
44
45 #include <vector>
46 #include <cuda_runtime.h>
47
48 #include "opencv2/core/cuda/warp.hpp"
49 #include "opencv2/core/cuda/warp_shuffle.hpp"
50
51 #include "opencv2/cudalegacy/NPP_staging.hpp"
52
53
54 texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
55 texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
56 texture<uint2, 1, cudaReadModeElementType> tex64u;
57
58
59 //==============================================================================
60 //
61 // CUDA streams handling
62 //
63 //==============================================================================
64
65
66 static cudaStream_t nppStream = 0;
67
68
69 cudaStream_t nppStGetActiveCUDAstream(void)
70 {
71 return nppStream;
72 }
73
74
75
76 cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
77 {
78 cudaStream_t tmp = nppStream;
79 nppStream = cudaStream;
80 return tmp;
81 }
82
83
84 //==============================================================================
85 //
86 // BlockScan.cuh
87 //
88 //==============================================================================
89
90
91 NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
92
93
94 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
95 //assuming size <= WARP_SIZE and size is power of 2
96 template <class T>
97 inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
98 {
99 #if __CUDA_ARCH__ >= 300
100 const unsigned int laneId = cv::cuda::device::Warp::laneId();
101
102 // scan using warp shuffle functions
103 #pragma unroll
104 for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
105 {
106 const T n = cv::cuda::device::shfl_up(idata, i);
107 if (laneId >= i)
108 idata += n;
109 }
110
111 return idata;
112 #else
113 Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
114 s_Data[pos] = 0;
115 pos += K_WARP_SIZE;
116 s_Data[pos] = idata;
117
118 s_Data[pos] += s_Data[pos - 1];
119 s_Data[pos] += s_Data[pos - 2];
120 s_Data[pos] += s_Data[pos - 4];
121 s_Data[pos] += s_Data[pos - 8];
122 s_Data[pos] += s_Data[pos - 16];
123
124 return s_Data[pos];
125 #endif
126 }
127 inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data)
128 {
129 Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
130 s_Data[pos] = 0;
131 pos += K_WARP_SIZE;
132 s_Data[pos] = idata;
133
134 s_Data[pos] += s_Data[pos - 1];
135 s_Data[pos] += s_Data[pos - 2];
136 s_Data[pos] += s_Data[pos - 4];
137 s_Data[pos] += s_Data[pos - 8];
138 s_Data[pos] += s_Data[pos - 16];
139
140 return s_Data[pos];
141 }
142
143
144 template <class T>
145 inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
146 {
147 return warpScanInclusive(idata, s_Data) - idata;
148 }
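//Illustrative values (not compiled): for lane inputs {3, 1, 4, 1} the inclusive warp
//scan yields {3, 4, 8, 9} (each lane holds the sum of its own and all preceding lanes),
//and the exclusive scan yields {0, 3, 4, 8}, i.e. the inclusive result minus the lane's
//own input, which is exactly the subtraction performed by warpScanExclusive above.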
149
150
151 template <class T, Ncv32u tiNumScanThreads>
152 inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
153 {
154 if (tiNumScanThreads > K_WARP_SIZE)
155 {
156 //Bottom-level inclusive warp scan
157 T warpResult = warpScanInclusive(idata, s_Data);
158
159 //Save top elements of each warp for exclusive warp scan
160 //sync to wait for warp scans to complete (because s_Data is being overwritten)
161 __syncthreads();
162 if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
163 {
164 s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
165 }
166
167 //wait for warp scans to complete
168 __syncthreads();
169
170 if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
171 {
172 //grab top warp elements
173 T val = s_Data[threadIdx.x];
174 //calculate exclusive scan and write back to shared memory
175 s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
176 }
177
178 //return updated warp scans with exclusive scan results
179 __syncthreads();
180 return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
181 }
182 else
183 {
184 return warpScanInclusive(idata, s_Data);
185 }
186 }
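//Shared-memory sizing note: the pre-Kepler warp-scan path indexes s_Data up to
//2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)) + K_WARP_SIZE, so callers in this
//file reserve two elements per scan thread, e.g.:
//
//    __shared__ Ncv32u shmem[NUM_SCAN_THREADS * 2];
//    Ncv32u scanned = blockScanInclusive<Ncv32u, NUM_SCAN_THREADS>(val, shmem);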
187
188
189 //==============================================================================
190 //
191 // IntegralImage.cu
192 //
193 //==============================================================================
194
195
196 const Ncv32u NUM_SCAN_THREADS = 256;
197 const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
198
199
200 template<class T_in, class T_out>
201 struct _scanElemOp
202 {
203 template<bool tbDoSqr>
204 static inline __host__ __device__ T_out scanElemOp(T_in elem)
205 {
206 return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
207 }
208
209 private:
210
211 template <int v> struct Int2Type { enum { value = v }; };
212
213 static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>)
214 {
215 return (T_out)elem;
216 }
217
218 static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>)
219 {
220 return (T_out)(elem*elem);
221 }
222 };
223
224
225 template<class T>
226 inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
227
228
229 template<>
230 inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
231 {
232 return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
233 }
234
235
236 template<>
237 inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
238 {
239 return d_src[curElemOffs];
240 }
241
242
243 template<>
244 inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
245 {
246 return d_src[curElemOffs];
247 }
248
249
250 /**
251 * \brief Segmented scan kernel
252 *
253 * Calculates per-row prefix scans of the input image.
254 * Out-of-bounds safe: reads 'size' elements, writes 'size+1' elements
255 *
256 * \tparam T_in Type of input image elements
257 * \tparam T_out Type of output image elements
258 * \tparam tbDoSqr If true, each input pixel is squared before it is accumulated
259 *
260 * \param d_src [IN] Source image pointer
261 * \param srcWidth [IN] Source image width
262 * \param srcStride [IN] Source image stride
263 * \param d_II [OUT] Output image pointer
264 * \param IIstride [IN] Output image stride
265 *
266 * \return None
267 */
268 template <class T_in, class T_out, bool tbDoSqr>
269 __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
270 T_out *d_II, Ncv32u IIstride)
271 {
272 //advance pointers to the current line
273 if (sizeof(T_in) != 1)
274 {
275 d_src += srcStride * blockIdx.x;
276 }
277 //for an initial 8-bit source image the data is read through the texture reference tex8u instead
278 d_II += IIstride * blockIdx.x;
279
280 Ncv32u numBuckets = (srcWidth + NUM_SCAN_THREADS - 1) >> LOG2_NUM_SCAN_THREADS;
281 Ncv32u offsetX = 0;
282
283 __shared__ T_out shmem[NUM_SCAN_THREADS * 2];
284 __shared__ T_out carryElem;
285 carryElem = 0;
286 __syncthreads();
287
288 while (numBuckets--)
289 {
290 Ncv32u curElemOffs = offsetX + threadIdx.x;
291 T_out curScanElem;
292
293 T_in curElem;
294 T_out curElemMod;
295
296 if (curElemOffs < srcWidth)
297 {
298 //load elements
299 curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
300 }
301 curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
302
303 //inclusive scan
304 curScanElem = blockScanInclusive<T_out, NUM_SCAN_THREADS>(curElemMod, shmem);
305
306 if (curElemOffs <= srcWidth)
307 {
308 //make scan exclusive and write the bucket to the output buffer
309 d_II[curElemOffs] = carryElem + curScanElem - curElemMod;
310 offsetX += NUM_SCAN_THREADS;
311 }
312
313 //remember last element for subsequent buckets adjustment
314 __syncthreads();
315 if (threadIdx.x == NUM_SCAN_THREADS-1)
316 {
317 carryElem += curScanElem;
318 }
319 __syncthreads();
320 }
321
322 if (offsetX == srcWidth && !threadIdx.x)
323 {
324 d_II[offsetX] = carryElem;
325 }
326 }
327
328
329 template <bool tbDoSqr, class T_in, class T_out>
330 NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
331 T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
332 {
333 cudaChannelFormatDesc cfdTex;
334 size_t alignmentOffset = 0;
335 if (sizeof(T_in) == 1)
336 {
337 cfdTex = cudaCreateChannelDesc<Ncv8u>();
338 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
339 if (alignmentOffset > 0)
340 {
341 ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
342 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
343 }
344 }
345 scanRows
346 <T_in, T_out, tbDoSqr>
347 <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
348 (d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
349
350 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
351
352 return NPPST_SUCCESS;
353 }
354
355
356 static Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
357 {
358 Ncv32u alignMask = allocatorAlignment-1;
359 Ncv32u inverseAlignMask = ~alignMask;
360 Ncv32u dimBytes = dim * elemTypeSize;
361 Ncv32u pitch = (dimBytes + alignMask) & inverseAlignMask;
362 Ncv32u PaddedDim = pitch / elemTypeSize;
363 return PaddedDim;
364 }
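//Worked example: dim = 641 Ncv32u elements, elemTypeSize = 4, allocatorAlignment = 128
//gives dimBytes = 2564, pitch = (2564 + 127) & ~127 = 2688 bytes, PaddedDim = 2688 / 4 = 672.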
365
366
367 template <class T_in, class T_out>
368 NCVStatus ncvIntegralImage_device(T_in *d_src, Ncv32u srcStep,
369 T_out *d_dst, Ncv32u dstStep, NcvSize32u roi,
370 INCVMemAllocator &gpuAllocator)
371 {
372 ncvAssertReturn(sizeof(T_out) == sizeof(Ncv32u), NPPST_MEM_INTERNAL_ERROR);
373 ncvAssertReturn(gpuAllocator.memType() == NCVMemoryTypeDevice ||
374 gpuAllocator.memType() == NCVMemoryTypeNone, NPPST_MEM_RESIDENCE_ERROR);
375 ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
376 ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
377 ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
378 ncvAssertReturn(srcStep >= roi.width * sizeof(T_in) &&
379 dstStep >= (roi.width + 1) * sizeof(T_out) &&
380 srcStep % sizeof(T_in) == 0 &&
381 dstStep % sizeof(T_out) == 0, NPPST_INVALID_STEP);
382 srcStep /= sizeof(T_in);
383 dstStep /= sizeof(T_out);
384
385 Ncv32u WidthII = roi.width + 1;
386 Ncv32u HeightII = roi.height + 1;
387 Ncv32u PaddedWidthII32 = getPaddedDimension(WidthII, sizeof(Ncv32u), gpuAllocator.alignment());
388 Ncv32u PaddedHeightII32 = getPaddedDimension(HeightII, sizeof(Ncv32u), gpuAllocator.alignment());
389
390 NCVMatrixAlloc<T_out> Tmp32_1(gpuAllocator, PaddedWidthII32, PaddedHeightII32);
391 ncvAssertReturn(gpuAllocator.isCounting() || Tmp32_1.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
392 NCVMatrixAlloc<T_out> Tmp32_2(gpuAllocator, PaddedHeightII32, PaddedWidthII32);
393 ncvAssertReturn(gpuAllocator.isCounting() || Tmp32_2.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
394 ncvAssertReturn(Tmp32_1.pitch() * Tmp32_1.height() == Tmp32_2.pitch() * Tmp32_2.height(), NPPST_MEM_INTERNAL_ERROR);
395
396 NCVStatus ncvStat;
397 NCV_SET_SKIP_COND(gpuAllocator.isCounting());
398
399 NCV_SKIP_COND_BEGIN
400
401 ncvStat = scanRowsWrapperDevice
402 <false>
403 (d_src, srcStep, Tmp32_1.ptr(), PaddedWidthII32, roi);
404 ncvAssertReturnNcvStat(ncvStat);
405
406 ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)Tmp32_1.ptr(), PaddedWidthII32*sizeof(Ncv32u),
407 (Ncv32u *)Tmp32_2.ptr(), PaddedHeightII32*sizeof(Ncv32u), NcvSize32u(WidthII, roi.height));
408 ncvAssertReturnNcvStat(ncvStat);
409
410 ncvStat = scanRowsWrapperDevice
411 <false>
412 (Tmp32_2.ptr(), PaddedHeightII32, Tmp32_1.ptr(), PaddedHeightII32, NcvSize32u(roi.height, WidthII));
413 ncvAssertReturnNcvStat(ncvStat);
414
415 ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)Tmp32_1.ptr(), PaddedHeightII32*sizeof(Ncv32u),
416 (Ncv32u *)d_dst, dstStep*sizeof(Ncv32u), NcvSize32u(HeightII, WidthII));
417 ncvAssertReturnNcvStat(ncvStat);
418
419 NCV_SKIP_COND_END
420
421 return NPPST_SUCCESS;
422 }
423
424
425 NCVStatus ncvSquaredIntegralImage_device(Ncv8u *d_src, Ncv32u srcStep,
426 Ncv64u *d_dst, Ncv32u dstStep, NcvSize32u roi,
427 INCVMemAllocator &gpuAllocator)
428 {
429 ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
430 ncvAssertReturn(gpuAllocator.memType() == NCVMemoryTypeDevice ||
431 gpuAllocator.memType() == NCVMemoryTypeNone, NPPST_MEM_RESIDENCE_ERROR);
432 ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
433 ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
434 ncvAssertReturn(srcStep >= roi.width &&
435 dstStep >= (roi.width + 1) * sizeof(Ncv64u) &&
436 dstStep % sizeof(Ncv64u) == 0, NPPST_INVALID_STEP);
437 dstStep /= sizeof(Ncv64u);
438
439 Ncv32u WidthII = roi.width + 1;
440 Ncv32u HeightII = roi.height + 1;
441 Ncv32u PaddedWidthII32 = getPaddedDimension(WidthII, sizeof(Ncv32u), gpuAllocator.alignment());
442 Ncv32u PaddedHeightII32 = getPaddedDimension(HeightII, sizeof(Ncv32u), gpuAllocator.alignment());
443 Ncv32u PaddedWidthII64 = getPaddedDimension(WidthII, sizeof(Ncv64u), gpuAllocator.alignment());
444 Ncv32u PaddedHeightII64 = getPaddedDimension(HeightII, sizeof(Ncv64u), gpuAllocator.alignment());
445 Ncv32u PaddedWidthMax = PaddedWidthII32 > PaddedWidthII64 ? PaddedWidthII32 : PaddedWidthII64;
446 Ncv32u PaddedHeightMax = PaddedHeightII32 > PaddedHeightII64 ? PaddedHeightII32 : PaddedHeightII64;
447
448 NCVMatrixAlloc<Ncv32u> Tmp32_1(gpuAllocator, PaddedWidthII32, PaddedHeightII32);
449 ncvAssertReturn(Tmp32_1.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
450 NCVMatrixAlloc<Ncv64u> Tmp64(gpuAllocator, PaddedWidthMax, PaddedHeightMax);
451 ncvAssertReturn(Tmp64.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
452
453 NCVMatrixReuse<Ncv32u> Tmp32_2(Tmp64.getSegment(), gpuAllocator.alignment(), PaddedWidthII32, PaddedHeightII32);
454 ncvAssertReturn(Tmp32_2.isMemReused(), NPPST_MEM_INTERNAL_ERROR);
455 NCVMatrixReuse<Ncv64u> Tmp64_2(Tmp64.getSegment(), gpuAllocator.alignment(), PaddedWidthII64, PaddedHeightII64);
456 ncvAssertReturn(Tmp64_2.isMemReused(), NPPST_MEM_INTERNAL_ERROR);
457
458 NCVStatus ncvStat;
459 NCV_SET_SKIP_COND(gpuAllocator.isCounting());
460
461 NCV_SKIP_COND_BEGIN
462
463 ncvStat = scanRowsWrapperDevice
464 <true, Ncv8u, Ncv32u>
465 (d_src, srcStep, Tmp32_2.ptr(), PaddedWidthII32, roi);
466 ncvAssertReturnNcvStat(ncvStat);
467
468 ncvStat = nppiStTranspose_32u_C1R(Tmp32_2.ptr(), PaddedWidthII32*sizeof(Ncv32u),
469 Tmp32_1.ptr(), PaddedHeightII32*sizeof(Ncv32u), NcvSize32u(WidthII, roi.height));
470 ncvAssertReturnNcvStat(ncvStat);
471
472 ncvStat = scanRowsWrapperDevice
473 <false, Ncv32u, Ncv64u>
474 (Tmp32_1.ptr(), PaddedHeightII32, Tmp64_2.ptr(), PaddedHeightII64, NcvSize32u(roi.height, WidthII));
475 ncvAssertReturnNcvStat(ncvStat);
476
477 ncvStat = nppiStTranspose_64u_C1R(Tmp64_2.ptr(), PaddedHeightII64*sizeof(Ncv64u),
478 d_dst, dstStep*sizeof(Ncv64u), NcvSize32u(HeightII, WidthII));
479 ncvAssertReturnNcvStat(ncvStat);
480
481 NCV_SKIP_COND_END
482
483 return NPPST_SUCCESS;
484 }
485
486
487 NCVStatus nppiStIntegralGetSize_8u32u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
488 {
489 ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
490 ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
491
492 NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
493 ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
494
495 NCVStatus ncvStat = ncvIntegralImage_device((Ncv8u*)NULL, roiSize.width,
496 (Ncv32u*)NULL, (roiSize.width+1) * sizeof(Ncv32u),
497 roiSize, gpuCounter);
498 ncvAssertReturnNcvStat(ncvStat);
499
500 *pBufsize = (Ncv32u)gpuCounter.maxSize();
501 return NPPST_SUCCESS;
502 }
503
504
505 NCVStatus nppiStIntegralGetSize_32f32f(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
506 {
507 ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
508 ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
509
510 NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
511 ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
512
513 NCVStatus ncvStat = ncvIntegralImage_device((Ncv32f*)NULL, roiSize.width * sizeof(Ncv32f),
514 (Ncv32f*)NULL, (roiSize.width+1) * sizeof(Ncv32f),
515 roiSize, gpuCounter);
516 ncvAssertReturnNcvStat(ncvStat);
517
518 *pBufsize = (Ncv32u)gpuCounter.maxSize();
519 return NPPST_SUCCESS;
520 }
521
522
523 NCVStatus nppiStSqrIntegralGetSize_8u64u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
524 {
525 ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
526 ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
527
528 NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
529 ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
530
531 NCVStatus ncvStat = ncvSquaredIntegralImage_device(NULL, roiSize.width,
532 NULL, (roiSize.width+1) * sizeof(Ncv64u),
533 roiSize, gpuCounter);
534 ncvAssertReturnNcvStat(ncvStat);
535
536 *pBufsize = (Ncv32u)gpuCounter.maxSize();
537 return NPPST_SUCCESS;
538 }
539
540
541 NCVStatus nppiStIntegral_8u32u_C1R(Ncv8u *d_src, Ncv32u srcStep,
542 Ncv32u *d_dst, Ncv32u dstStep,
543 NcvSize32u roiSize, Ncv8u *pBuffer,
544 Ncv32u bufSize, cudaDeviceProp &devProp)
545 {
546 NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
547 ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
548
549 NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
550 ncvAssertReturnNcvStat(ncvStat);
551
552 return NPPST_SUCCESS;
553 }
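//Minimal usage sketch (illustrative only; error handling omitted, and the device buffers
//d_image / d_integral as well as srcPitch are assumed to be set up by the caller):
//
//    cudaDeviceProp devProp;
//    cudaGetDeviceProperties(&devProp, 0);
//
//    NcvSize32u roi;
//    roi.width = width; roi.height = height;
//
//    Ncv32u bufSize = 0;
//    nppiStIntegralGetSize_8u32u(roi, &bufSize, devProp);   //query scratch buffer size
//
//    Ncv8u *pBuffer = NULL;
//    cudaMalloc((void **)&pBuffer, bufSize);                //allocate scratch buffer
//
//    nppiStIntegral_8u32u_C1R(d_image, srcPitch,
//                             d_integral, (roi.width + 1) * sizeof(Ncv32u),
//                             roi, pBuffer, bufSize, devProp);
//    cudaFree(pBuffer);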
554
555
556 NCVStatus nppiStIntegral_32f32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
557 Ncv32f *d_dst, Ncv32u dstStep,
558 NcvSize32u roiSize, Ncv8u *pBuffer,
559 Ncv32u bufSize, cudaDeviceProp &devProp)
560 {
561 NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
562 ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
563
564 NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
565 ncvAssertReturnNcvStat(ncvStat);
566
567 return NPPST_SUCCESS;
568 }
569
570
571 NCVStatus nppiStSqrIntegral_8u64u_C1R(Ncv8u *d_src, Ncv32u srcStep,
572 Ncv64u *d_dst, Ncv32u dstStep,
573 NcvSize32u roiSize, Ncv8u *pBuffer,
574 Ncv32u bufSize, cudaDeviceProp &devProp)
575 {
576 NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
577 ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
578
579 NCVStatus ncvStat = ncvSquaredIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
580 ncvAssertReturnNcvStat(ncvStat);
581
582 return NPPST_SUCCESS;
583 }
584
585
586 NCVStatus nppiStIntegral_8u32u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
587 Ncv32u *h_dst, Ncv32u dstStep,
588 NcvSize32u roiSize)
589 {
590 ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
591 ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
592 ncvAssertReturn(srcStep >= roiSize.width &&
593 dstStep >= (roiSize.width + 1) * sizeof(Ncv32u) &&
594 dstStep % sizeof(Ncv32u) == 0, NPPST_INVALID_STEP);
595 dstStep /= sizeof(Ncv32u);
596
597 Ncv32u WidthII = roiSize.width + 1;
598 Ncv32u HeightII = roiSize.height + 1;
599
600 memset(h_dst, 0, WidthII * sizeof(Ncv32u));
601 for (Ncv32u i=1; i<HeightII; i++)
602 {
603 h_dst[i * dstStep] = 0;
604 for (Ncv32u j=1; j<WidthII; j++)
605 {
606 Ncv32u top = h_dst[(i-1) * dstStep + j];
607 Ncv32u left = h_dst[i * dstStep + (j - 1)];
608 Ncv32u topleft = h_dst[(i - 1) * dstStep + (j - 1)];
609 Ncv32u elem = h_src[(i - 1) * srcStep + (j - 1)];
610 h_dst[i * dstStep + j] = elem + left - topleft + top;
611 }
612 }
613
614 return NPPST_SUCCESS;
615 }
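//The host reference above implements the standard integral-image recurrence with a zero
//top row and left column:
//
//    II(i, j) = src(i-1, j-1) + II(i, j-1) + II(i-1, j) - II(i-1, j-1)
//
//e.g. a 2x2 source {{1, 2}, {3, 4}} produces the 3x3 result
//    0 0  0
//    0 1  3
//    0 4 10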
616
617
618 NCVStatus nppiStIntegral_32f32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
619 Ncv32f *h_dst, Ncv32u dstStep,
620 NcvSize32u roiSize)
621 {
622 ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
623 ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
624 ncvAssertReturn(srcStep >= roiSize.width * sizeof(Ncv32f) &&
625 dstStep >= (roiSize.width + 1) * sizeof(Ncv32f) &&
626 srcStep % sizeof(Ncv32f) == 0 &&
627 dstStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
628 srcStep /= sizeof(Ncv32f);
629 dstStep /= sizeof(Ncv32f);
630
631 Ncv32u WidthII = roiSize.width + 1;
632 Ncv32u HeightII = roiSize.height + 1;
633
634 memset(h_dst, 0, WidthII * sizeof(Ncv32u));
635 for (Ncv32u i=1; i<HeightII; i++)
636 {
637 h_dst[i * dstStep] = 0.0f;
638 for (Ncv32u j=1; j<WidthII; j++)
639 {
640 Ncv32f top = h_dst[(i-1) * dstStep + j];
641 Ncv32f left = h_dst[i * dstStep + (j - 1)];
642 Ncv32f topleft = h_dst[(i - 1) * dstStep + (j - 1)];
643 Ncv32f elem = h_src[(i - 1) * srcStep + (j - 1)];
644 h_dst[i * dstStep + j] = elem + left - topleft + top;
645 }
646 }
647
648 return NPPST_SUCCESS;
649 }
650
651
652 NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
653 Ncv64u *h_dst, Ncv32u dstStep,
654 NcvSize32u roiSize)
655 {
656 ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
657 ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
658 ncvAssertReturn(srcStep >= roiSize.width &&
659 dstStep >= (roiSize.width + 1) * sizeof(Ncv64u) &&
660 dstStep % sizeof(Ncv64u) == 0, NPPST_INVALID_STEP);
661 dstStep /= sizeof(Ncv64u);
662
663 Ncv32u WidthII = roiSize.width + 1;
664 Ncv32u HeightII = roiSize.height + 1;
665
666 memset(h_dst, 0, WidthII * sizeof(Ncv64u));
667 for (Ncv32u i=1; i<HeightII; i++)
668 {
669 h_dst[i * dstStep] = 0;
670 for (Ncv32u j=1; j<WidthII; j++)
671 {
672 Ncv64u top = h_dst[(i-1) * dstStep + j];
673 Ncv64u left = h_dst[i * dstStep + (j - 1)];
674 Ncv64u topleft = h_dst[(i - 1) * dstStep + (j - 1)];
675 Ncv64u elem = h_src[(i - 1) * srcStep + (j - 1)];
676 h_dst[i * dstStep + j] = elem*elem + left - topleft + top;
677 }
678 }
679
680 return NPPST_SUCCESS;
681 }
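//Note: the squared integral image accumulates into 64-bit cells because 32 bits overflow
//quickly: a single squared 8-bit pixel contributes up to 255 * 255 = 65025, so summing
//over even a ~260x260 block of saturated pixels already exceeds 2^32 - 1.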
682
683
684 //==============================================================================
685 //
686 // Decimate.cu
687 //
688 //==============================================================================
689
690
691 const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
692 const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
693
694
695 template<class T, NcvBool tbCacheTexture>
696 __device__ T getElem_Decimate(Ncv32u x, T *d_src);
697
698
699 template<>
700 __device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
701 {
702 return tex1Dfetch(tex32u, x);
703 }
704
705
706 template<>
707 __device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
708 {
709 return d_src[x];
710 }
711
712
713 template<>
714 __device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
715 {
716 uint2 tmp = tex1Dfetch(tex64u, x);
717 Ncv64u res = (Ncv64u)tmp.y;
718 res <<= 32;
719 res |= tmp.x;
720 return res;
721 }
722
723
724 template<>
725 __device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
726 {
727 return d_src[x];
728 }
729
730
731 template <class T, NcvBool tbCacheTexture>
732 __global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
733 NcvSize32u dstRoi, Ncv32u scale)
734 {
735 int curX = blockIdx.x * blockDim.x + threadIdx.x;
736 int curY = blockIdx.y * blockDim.y + threadIdx.y;
737
738 if (curX >= dstRoi.width || curY >= dstRoi.height)
739 {
740 return;
741 }
742
743 d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
744 }
745
746
747 template <class T>
748 static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
749 T *d_dst, Ncv32u dstStep,
750 NcvSize32u srcRoi, Ncv32u scale,
751 NcvBool readThruTexture)
752 {
753 ncvAssertReturn(d_src != NULL && d_dst != NULL, NPPST_NULL_POINTER_ERROR);
754 ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
755 ncvAssertReturn(scale != 0, NPPST_INVALID_SCALE);
756 ncvAssertReturn(srcStep >= (Ncv32u)(srcRoi.width) * sizeof(T) &&
757 dstStep >= (Ncv32u)(srcRoi.width * sizeof(T) / scale), NPPST_INVALID_STEP);
758 srcStep /= sizeof(T);
759 dstStep /= sizeof(T);
760
761 NcvSize32u dstRoi;
762 dstRoi.width = srcRoi.width / scale;
763 dstRoi.height = srcRoi.height / scale;
764
765 dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
766 (dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
767 dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
768
769 if (!readThruTexture)
770 {
771 decimate_C1R
772 <T, false>
773 <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
774 (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
775 }
776 else
777 {
778 cudaChannelFormatDesc cfdTexSrc;
779
780 if (sizeof(T) == sizeof(Ncv32u))
781 {
782 cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
783
784 size_t alignmentOffset;
785 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
786 ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
787 }
788 else
789 {
790 cfdTexSrc = cudaCreateChannelDesc<uint2>();
791
792 size_t alignmentOffset;
793 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
794 ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
795 }
796
797 decimate_C1R
798 <T, true>
799 <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
800 (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
801 }
802
803 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
804
805 return NPPST_SUCCESS;
806 }
807
808
809 template <class T>
810 static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
811 T *h_dst, Ncv32u dstStep,
812 NcvSize32u srcRoi, Ncv32u scale)
813 {
814 ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
815 ncvAssertReturn(srcRoi.width != 0 && srcRoi.height != 0, NPPST_INVALID_ROI);
816 ncvAssertReturn(scale != 0, NPPST_INVALID_SCALE);
817 ncvAssertReturn(srcStep >= (Ncv32u)(srcRoi.width) * sizeof(T) &&
818 dstStep >= (Ncv32u)(srcRoi.width * sizeof(T) / scale) &&
819 srcStep % sizeof(T) == 0 && dstStep % sizeof(T) == 0, NPPST_INVALID_STEP);
820 srcStep /= sizeof(T);
821 dstStep /= sizeof(T);
822
823 NcvSize32u dstRoi;
824 dstRoi.width = srcRoi.width / scale;
825 dstRoi.height = srcRoi.height / scale;
826
827 for (Ncv32u i=0; i<dstRoi.height; i++)
828 {
829 for (Ncv32u j=0; j<dstRoi.width; j++)
830 {
831 h_dst[i*dstStep+j] = h_src[i*scale*srcStep + j*scale];
832 }
833 }
834
835 return NPPST_SUCCESS;
836 }
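//Illustrative example: with scale = 2 the host wrapper keeps every second element of
//every second row, so a 4x4 source
//    a b c d
//    e f g h
//    i j k l
//    m n o p
//produces the 2x2 result {a, c; i, k} (nearest-neighbor decimation, no filtering).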
837
838
839 #define implementNppDecimate(bit, typ) \
840 NCVStatus nppiStDecimate_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
841 Ncv##bit##typ *d_dst, Ncv32u dstStep, \
842 NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture) \
843 { \
844 return decimateWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
845 (Ncv##bit##u *)d_dst, dstStep, \
846 srcRoi, scale, readThruTexture); \
847 }
848
849
850 #define implementNppDecimateHost(bit, typ) \
851 NCVStatus nppiStDecimate_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
852 Ncv##bit##typ *h_dst, Ncv32u dstStep, \
853 NcvSize32u srcRoi, Ncv32u scale) \
854 { \
855 return decimateWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
856 (Ncv##bit##u *)h_dst, dstStep, \
857 srcRoi, scale); \
858 }
859
860
861 implementNppDecimate(32, u)
862 implementNppDecimate(32, s)
863 implementNppDecimate(32, f)
864 implementNppDecimate(64, u)
865 implementNppDecimate(64, s)
866 implementNppDecimate(64, f)
867 implementNppDecimateHost(32, u)
868 implementNppDecimateHost(32, s)
869 implementNppDecimateHost(32, f)
870 implementNppDecimateHost(64, u)
871 implementNppDecimateHost(64, s)
872 implementNppDecimateHost(64, f)
873
874
875 //==============================================================================
876 //
877 // RectStdDev.cu
878 //
879 //==============================================================================
880
881
882 const Ncv32u NUM_RECTSTDDEV_THREADS = 128;
883
884
885 template <NcvBool tbCacheTexture>
886 __device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum)
887 {
888 if (tbCacheTexture)
889 {
890 return tex1Dfetch(tex32u, x);
891 }
892 else
893 {
894 return d_sum[x];
895 }
896 }
897
898
899 template <NcvBool tbCacheTexture>
900 __device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum)
901 {
902 if (tbCacheTexture)
903 {
904 uint2 tmp = tex1Dfetch(tex64u, x);
905 Ncv64u res = (Ncv64u)tmp.y;
906 res <<= 32;
907 res |= tmp.x;
908 return res;
909 }
910 else
911 {
912 return d_sqsum[x];
913 }
914 }
915
916
917 template <NcvBool tbCacheTexture>
918 __global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
919 Ncv64u *d_sqsum, Ncv32u sqsumStep,
920 Ncv32f *d_norm, Ncv32u normStep,
921 NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
922 {
923 Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x;
924 if (x_offs >= roi.width)
925 {
926 return;
927 }
928
929 Ncv32u sum_offset = blockIdx.y * sumStep + x_offs;
930 Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs;
931
932 //OPT: try swapping order (could change cache hit/miss ratio)
933 Ncv32u sum_tl = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x, d_sum);
934 Ncv32u sum_bl = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
935 Ncv32u sum_tr = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
936 Ncv32u sum_br = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
937 Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl;
938
939 Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br;
940 sqsum_tl = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
941 sqsum_bl = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
942 sqsum_tr = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
943 sqsum_br = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
944 Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl;
945
946 Ncv32f mean = sum_val * invRectArea;
947
948 //////////////////////////////////////////////////////////////////////////
949 // sqsum_val_res = sqsum_val / rectArea
950 //////////////////////////////////////////////////////////////////////////
951
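    //Precision note: a 64-bit sqsum generally cannot be represented exactly in a 32-bit
    //float, so it is split into a float-representable part (sqsum_val_1 / sqsum_val_2)
    //and the rounding residual (sqsum_val_3); both parts are scaled by invRectArea
    //separately and then recombined to reduce the rounding error.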
952 Ncv32f sqsum_val_1 = __ull2float_rz(sqsum_val);
953 Ncv64u sqsum_val_2 = __float2ull_rz(sqsum_val_1);
954 Ncv64u sqsum_val_3 = sqsum_val - sqsum_val_2;
955 Ncv32f sqsum_val_4 = __ull2float_rn(sqsum_val_3);
956 sqsum_val_1 *= invRectArea;
957 sqsum_val_4 *= invRectArea;
958 Ncv32f sqsum_val_res = sqsum_val_1 + sqsum_val_4;
959
960 //////////////////////////////////////////////////////////////////////////
961 // variance = sqsum_val_res - mean * mean
962 //////////////////////////////////////////////////////////////////////////
963
964 #if defined DISABLE_MAD_SELECTIVELY
965 Ncv32f variance = sqsum_val_res - __fmul_rn(mean, mean);
966 #else
967 Ncv32f variance = sqsum_val_res - mean * mean;
968 #endif
969
970 //////////////////////////////////////////////////////////////////////////
971 // stddev = sqrtf(variance)
972 //////////////////////////////////////////////////////////////////////////
973
974 //Ncv32f stddev = sqrtf(variance);
975 Ncv32f stddev = __fsqrt_rn(variance);
976
977 d_norm[blockIdx.y * normStep + x_offs] = stddev;
978 }
979
980
981 NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
982 Ncv64u *d_sqsum, Ncv32u sqsumStep,
983 Ncv32f *d_norm, Ncv32u normStep,
984 NcvSize32u roi, NcvRect32u rect,
985 Ncv32f scaleArea, NcvBool readThruTexture)
986 {
987 ncvAssertReturn(d_sum != NULL && d_sqsum != NULL && d_norm != NULL, NPPST_NULL_POINTER_ERROR);
988 ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
989 ncvAssertReturn(sumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv32u) &&
990 sqsumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv64u) &&
991 normStep >= (Ncv32u)roi.width * sizeof(Ncv32f) &&
992 sumStep % sizeof(Ncv32u) == 0 &&
993 sqsumStep % sizeof(Ncv64u) == 0 &&
994 normStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
995 ncvAssertReturn(scaleArea >= 1.0f, NPPST_INVALID_SCALE);
996 sumStep /= sizeof(Ncv32u);
997 sqsumStep /= sizeof(Ncv64u);
998 normStep /= sizeof(Ncv32f);
999
1000 Ncv32f rectArea = rect.width * rect.height * scaleArea;
1001 Ncv32f invRectArea = 1.0f / rectArea;
1002
1003 dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height);
1004 dim3 block(NUM_RECTSTDDEV_THREADS);
1005
1006 if (!readThruTexture)
1007 {
1008 rectStdDev_32f_C1R
1009 <false>
1010 <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1011 (d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
1012 }
1013 else
1014 {
1015 cudaChannelFormatDesc cfdTexSrc;
1016 cudaChannelFormatDesc cfdTexSqr;
1017 cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
1018 cfdTexSqr = cudaCreateChannelDesc<uint2>();
1019
1020 size_t alignmentOffset;
1021 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_sum, cfdTexSrc, (roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u)), NPPST_TEXTURE_BIND_ERROR);
1022 ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
1023 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_sqsum, cfdTexSqr, (roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u)), NPPST_TEXTURE_BIND_ERROR);
1024 ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
1025
1026 rectStdDev_32f_C1R
1027 <true>
1028 <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1029 (NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
1030 }
1031
1032 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1033
1034 return NPPST_SUCCESS;
1035 }
1036
1037
1038 NCVStatus nppiStRectStdDev_32f_C1R_host(Ncv32u *h_sum, Ncv32u sumStep,
1039 Ncv64u *h_sqsum, Ncv32u sqsumStep,
1040 Ncv32f *h_norm, Ncv32u normStep,
1041 NcvSize32u roi, NcvRect32u rect,
1042 Ncv32f scaleArea)
1043 {
1044 ncvAssertReturn(h_sum != NULL && h_sqsum != NULL && h_norm != NULL, NPPST_NULL_POINTER_ERROR);
1045 ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
1046 ncvAssertReturn(sumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv32u) &&
1047 sqsumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv64u) &&
1048 normStep >= (Ncv32u)roi.width * sizeof(Ncv32f) &&
1049 sumStep % sizeof(Ncv32u) == 0 &&
1050 sqsumStep % sizeof(Ncv64u) == 0 &&
1051 normStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
1052 ncvAssertReturn(scaleArea >= 1.0f, NPPST_INVALID_SCALE);
1053 sumStep /= sizeof(Ncv32u);
1054 sqsumStep /= sizeof(Ncv64u);
1055 normStep /= sizeof(Ncv32f);
1056
1057 Ncv32f rectArea = rect.width * rect.height * scaleArea;
1058 Ncv32f invRectArea = 1.0f / rectArea;
1059
1060 for (Ncv32u i=0; i<roi.height; i++)
1061 {
1062 for (Ncv32u j=0; j<roi.width; j++)
1063 {
1064 Ncv32u sum_offset = i * sumStep + j;
1065 Ncv32u sqsum_offset = i * sqsumStep + j;
1066
1067 Ncv32u sum_tl = h_sum[sum_offset + rect.y * sumStep + rect.x];
1068 Ncv32u sum_bl = h_sum[sum_offset + (rect.y + rect.height) * sumStep + rect.x];
1069 Ncv32u sum_tr = h_sum[sum_offset + rect.y * sumStep + rect.x + rect.width];
1070 Ncv32u sum_br = h_sum[sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width];
1071 Ncv64f sum_val = sum_br + sum_tl - sum_tr - sum_bl;
1072
1073 Ncv64u sqsum_tl = h_sqsum[sqsum_offset + rect.y * sqsumStep + rect.x];
1074 Ncv64u sqsum_bl = h_sqsum[sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x];
1075 Ncv64u sqsum_tr = h_sqsum[sqsum_offset + rect.y * sqsumStep + rect.x + rect.width];
1076 Ncv64u sqsum_br = h_sqsum[sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width];
1077 Ncv64f sqsum_val = (Ncv64f)(sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl);
1078
1079 Ncv64f mean = sum_val * invRectArea;
1080 Ncv64f sqsum_val_2 = sqsum_val / rectArea;
1081 Ncv64f variance = sqsum_val_2 - mean * mean;
1082
1083 h_norm[i * normStep + j] = (Ncv32f)sqrt(variance);
1084 }
1085 }
1086
1087 return NPPST_SUCCESS;
1088 }
1089
1090
1091 //==============================================================================
1092 //
1093 // Transpose.cu
1094 //
1095 //==============================================================================
1096
1097
1098 const Ncv32u TRANSPOSE_TILE_DIM = 16;
1099 const Ncv32u TRANSPOSE_BLOCK_ROWS = 16;
1100
1101
1102 /**
1103 * \brief Matrix transpose kernel
1104 *
1105 * Calculates transpose of the input image
1106 * \see TRANSPOSE_TILE_DIM
1107 *
1108 * \tparam T Type of input and output image elements
1109 *
1110 *
1111 * \param d_src [IN] Source image pointer
1112 * \param srcStride [IN] Source image stride
1113 * \param d_dst [OUT] Output image pointer
1114 * \param dstStride [IN] Output image stride
1115 *
1116 * \return None
1117 */
1118 template <class T>
1119 __global__ void transpose(T *d_src, Ncv32u srcStride,
1120 T *d_dst, Ncv32u dstStride, NcvSize32u srcRoi)
1121 {
1122 __shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM+1];
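    //The "+1" column pads the shared-memory tile so that the column-wise reads during
    //the transposed write hit different banks, avoiding shared-memory bank conflicts;
    //the diagonal block reordering below likewise spreads global-memory accesses across
    //partitions (the classic CUDA transpose optimization).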
1123
1124 Ncv32u blockIdx_x, blockIdx_y;
1125
1126 // do diagonal reordering
1127 if (gridDim.x == gridDim.y)
1128 {
1129 blockIdx_y = blockIdx.x;
1130 blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
1131 }
1132 else
1133 {
1134 Ncv32u bid = blockIdx.x + gridDim.x * blockIdx.y;
1135 blockIdx_y = bid % gridDim.y;
1136 blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
1137 }
1138
1139 Ncv32u xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
1140 Ncv32u yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
1141 Ncv32u index_gmem = xIndex + yIndex * srcStride;
1142
1143 if (xIndex < srcRoi.width)
1144 {
1145 for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
1146 {
1147 if (yIndex + i < srcRoi.height)
1148 {
1149 tile[threadIdx.y+i][threadIdx.x] = d_src[index_gmem+i*srcStride];
1150 }
1151 }
1152 }
1153
1154 __syncthreads();
1155
1156 xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
1157 yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
1158 index_gmem = xIndex + yIndex * dstStride;
1159
1160 if (xIndex < srcRoi.height)
1161 {
1162 for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
1163 {
1164 if (yIndex + i < srcRoi.width)
1165 {
1166 d_dst[index_gmem+i*dstStride] = tile[threadIdx.x][threadIdx.y+i];
1167 }
1168 }
1169 }
1170 }
1171
1172
1173 template <class T>
1174 NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
1175 T *d_dst, Ncv32u dstStride, NcvSize32u srcRoi)
1176 {
1177 ncvAssertReturn(d_src != NULL && d_dst != NULL, NPPST_NULL_POINTER_ERROR);
1178 ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
1179 ncvAssertReturn(srcStride >= srcRoi.width * sizeof(T) &&
1180 dstStride >= srcRoi.height * sizeof(T) &&
1181 srcStride % sizeof(T) == 0 && dstStride % sizeof(T) == 0, NPPST_INVALID_STEP);
1182 srcStride /= sizeof(T);
1183 dstStride /= sizeof(T);
1184
1185 dim3 grid((srcRoi.width + TRANSPOSE_TILE_DIM - 1) / TRANSPOSE_TILE_DIM,
1186 (srcRoi.height + TRANSPOSE_TILE_DIM - 1) / TRANSPOSE_TILE_DIM);
1187 dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM);
1188 transpose
1189 <T>
1190 <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1191 (d_src, srcStride, d_dst, dstStride, srcRoi);
1192 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1193
1194 return NPPST_SUCCESS;
1195 }
1196
1197
1198 template <class T>
1199 static NCVStatus transposeWrapperHost(T *h_src, Ncv32u srcStride,
1200 T *h_dst, Ncv32u dstStride, NcvSize32u srcRoi)
1201 {
1202 ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
1203 ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
1204 ncvAssertReturn(srcStride >= srcRoi.width * sizeof(T) &&
1205 dstStride >= srcRoi.height * sizeof(T) &&
1206 srcStride % sizeof(T) == 0 && dstStride % sizeof(T) == 0, NPPST_INVALID_STEP);
1207 srcStride /= sizeof(T);
1208 dstStride /= sizeof(T);
1209
1210 for (Ncv32u i=0; i<srcRoi.height; i++)
1211 {
1212 for (Ncv32u j=0; j<srcRoi.width; j++)
1213 {
1214 h_dst[j*dstStride+i] = h_src[i*srcStride + j];
1215 }
1216 }
1217
1218 return NPPST_SUCCESS;
1219 }
1220
1221
1222 #define implementNppTranspose(bit, typ) \
1223 NCVStatus nppiStTranspose_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
1224 Ncv##bit##typ *d_dst, Ncv32u dstStep, NcvSize32u srcRoi) \
1225 { \
1226 return transposeWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
1227 (Ncv##bit##u *)d_dst, dstStep, srcRoi); \
1228 }
1229
1230
1231 #define implementNppTransposeHost(bit, typ) \
1232 NCVStatus nppiStTranspose_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
1233 Ncv##bit##typ *h_dst, Ncv32u dstStep, \
1234 NcvSize32u srcRoi) \
1235 { \
1236 return transposeWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
1237 (Ncv##bit##u *)h_dst, dstStep, srcRoi); \
1238 }
1239
1240
1241 implementNppTranspose(32,u)
1242 implementNppTranspose(32,s)
1243 implementNppTranspose(32,f)
1244 implementNppTranspose(64,u)
1245 implementNppTranspose(64,s)
1246 implementNppTranspose(64,f)
1247
1248 implementNppTransposeHost(32,u)
1249 implementNppTransposeHost(32,s)
1250 implementNppTransposeHost(32,f)
1251 implementNppTransposeHost(64,u)
1252 implementNppTransposeHost(64,s)
1253 implementNppTransposeHost(64,f)
1254
1255
1256 NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
1257 void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
1258 {
1259 return transposeWrapperDevice<uint4>((uint4 *)d_src, srcStep, (uint4 *)d_dst, dstStep, srcRoi);
1260 }
1261
1262
1263 NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
1264 void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
1265 {
1266 return transposeWrapperHost<uint4>((uint4 *)d_src, srcStep, (uint4 *)d_dst, dstStep, srcRoi);
1267 }
1268
1269
1270 //==============================================================================
1271 //
1272 // Compact.cu
1273 //
1274 //==============================================================================
1275
1276
1277 const Ncv32u NUM_REMOVE_THREADS = 256;
1278
1279
1280 template <bool bRemove, bool bWritePartial>
1281 __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
1282 Ncv32u *d_offsets, Ncv32u *d_blockSums,
1283 Ncv32u elemRemove)
1284 {
1285 Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
1286 Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
1287
1288 if (elemAddrIn > srcLen + blockDim.x)
1289 {
1290 return;
1291 }
1292
1293 __shared__ Ncv32u shmem[NUM_REMOVE_THREADS * 2];
1294
1295 Ncv32u scanElem = 0;
1296 if (elemAddrIn < srcLen)
1297 {
1298 if (bRemove)
1299 {
1300 scanElem = (d_src[elemAddrIn] != elemRemove) ? 1 : 0;
1301 }
1302 else
1303 {
1304 scanElem = d_src[elemAddrIn];
1305 }
1306 }
1307
1308 Ncv32u localScanInc = blockScanInclusive<Ncv32u, NUM_REMOVE_THREADS>(scanElem, shmem);
1309 __syncthreads();
1310
1311 if (elemAddrIn < srcLen)
1312 {
1313 if (threadIdx.x == NUM_REMOVE_THREADS-1 && bWritePartial)
1314 {
1315 d_blockSums[blockId] = localScanInc;
1316 }
1317
1318 if (bRemove)
1319 {
1320 d_offsets[elemAddrIn] = localScanInc - scanElem;
1321 }
1322 else
1323 {
1324 d_src[elemAddrIn] = localScanInc - scanElem;
1325 }
1326 }
1327 }
1328
1329
1330 __global__ void removePass2Adjust(Ncv32u *d_offsets, Ncv32u srcLen, Ncv32u *d_blockSums)
1331 {
1332 Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
1333 Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
1334 if (elemAddrIn >= srcLen)
1335 {
1336 return;
1337 }
1338
1339 __shared__ Ncv32u valOffs;
1340 valOffs = d_blockSums[blockId];
1341 __syncthreads();
1342
1343 d_offsets[elemAddrIn] += valOffs;
1344 }
1345
1346
1347 __global__ void removePass3Compact(Ncv32u *d_src, Ncv32u srcLen,
1348 Ncv32u *d_offsets, Ncv32u *d_dst,
1349 Ncv32u elemRemove, Ncv32u *dstLenValue)
1350 {
1351 Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
1352 Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x;
1353 if (elemAddrIn >= srcLen)
1354 {
1355 return;
1356 }
1357
1358 Ncv32u elem = d_src[elemAddrIn];
1359 Ncv32u elemAddrOut = d_offsets[elemAddrIn];
1360 if (elem != elemRemove)
1361 {
1362 d_dst[elemAddrOut] = elem;
1363 }
1364
1365 if (elemAddrIn == srcLen-1)
1366 {
1367 if (elem != elemRemove)
1368 {
1369 *dstLenValue = elemAddrOut + 1;
1370 }
1371 else
1372 {
1373 *dstLenValue = elemAddrOut;
1374 }
1375 }
1376 }
1377
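//Overview of the compaction pipeline driven by compactVector_32u_device below:
//  pass 1 (removePass1Scan)    - per block, scan the "keep" flags (elem != elemRemove)
//                                into local output offsets and per-block totals;
//  pass 2 (removePass2Adjust)  - add the scanned block totals so the offsets become global;
//  pass 3 (removePass3Compact) - scatter surviving elements to d_dst and store the final
//                                count in *dstLenValue.
//Small example with elemRemove = X: src = {5, X, 7, X, 9} -> offsets = {0, 1, 1, 2, 2}
//-> dst = {5, 7, 9}, dstLen = 3.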
1378
1379 NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
1380 Ncv32u *d_dst, Ncv32u *dstLenPinned,
1381 Ncv32u elemRemove,
1382 INCVMemAllocator &gpuAllocator)
1383 {
1384 ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
1385 ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
1386
1387 if (srcLen == 0)
1388 {
1389 if (dstLenPinned != NULL)
1390 {
1391 *dstLenPinned = 0;
1392 }
1393 return NPPST_SUCCESS;
1394 }
1395
1396 std::vector<Ncv32u> partSumNums;
1397 std::vector<Ncv32u> partSumOffsets;
1398 Ncv32u partSumLastNum = srcLen;
1399 Ncv32u partSumLastOffs = 0;
1400 do
1401 {
1402 partSumNums.push_back(partSumLastNum);
1403 partSumOffsets.push_back(partSumLastOffs);
1404
1405 Ncv32u curPartSumAlignedLength = alignUp(partSumLastNum * sizeof(Ncv32u),
1406 gpuAllocator.alignment()) / sizeof(Ncv32u);
1407 partSumLastOffs += curPartSumAlignedLength;
1408
1409 partSumLastNum = (partSumLastNum + NUM_REMOVE_THREADS - 1) / NUM_REMOVE_THREADS;
1410 }
1411 while (partSumLastNum>1);
1412 partSumNums.push_back(partSumLastNum);
1413 partSumOffsets.push_back(partSumLastOffs);
1414
1415 NCVVectorAlloc<Ncv32u> d_hierSums(gpuAllocator, partSumLastOffs+1);
1416 ncvAssertReturn(gpuAllocator.isCounting() || d_hierSums.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
1417 NCVVectorAlloc<Ncv32u> d_numDstElements(gpuAllocator, 1);
1418 ncvAssertReturn(gpuAllocator.isCounting() || d_numDstElements.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
1419
1420 NCV_SET_SKIP_COND(gpuAllocator.isCounting());
1421 NCV_SKIP_COND_BEGIN
1422
1423 dim3 block(NUM_REMOVE_THREADS);
1424
1425 //calculate zero-level partial sums for indices calculation
1426 if (partSumNums.size() > 2)
1427 {
1428 dim3 grid(partSumNums[1]);
1429
1430 if (grid.x > 65535)
1431 {
1432 grid.y = (grid.x + 65534) / 65535;
1433 grid.x = 65535;
1434 }
1435 removePass1Scan
1436 <true, true>
1437 <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1438 (d_src, srcLen,
1439 d_hierSums.ptr(),
1440 d_hierSums.ptr() + partSumOffsets[1],
1441 elemRemove);
1442
1443 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1444
1445 //calculate hierarchical partial sums
1446 for (Ncv32u i=1; i<partSumNums.size()-1; i++)
1447 {
1448 dim3 grid_partial(partSumNums[i+1]);
1449 if (grid_partial.x > 65535)
1450 {
1451 grid_partial.y = (grid_partial.x + 65534) / 65535;
1452 grid_partial.x = 65535;
1453 }
1454 if (grid_partial.x != 1)
1455 {
1456 removePass1Scan
1457 <false, true>
1458 <<<grid_partial, block, 0, nppStGetActiveCUDAstream()>>>
1459 (d_hierSums.ptr() + partSumOffsets[i],
1460 partSumNums[i], NULL,
1461 d_hierSums.ptr() + partSumOffsets[i+1],
1462 0);
1463 }
1464 else
1465 {
1466 removePass1Scan
1467 <false, false>
1468 <<<grid_partial, block, 0, nppStGetActiveCUDAstream()>>>
1469 (d_hierSums.ptr() + partSumOffsets[i],
1470 partSumNums[i], NULL,
1471 NULL,
1472 0);
1473 }
1474
1475 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1476 }
1477
1478 //adjust hierarchical partial sums
1479 for (Ncv32s i=(Ncv32s)partSumNums.size()-3; i>=0; i--)
1480 {
1481 dim3 grid_local(partSumNums[i+1]);
1482 if (grid_local.x > 65535)
1483 {
1484 grid_local.y = (grid_local.x + 65534) / 65535;
1485 grid_local.x = 65535;
1486 }
1487 removePass2Adjust
1488 <<<grid_local, block, 0, nppStGetActiveCUDAstream()>>>
1489 (d_hierSums.ptr() + partSumOffsets[i], partSumNums[i],
1490 d_hierSums.ptr() + partSumOffsets[i+1]);
1491
1492 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1493 }
1494 }
1495 else
1496 {
1497 dim3 grid_local(partSumNums[1]);
1498 removePass1Scan
1499 <true, false>
1500 <<<grid_local, block, 0, nppStGetActiveCUDAstream()>>>
1501 (d_src, srcLen,
1502 d_hierSums.ptr(),
1503 NULL, elemRemove);
1504
1505 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1506 }
1507
1508 //compact source vector using indices
1509 dim3 grid(partSumNums[1]);
1510 if (grid.x > 65535)
1511 {
1512 grid.y = (grid.x + 65534) / 65535;
1513 grid.x = 65535;
1514 }
1515 removePass3Compact
1516 <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
1517 (d_src, srcLen, d_hierSums.ptr(), d_dst,
1518 elemRemove, d_numDstElements.ptr());
1519
1520 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1521
1522 //get number of dst elements
1523 if (dstLenPinned != NULL)
1524 {
1525 ncvAssertCUDAReturn(cudaMemcpyAsync(dstLenPinned, d_numDstElements.ptr(), sizeof(Ncv32u),
1526 cudaMemcpyDeviceToHost, nppStGetActiveCUDAstream()), NPPST_MEM_RESIDENCE_ERROR);
1527 ncvAssertCUDAReturn(cudaStreamSynchronize(nppStGetActiveCUDAstream()), NPPST_MEM_RESIDENCE_ERROR);
1528 }
1529
1530 NCV_SKIP_COND_END
1531
1532 return NPPST_SUCCESS;
1533 }
1534
1535
1536 NCVStatus nppsStCompactGetSize_32u(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
1537 {
1538 ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
1539
1540 if (srcLen == 0)
1541 {
1542 *pBufsize = 0;
1543 return NPPST_SUCCESS;
1544 }
1545
1546 NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
1547 ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
1548
1549 NCVStatus ncvStat = compactVector_32u_device(NULL, srcLen, NULL, NULL, 0xC001C0DE,
1550 gpuCounter);
1551 ncvAssertReturnNcvStat(ncvStat);
1552
1553 *pBufsize = (Ncv32u)gpuCounter.maxSize();
1554 return NPPST_SUCCESS;
1555 }
1556
1557
1558 NCVStatus nppsStCompactGetSize_32s(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
1559 {
1560 return nppsStCompactGetSize_32u(srcLen, pBufsize, devProp);
1561 }
1562
1563
1564 NCVStatus nppsStCompactGetSize_32f(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp)
1565 {
1566 return nppsStCompactGetSize_32u(srcLen, pBufsize, devProp);
1567 }
1568
1569
1570 NCVStatus nppsStCompact_32u(Ncv32u *d_src, Ncv32u srcLen,
1571 Ncv32u *d_dst, Ncv32u *p_dstLen,
1572 Ncv32u elemRemove, Ncv8u *pBuffer,
1573 Ncv32u bufSize, cudaDeviceProp &devProp)
1574 {
1575 NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), pBuffer);
1576 ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
1577
1578 NCVStatus ncvStat = compactVector_32u_device(d_src, srcLen, d_dst, p_dstLen, elemRemove,
1579 gpuAllocator);
1580 ncvAssertReturnNcvStat(ncvStat);
1581
1582 return NPPST_SUCCESS;
1583 }
1584
1585
1586 NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u srcLen,
1587 Ncv32s *d_dst, Ncv32u *p_dstLen,
1588 Ncv32s elemRemove, Ncv8u *pBuffer,
1589 Ncv32u bufSize, cudaDeviceProp &devProp)
1590 {
1591 return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
1592 *(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp);
1593 }
1594
1595
1596 #if defined __GNUC__ && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 4)) // GCC newer than 3.4 supports __attribute__((__may_alias__))
1597 typedef Ncv32u __attribute__((__may_alias__)) Ncv32u_a;
1598 #else
1599 typedef Ncv32u Ncv32u_a;
1600 #endif
1601
1602 NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen,
1603 Ncv32f *d_dst, Ncv32u *p_dstLen,
1604 Ncv32f elemRemove, Ncv8u *pBuffer,
1605 Ncv32u bufSize, cudaDeviceProp &devProp)
1606 {
1607 return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen,
1608 *(Ncv32u_a *)&elemRemove, pBuffer, bufSize, devProp);
1609 }
1610
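// Illustrative sketch, not part of the original NPP_staging API: one way a caller might
// drive the device-side compaction path above. The sentinel value, the device pointers
// and the use of plain cudaMalloc for the scratch buffer are assumptions of this example;
// the buffer is sized with nppsStCompactGetSize_32u, and p_dstLen is a host pointer
// (ideally pinned) that receives the number of surviving elements.
static NCVStatus exampleCompactDevice_32u(Ncv32u *d_src, Ncv32u srcLen,
                                          Ncv32u *d_dst, Ncv32u *p_dstLen,
                                          Ncv32u elemRemove)
{
    int dev = 0;
    cudaDeviceProp devProp;
    ncvAssertCUDAReturn(cudaGetDevice(&dev), NPPST_ERROR);
    ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, dev), NPPST_ERROR);

    // query and allocate the temporary buffer required by nppsStCompact_32u
    Ncv32u bufSize = 0;
    ncvAssertReturnNcvStat(nppsStCompactGetSize_32u(srcLen, &bufSize, devProp));

    Ncv8u *d_buffer = NULL;
    ncvAssertCUDAReturn(cudaMalloc((void **)&d_buffer, bufSize), NPPST_MEM_INTERNAL_ERROR);

    NCVStatus ncvStat = nppsStCompact_32u(d_src, srcLen, d_dst, p_dstLen,
                                          elemRemove, d_buffer, bufSize, devProp);
    cudaFree(d_buffer);
    return ncvStat;
}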
1611 NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen,
1612 Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove)
1613 {
1614 ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
1615
1616 if (srcLen == 0)
1617 {
1618 if (dstLen != NULL)
1619 {
1620 *dstLen = 0;
1621 }
1622 return NPPST_SUCCESS;
1623 }
1624
1625 Ncv32u dstIndex = 0;
1626 for (Ncv32u srcIndex=0; srcIndex<srcLen; srcIndex++)
1627 {
1628 if (h_src[srcIndex] != elemRemove)
1629 {
1630 h_dst[dstIndex++] = h_src[srcIndex];
1631 }
1632 }
1633
1634 if (dstLen != NULL)
1635 {
1636 *dstLen = dstIndex;
1637 }
1638
1639 return NPPST_SUCCESS;
1640 }
1641
1642
1643 NCVStatus nppsStCompact_32s_host(Ncv32s *h_src, Ncv32u srcLen,
1644 Ncv32s *h_dst, Ncv32u *dstLen, Ncv32s elemRemove)
1645 {
1646 return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
1647 }
1648
1649
1650 NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
1651 Ncv32f *h_dst, Ncv32u *dstLen, Ncv32f elemRemove)
1652 {
1653 return nppsStCompact_32u_host((Ncv32u *)h_src, srcLen, (Ncv32u *)h_dst, dstLen, *(Ncv32u_a *)&elemRemove);
1654 }
1655
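// Illustrative sketch, not part of the original NPP_staging API: in-place removal of a
// sentinel value from a host-side std::vector using the reference implementation above.
// The sentinel value 0xFFFFFFFF is an assumption of this example.
static NCVStatus exampleCompactHost_32u(std::vector<Ncv32u> &values)
{
    if (values.empty())
    {
        return NPPST_SUCCESS;
    }

    Ncv32u dstLen = 0;
    std::vector<Ncv32u> compacted(values.size());
    NCVStatus ncvStat = nppsStCompact_32u_host(&values[0], (Ncv32u)values.size(),
                                               &compacted[0], &dstLen, 0xFFFFFFFFu);
    ncvAssertReturnNcvStat(ncvStat);

    compacted.resize(dstLen);
    values.swap(compacted);
    return NPPST_SUCCESS;
}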
1656 //==============================================================================
1657 //
1658 // Filter.cu
1659 //
1660 //==============================================================================
1661
1662
1663 texture <float, 1, cudaReadModeElementType> texSrc;
1664 texture <float, 1, cudaReadModeElementType> texKernel;
1665
1666
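// Both helpers below reflect out-of-range indices back into the valid range of the
// source row/column (mirror border). For example, with w == 5 an index of -2 maps to
// 1 - (-2) = 3 and an index of 6 maps to 5 + 5 - 6 - 1 = 3.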
1667 __forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
1668 int i,
1669 int w)
1670 {
1671 if (i < 0) i = 1 - i;
1672 if (i >= w) i = w + w - i - 1;
1673 return tex1Dfetch (texSrc, rowOffset + i);
1674 }
1675
1676
1677 __forceinline__ __device__ float getValueMirrorColumn(const int offset,
1678 const int rowStep,
1679 int j,
1680 int h)
1681 {
1682 if (j < 0) j = 1 - j;
1683 if (j >= h) j = h + h - j - 1;
1684 return tex1Dfetch (texSrc, offset + j * rowStep);
1685 }
1686
1687
1688 __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
1689 Ncv32f *pDst,
1690 NcvSize32u dstSize,
1691 Ncv32u dstStep,
1692 NcvRect32u roi,
1693 Ncv32s nKernelSize,
1694 Ncv32s nAnchor,
1695 Ncv32f multiplier)
1696 {
1697 // position within ROI
1698 const int ix = blockDim.x * blockIdx.x + threadIdx.x;
1699 const int iy = blockDim.y * blockIdx.y + threadIdx.y;
1700
1701 if (ix >= roi.width || iy >= roi.height)
1702 {
1703 return;
1704 }
1705
1706 const int p = nKernelSize - nAnchor - 1;
1707
1708 const int j = roi.y + iy;
1709
1710 const int rowOffset = j * srcStep + roi.x;
1711
1712 float sum = 0.0f;
1713 for (int m = 0; m < nKernelSize; ++m)
1714 {
1715 sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
1716 * tex1Dfetch (texKernel, m);
1717 }
1718
1719 pDst[iy * dstStep + ix] = sum * multiplier;
1720 }
1721
1722
1723 __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
1724 Ncv32f *pDst,
1725 NcvSize32u dstSize,
1726 Ncv32u dstStep,
1727 NcvRect32u roi,
1728 Ncv32s nKernelSize,
1729 Ncv32s nAnchor,
1730 Ncv32f multiplier)
1731 {
1732 const int ix = blockDim.x * blockIdx.x + threadIdx.x;
1733 const int iy = blockDim.y * blockIdx.y + threadIdx.y;
1734
1735 if (ix >= roi.width || iy >= roi.height)
1736 {
1737 return;
1738 }
1739
1740 const int p = nKernelSize - nAnchor - 1;
1741 const int i = roi.x + ix;
1742 const int offset = i + roi.y * srcStep;
1743
1744 float sum = 0.0f;
1745 for (int m = 0; m < nKernelSize; ++m)
1746 {
1747 sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height)
1748 * tex1Dfetch (texKernel, m);
1749 }
1750
1751 pDst[ix + iy * dstStep] = sum * multiplier;
1752 }
1753
1754
1755 NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
1756 NcvSize32u srcSize,
1757 Ncv32u nSrcStep,
1758 Ncv32f *pDst,
1759 NcvSize32u dstSize,
1760 Ncv32u nDstStep,
1761 NcvRect32u oROI,
1762 NppStBorderType borderType,
1763 const Ncv32f *pKernel,
1764 Ncv32s nKernelSize,
1765 Ncv32s nAnchor,
1766 Ncv32f multiplier)
1767 {
1768 ncvAssertReturn (pSrc != NULL &&
1769 pDst != NULL &&
1770 pKernel != NULL, NCV_NULL_PTR);
1771
1772 ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI);
1773
1774 ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
1775 dstSize.width * sizeof (Ncv32f) <= nDstStep &&
1776 oROI.width * sizeof (Ncv32f) <= nSrcStep &&
1777 oROI.width * sizeof (Ncv32f) <= nDstStep &&
1778 nSrcStep % sizeof (Ncv32f) == 0 &&
1779 nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP);
1780
1781 Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
1782 Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
1783
1784 // adjust ROI size to be within source image
1785 if (oROI.x + oROI.width > srcSize.width)
1786 {
1787 oROI.width = srcSize.width - oROI.x;
1788 }
1789
1790 if (oROI.y + oROI.height > srcSize.height)
1791 {
1792 oROI.height = srcSize.height - oROI.y;
1793 }
1794
1795 cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
1796 texSrc.normalized = false;
1797 texKernel.normalized = false;
1798
1799 cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
1800 cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
1801
1802 dim3 ctaSize (32, 6);
1803 dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
1804 (oROI.height + ctaSize.y - 1) / ctaSize.y);
1805
1806 switch (borderType)
1807 {
1808 case nppStBorderNone:
1809 return NPPST_ERROR;
1810 case nppStBorderClamp:
1811 return NPPST_ERROR;
1812 case nppStBorderWrap:
1813 return NPPST_ERROR;
1814 case nppStBorderMirror:
1815 FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
1816 (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
1817 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1818 break;
1819 default:
1820 return NPPST_ERROR;
1821 }
1822
1823 return NPPST_SUCCESS;
1824 }
1825
1826
1827 NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
1828 NcvSize32u srcSize,
1829 Ncv32u nSrcStep,
1830 Ncv32f *pDst,
1831 NcvSize32u dstSize,
1832 Ncv32u nDstStep,
1833 NcvRect32u oROI,
1834 NppStBorderType borderType,
1835 const Ncv32f *pKernel,
1836 Ncv32s nKernelSize,
1837 Ncv32s nAnchor,
1838 Ncv32f multiplier)
1839 {
1840 ncvAssertReturn (pSrc != NULL &&
1841 pDst != NULL &&
1842 pKernel != NULL, NCV_NULL_PTR);
1843
1844 ncvAssertReturn (oROI.width > 0 && oROI.height > 0, NPPST_INVALID_ROI);
1845
1846 ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
1847 dstSize.width * sizeof (Ncv32f) <= nDstStep &&
1848 oROI.width * sizeof (Ncv32f) <= nSrcStep &&
1849 oROI.width * sizeof (Ncv32f) <= nDstStep &&
1850 nSrcStep % sizeof (Ncv32f) == 0 &&
1851 nDstStep % sizeof (Ncv32f) == 0, NPPST_INVALID_STEP);
1852
1853 Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
1854 Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
1855
1856 // adjust ROI size to be within source image
1857 if (oROI.x + oROI.width > srcSize.width)
1858 {
1859 oROI.width = srcSize.width - oROI.x;
1860 }
1861
1862 if (oROI.y + oROI.height > srcSize.height)
1863 {
1864 oROI.height = srcSize.height - oROI.y;
1865 }
1866
1867 cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
1868 texSrc.normalized = false;
1869 texKernel.normalized = false;
1870
1871 cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
1872 cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
1873
1874 dim3 ctaSize (32, 6);
1875 dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
1876 (oROI.height + ctaSize.y - 1) / ctaSize.y);
1877
1878 switch (borderType)
1879 {
1880 case nppStBorderClamp:
1881 return NPPST_ERROR;
1882 case nppStBorderWrap:
1883 return NPPST_ERROR;
1884 case nppStBorderMirror:
1885 FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
1886 (srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
1887 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1888 break;
1889 default:
1890 return NPPST_ERROR;
1891 }
1892
1893 return NPPST_SUCCESS;
1894 }
1895
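// Illustrative sketch, not part of the original NPP_staging API: a separable 3x3 box
// smoothing built from the two mirror-border filters above. The device pointers, the
// step in bytes and the 3-tap kernel of ones (already resident in device memory) are
// assumptions of this example; nAnchor = 1 centres the 3-tap kernel and the 1/3
// multiplier turns the sums into averages.
static NCVStatus exampleSeparableBox3x3(const Ncv32f *d_src, NcvSize32u size, Ncv32u stepBytes,
                                        Ncv32f *d_tmp, Ncv32f *d_dst, const Ncv32f *d_kernelOnes3)
{
    NcvRect32u roi(0, 0, size.width, size.height);

    // horizontal pass: src -> tmp
    ncvAssertReturnNcvStat(nppiStFilterRowBorder_32f_C1R(d_src, size, stepBytes,
                                                         d_tmp, size, stepBytes, roi,
                                                         nppStBorderMirror, d_kernelOnes3,
                                                         3, 1, 1.0f / 3.0f));
    // vertical pass: tmp -> dst
    ncvAssertReturnNcvStat(nppiStFilterColumnBorder_32f_C1R(d_tmp, size, stepBytes,
                                                            d_dst, size, stepBytes, roi,
                                                            nppStBorderMirror, d_kernelOnes3,
                                                            3, 1, 1.0f / 3.0f));
    return NPPST_SUCCESS;
}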
1896
1897 //==============================================================================
1898 //
1899 // FrameInterpolate.cu
1900 //
1901 //==============================================================================
1902
1903
1904 inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
1905 {
1906 return (num + denom - 1)/denom;
1907 }
1908
1909
1910 texture<float, 2, cudaReadModeElementType> tex_src1;
1911 texture<float, 2, cudaReadModeElementType> tex_src0;
1912
1913
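// Blends the two input frames at intermediate time theta in (0, 1): a pixel visible in
// both frames is fetched from frame 0 backwards along the forward flow and from frame 1
// forwards along it, and the two samples are cross-faded with weights (1 - theta) and
// theta; a pixel visible in only one frame falls back to that frame alone.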
1914 __global__ void BlendFramesKernel(const float *u, const float *v, // forward flow
1915 const float *ur, const float *vr, // backward flow
1916 const float *o0, const float *o1, // coverage masks
1917 int w, int h, int s,
1918 float theta, float *out)
1919 {
1920 const int ix = threadIdx.x + blockDim.x * blockIdx.x;
1921 const int iy = threadIdx.y + blockDim.y * blockIdx.y;
1922
1923 const int pos = ix + s * iy;
1924
1925 if (ix >= w || iy >= h) return;
1926
1927 float _u = u[pos];
1928 float _v = v[pos];
1929
1930 float _ur = ur[pos];
1931 float _vr = vr[pos];
1932
1933 float x = (float)ix + 0.5f;
1934 float y = (float)iy + 0.5f;
1935 bool b0 = o0[pos] > 1e-4f;
1936 bool b1 = o1[pos] > 1e-4f;
1937
1938 if (b0 && b1)
1939 {
1940 // pixel is visible on both frames
1941 out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) +
1942 tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta;
1943 }
1944 else if (b0)
1945 {
1946 // visible on the first frame only
1947 out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
1948 }
1949 else
1950 {
1951 // visible on the second frame only
1952 out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
1953 }
1954 }
1955
1956
1957 NCVStatus BlendFrames(const Ncv32f *src0,
1958 const Ncv32f *src1,
1959 const Ncv32f *ufi,
1960 const Ncv32f *vfi,
1961 const Ncv32f *ubi,
1962 const Ncv32f *vbi,
1963 const Ncv32f *o1,
1964 const Ncv32f *o2,
1965 Ncv32u width,
1966 Ncv32u height,
1967 Ncv32u stride,
1968 Ncv32f theta,
1969 Ncv32f *out)
1970 {
1971 tex_src1.addressMode[0] = cudaAddressModeClamp;
1972 tex_src1.addressMode[1] = cudaAddressModeClamp;
1973 tex_src1.filterMode = cudaFilterModeLinear;
1974 tex_src1.normalized = false;
1975
1976 tex_src0.addressMode[0] = cudaAddressModeClamp;
1977 tex_src0.addressMode[1] = cudaAddressModeClamp;
1978 tex_src0.filterMode = cudaFilterModeLinear;
1979 tex_src0.normalized = false;
1980
1981 cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
1982 const Ncv32u pitch = stride * sizeof (float);
1983 ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
1984 ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
1985
1986 dim3 threads (32, 4);
1987 dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
1988
1989 BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
1990 (ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
1991
1992 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
1993
1994 return NPPST_SUCCESS;
1995 }
1996
1997
1998 NCVStatus nppiStGetInterpolationBufferSize(NcvSize32u srcSize,
1999 Ncv32u nStep,
2000 Ncv32u *hpSize)
2001 {
2002 NCVStatus status = NPPST_ERROR;
2003 status = nppiStVectorWarpGetBufferSize(srcSize, nStep, hpSize);
2004 return status;
2005 }
2006
2007
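// Frame interpolation pipeline: the forward flow (pFU, pFV) is forward-warped to the
// intermediate time pState->pos and the backward flow (pBU, pBV) to 1 - pos, producing
// warped flow fields and coverage masks in ppBuffers[0..5]; BlendFrames then mixes the
// two source frames using those fields to synthesize pNewFrame.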
2008 NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
2009 {
2010 // check state validity
2011 ncvAssertReturn (pState->pSrcFrame0 != 0 &&
2012 pState->pSrcFrame1 != 0 &&
2013 pState->pFU != 0 &&
2014 pState->pFV != 0 &&
2015 pState->pBU != 0 &&
2016 pState->pBV != 0 &&
2017 pState->pNewFrame != 0 &&
2018 pState->ppBuffers[0] != 0 &&
2019 pState->ppBuffers[1] != 0 &&
2020 pState->ppBuffers[2] != 0 &&
2021 pState->ppBuffers[3] != 0 &&
2022 pState->ppBuffers[4] != 0 &&
2023 pState->ppBuffers[5] != 0, NPPST_NULL_POINTER_ERROR);
2024
2025 ncvAssertReturn (pState->size.width > 0 &&
2026 pState->size.height > 0, NPPST_ERROR);
2027
2028 ncvAssertReturn (pState->nStep >= pState->size.width * sizeof (Ncv32f) &&
2029 pState->nStep > 0 &&
2030 pState->nStep % sizeof (Ncv32f) == 0,
2031 NPPST_INVALID_STEP);
2032
2033 // change notation
2034 Ncv32f *cov0 = pState->ppBuffers[0];
2035 Ncv32f *cov1 = pState->ppBuffers[1];
2036 Ncv32f *fwdU = pState->ppBuffers[2]; // forward u
2037 Ncv32f *fwdV = pState->ppBuffers[3]; // forward v
2038 Ncv32f *bwdU = pState->ppBuffers[4]; // backward u
2039 Ncv32f *bwdV = pState->ppBuffers[5]; // backward v
2040 // warp flow
2041 ncvAssertReturnNcvStat (
2042 nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFU,
2043 pState->size,
2044 pState->nStep,
2045 pState->pFU,
2046 pState->pFV,
2047 pState->nStep,
2048 cov0,
2049 pState->pos,
2050 fwdU) );
2051 ncvAssertReturnNcvStat (
2052 nppiStVectorWarp_PSF2x2_32f_C1 (pState->pFV,
2053 pState->size,
2054 pState->nStep,
2055 pState->pFU,
2056 pState->pFV,
2057 pState->nStep,
2058 cov0,
2059 pState->pos,
2060 fwdV) );
2061 // warp backward flow
2062 ncvAssertReturnNcvStat (
2063 nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBU,
2064 pState->size,
2065 pState->nStep,
2066 pState->pBU,
2067 pState->pBV,
2068 pState->nStep,
2069 cov1,
2070 1.0f - pState->pos,
2071 bwdU) );
2072 ncvAssertReturnNcvStat (
2073 nppiStVectorWarp_PSF2x2_32f_C1 (pState->pBV,
2074 pState->size,
2075 pState->nStep,
2076 pState->pBU,
2077 pState->pBV,
2078 pState->nStep,
2079 cov1,
2080 1.0f - pState->pos,
2081 bwdV) );
2082 // interpolate frame
2083 ncvAssertReturnNcvStat (
2084 BlendFrames (pState->pSrcFrame0,
2085 pState->pSrcFrame1,
2086 fwdU,
2087 fwdV,
2088 bwdU,
2089 bwdV,
2090 cov0,
2091 cov1,
2092 pState->size.width,
2093 pState->size.height,
2094 pState->nStep / sizeof (Ncv32f),
2095 pState->pos,
2096 pState->pNewFrame) );
2097
2098 return NPPST_SUCCESS;
2099 }
2100
2101
2102 //==============================================================================
2103 //
2104 // VectorWarpFrame.cu
2105 //
2106 //==============================================================================
2107
2108
2109 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
2110
2111 // FP32 atomic add
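// Devices older than compute capability 2.0 have no native atomicAdd for float, so the
// addition is emulated with a compare-and-swap loop: re-read the current value, try to
// install old + val, and retry until no other thread has modified the word in between.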
2112 static __forceinline__ __device__ float _atomicAdd(float *addr, float val)
2113 {
2114 float old = *addr, assumed;
2115
2116 do {
2117 assumed = old;
2118 old = int_as_float(__iAtomicCAS((int*)addr,
2119 float_as_int(assumed),
2120 float_as_int(val+assumed)));
2121 } while( assumed!=old );
2122
2123 return old;
2124 }
2125 #else
2126 #define _atomicAdd atomicAdd
2127 #endif
2128
2129
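// Forward warp with a 2x2 point-spread function: every source pixel is pushed along its
// flow vector (scaled by time_scale) and its value is splatted into the up to four
// destination pixels it overlaps, using bilinear weights. The same weights are
// accumulated into normalization_factor so that NormalizeKernel can later divide the
// accumulated values back into an average.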
2130 __global__ void ForwardWarpKernel_PSF2x2(const float *u,
2131 const float *v,
2132 const float *src,
2133 const int w,
2134 const int h,
2135 const int flow_stride,
2136 const int image_stride,
2137 const float time_scale,
2138 float *normalization_factor,
2139 float *dst)
2140 {
2141 int j = threadIdx.x + blockDim.x * blockIdx.x;
2142 int i = threadIdx.y + blockDim.y * blockIdx.y;
2143
2144 if (i >= h || j >= w) return;
2145
2146 int flow_row_offset = i * flow_stride;
2147 int image_row_offset = i * image_stride;
2148
2149 //bottom left corner of a target pixel
2150 float cx = u[flow_row_offset + j] * time_scale + (float)j + 1.0f;
2151 float cy = v[flow_row_offset + j] * time_scale + (float)i + 1.0f;
2152 // pixel containing bottom left corner
2153 float px;
2154 float py;
2155 float dx = modff (cx, &px);
2156 float dy = modff (cy, &py);
2157 // target pixel integer coords
2158 int tx;
2159 int ty;
2160 tx = (int) px;
2161 ty = (int) py;
2162 float value = src[image_row_offset + j];
2163 float weight;
2164 // fill pixel containing bottom right corner
2165 if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2166 {
2167 weight = dx * dy;
2168 _atomicAdd (dst + ty * image_stride + tx, value * weight);
2169 _atomicAdd (normalization_factor + ty * image_stride + tx, weight);
2170 }
2171
2172 // fill pixel containing bottom left corner
2173 tx -= 1;
2174 if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2175 {
2176 weight = (1.0f - dx) * dy;
2177 _atomicAdd (dst + ty * image_stride + tx, value * weight);
2178 _atomicAdd (normalization_factor + ty * image_stride + tx, weight);
2179 }
2180
2181 // fill pixel containing upper left corner
2182 ty -= 1;
2183 if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2184 {
2185 weight = (1.0f - dx) * (1.0f - dy);
2186 _atomicAdd (dst + ty * image_stride + tx, value * weight);
2187 _atomicAdd (normalization_factor + ty * image_stride + tx, weight);
2188 }
2189
2190 // fill pixel containing upper right corner
2191 tx += 1;
2192 if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2193 {
2194 weight = dx * (1.0f - dy);
2195 _atomicAdd (dst + ty * image_stride + tx, value * weight);
2196 _atomicAdd (normalization_factor + ty * image_stride + tx, weight);
2197 }
2198 }
2199
2200
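// Forward warp with a 1x1 point-spread function: the warped position is rounded to the
// nearest pixel and the source value is accumulated there directly, so no separate
// normalization pass is needed.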
2201 __global__ void ForwardWarpKernel_PSF1x1(const float *u,
2202 const float *v,
2203 const float *src,
2204 const int w,
2205 const int h,
2206 const int flow_stride,
2207 const int image_stride,
2208 const float time_scale,
2209 float *dst)
2210 {
2211 int j = threadIdx.x + blockDim.x * blockIdx.x;
2212 int i = threadIdx.y + blockDim.y * blockIdx.y;
2213
2214 if (i >= h || j >= w) return;
2215
2216 int flow_row_offset = i * flow_stride;
2217 int image_row_offset = i * image_stride;
2218
2219 float u_ = u[flow_row_offset + j];
2220 float v_ = v[flow_row_offset + j];
2221
2222 //bottom left corner of target pixel
2223 float cx = u_ * time_scale + (float)j + 1.0f;
2224 float cy = v_ * time_scale + (float)i + 1.0f;
2225 // pixel containing bottom left corner
2226 int tx = __float2int_rn (cx);
2227 int ty = __float2int_rn (cy);
2228
2229 float value = src[image_row_offset + j];
2230 // fill pixel
2231 if (!((tx >= w) || (tx < 0) || (ty >= h) || (ty < 0)))
2232 {
2233 _atomicAdd (dst + ty * image_stride + tx, value);
2234 }
2235 }
2236
2237
2238 __global__ void NormalizeKernel(const float *normalization_factor, int w, int h, int s, float *image)
2239 {
2240 int i = threadIdx.y + blockDim.y * blockIdx.y;
2241 int j = threadIdx.x + blockDim.x * blockIdx.x;
2242
2243 if (i >= h || j >= w) return;
2244
2245 const int pos = i * s + j;
2246
2247 float scale = normalization_factor[pos];
2248
2249 float invScale = (scale == 0.0f) ? 1.0f : (1.0f / scale);
2250
2251 image[pos] *= invScale;
2252 }
2253
2254
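// Fills a w x h float buffer with a constant value. Note that the buffer is addressed as
// tightly packed (pos = i * w + j), i.e. without a separate stride parameter.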
2255 __global__ void MemsetKernel(const float value, int w, int h, float *image)
2256 {
2257 int i = threadIdx.y + blockDim.y * blockIdx.y;
2258 int j = threadIdx.x + blockDim.x * blockIdx.x;
2259
2260 if (i >= h || j >= w) return;
2261
2262 const int pos = i * w + j;
2263
2264 image[pos] = value;
2265 }
2266
2267
2268 NCVStatus nppiStVectorWarpGetBufferSize (NcvSize32u srcSize, Ncv32u nSrcStep, Ncv32u *hpSize)
2269 {
2270 ncvAssertReturn (hpSize != NULL, NPPST_NULL_POINTER_ERROR);
2271 ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep,
2272 NPPST_INVALID_STEP);
2273
2274 *hpSize = nSrcStep * srcSize.height;
2275
2276 return NPPST_SUCCESS;
2277 }
2278
2279
2280 // does not require normalization
2281 NCVStatus nppiStVectorWarp_PSF1x1_32f_C1(const Ncv32f *pSrc,
2282 NcvSize32u srcSize,
2283 Ncv32u nSrcStep,
2284 const Ncv32f *pU,
2285 const Ncv32f *pV,
2286 Ncv32u nVFStep,
2287 Ncv32f timeScale,
2288 Ncv32f *pDst)
2289 {
2290 ncvAssertReturn (pSrc != NULL &&
2291 pU != NULL &&
2292 pV != NULL &&
2293 pDst != NULL, NPPST_NULL_POINTER_ERROR);
2294
2295 ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
2296 srcSize.width * sizeof (Ncv32f) <= nVFStep,
2297 NPPST_INVALID_STEP);
2298
2299 Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
2300 Ncv32u vfStep = nVFStep / sizeof (Ncv32f);
2301
2302 dim3 ctaSize (32, 6);
2303 dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
2304
2305 ForwardWarpKernel_PSF1x1 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
2306 (pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pDst);
2307
2308 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2309
2310 return NPPST_SUCCESS;
2311 }
2312
2313
2314 NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
2315 NcvSize32u srcSize,
2316 Ncv32u nSrcStep,
2317 const Ncv32f *pU,
2318 const Ncv32f *pV,
2319 Ncv32u nVFStep,
2320 Ncv32f *pBuffer,
2321 Ncv32f timeScale,
2322 Ncv32f *pDst)
2323 {
2324 ncvAssertReturn (pSrc != NULL &&
2325 pU != NULL &&
2326 pV != NULL &&
2327 pDst != NULL &&
2328 pBuffer != NULL, NPPST_NULL_POINTER_ERROR);
2329
2330 ncvAssertReturn (srcSize.width * sizeof (Ncv32f) <= nSrcStep &&
2331 srcSize.width * sizeof (Ncv32f) <= nVFStep, NPPST_INVALID_STEP);
2332
2333 Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
2334 Ncv32u vfStep = nVFStep / sizeof(Ncv32f);
2335
2336 dim3 ctaSize(32, 6);
2337 dim3 gridSize (iDivUp (srcSize.width, ctaSize.x), iDivUp (srcSize.height, ctaSize.y));
2338
2339 MemsetKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
2340 (0, srcSize.width, srcSize.height, pBuffer);
2341
2342 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2343
2344 ForwardWarpKernel_PSF2x2 <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
2345 (pU, pV, pSrc, srcSize.width, srcSize.height, vfStep, srcStep, timeScale, pBuffer, pDst);
2346
2347 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2348
2349 NormalizeKernel <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream()>>>
2350 (pBuffer, srcSize.width, srcSize.height, srcStep, pDst);
2351
2352 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2353
2354 return NPPST_SUCCESS;
2355 }
2356
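// Illustrative sketch, not part of the original NPP_staging API: forward-warping a single
// 32f frame along a flow field with the PSF2x2 path above. The device pointers and the
// use of plain cudaMalloc for the normalization buffer are assumptions of this example;
// the buffer is sized with nppiStVectorWarpGetBufferSize and steps are in bytes.
static NCVStatus exampleForwardWarpFrame(const Ncv32f *d_frame, NcvSize32u size, Ncv32u stepBytes,
                                         const Ncv32f *d_u, const Ncv32f *d_v, Ncv32u flowStepBytes,
                                         Ncv32f timePos, Ncv32f *d_out)
{
    Ncv32u bufSizeBytes = 0;
    ncvAssertReturnNcvStat(nppiStVectorWarpGetBufferSize(size, stepBytes, &bufSizeBytes));

    Ncv32f *d_buffer = NULL;
    ncvAssertCUDAReturn(cudaMalloc((void **)&d_buffer, bufSizeBytes), NPPST_MEM_INTERNAL_ERROR);

    NCVStatus ncvStat = nppiStVectorWarp_PSF2x2_32f_C1(d_frame, size, stepBytes,
                                                       d_u, d_v, flowStepBytes,
                                                       d_buffer, timePos, d_out);
    cudaFree(d_buffer);
    return ncvStat;
}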
2357
2358 //==============================================================================
2359 //
2360 // Resize.cu
2361 //
2362 //==============================================================================
2363
2364
2365 texture <float, 2, cudaReadModeElementType> texSrc2D;
2366
2367
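// Accumulates one texture row of the supersampling footprint: the two fractional border
// samples are weighted by their coverage (1 - xmin + fxmin on the left, cxmax - xmax on
// the right), interior samples count with weight 1, and the weighted sum is divided by
// the total weight.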
2368 __forceinline__
2369 __device__ float processLine(int spos,
2370 float xmin,
2371 float xmax,
2372 int ixmin,
2373 int ixmax,
2374 float fxmin,
2375 float cxmax)
2376 {
2377 // first element
2378 float wsum = 1.0f - xmin + fxmin;
2379 float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin);
2380 spos++;
2381 for (int ix = ixmin + 1; ix < ixmax; ++ix)
2382 {
2383 sum += tex1Dfetch(texSrc, spos);
2384 spos++;
2385 wsum += 1.0f;
2386 }
2387 sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax);
2388 wsum += cxmax - xmax;
2389 return sum / wsum;
2390 }
2391
2392
2393 __global__ void resizeSuperSample_32f(NcvSize32u srcSize,
2394 Ncv32u srcStep,
2395 NcvRect32u srcROI,
2396 Ncv32f *dst,
2397 NcvSize32u dstSize,
2398 Ncv32u dstStep,
2399 NcvRect32u dstROI,
2400 Ncv32f scaleX,
2401 Ncv32f scaleY)
2402 {
2403 // position within dst ROI
2404 const int ix = blockIdx.x * blockDim.x + threadIdx.x;
2405 const int iy = blockIdx.y * blockDim.y + threadIdx.y;
2406
2407 if (ix >= dstROI.width || iy >= dstROI.height)
2408 {
2409 return;
2410 }
2411
2412 float rw = (float) srcROI.width;
2413 float rh = (float) srcROI.height;
2414
2415 // source position
2416 float x = scaleX * (float) ix;
2417 float y = scaleY * (float) iy;
2418
2419 // x sampling range
2420 float xBegin = fmax (x - scaleX, 0.0f);
2421 float xEnd = fmin (x + scaleX, rw - 1.0f);
2422 // y sampling range
2423 float yBegin = fmax (y - scaleY, 0.0f);
2424 float yEnd = fmin (y + scaleY, rh - 1.0f);
2425 // x range of source samples
2426 float floorXBegin = floorf (xBegin);
2427 float ceilXEnd = ceilf (xEnd);
2428 int iXBegin = srcROI.x + (int) floorXBegin;
2429 int iXEnd = srcROI.x + (int) ceilXEnd;
2430 // y range of source samples
2431 float floorYBegin = floorf (yBegin);
2432 float ceilYEnd = ceilf (yEnd);
2433 int iYBegin = srcROI.y + (int) floorYBegin;
2434 int iYEnd = srcROI.y + (int) ceilYEnd;
2435
2436 // first row
2437 int pos = iYBegin * srcStep + iXBegin;
2438
2439 float wsum = 1.0f - yBegin + floorYBegin;
2440
2441 float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
2442 ceilXEnd) * (1.0f - yBegin + floorYBegin);
2443 pos += srcStep;
2444 for (int iyRow = iYBegin + 1; iyRow < iYEnd; ++iyRow)
2445 {
2446 sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
2447 ceilXEnd);
2448 pos += srcStep;
2449 wsum += 1.0f;
2450 }
2451
2452 sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
2453 ceilXEnd) * (ceilYEnd - yEnd);
2454 wsum += ceilYEnd - yEnd;
2455 sum /= wsum;
2456
2457 dst[(ix + dstROI.x) + (iy + dstROI.y) * dstStep] = sum;
2458 }
2459
2460
2461 // bicubic interpolation
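// Cubic convolution weight (Keys kernel with a = -0.5, i.e. Catmull-Rom):
//   w(x) =  1.5|x|^3 - 2.5|x|^2 + 1           for |x| <= 1
//   w(x) = -0.5|x|^3 + 2.5|x|^2 - 4|x| + 2    for 1 < |x| < 2
//   w(x) =  0                                 otherwise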
2462 __forceinline__
2463 __device__ float bicubicCoeff(float x_)
2464 {
2465 float x = fabsf(x_);
2466 if (x <= 1.0f)
2467 {
2468 return x * x * (1.5f * x - 2.5f) + 1.0f;
2469 }
2470 else if (x < 2.0f)
2471 {
2472 return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
2473 }
2474 else
2475 {
2476 return 0.0f;
2477 }
2478 }
2479
2480
2481 __global__ void resizeBicubic(NcvSize32u srcSize,
2482 NcvRect32u srcROI,
2483 NcvSize32u dstSize,
2484 Ncv32u dstStep,
2485 Ncv32f *dst,
2486 NcvRect32u dstROI,
2487 Ncv32f scaleX,
2488 Ncv32f scaleY)
2489 {
2490 const int ix = blockIdx.x * blockDim.x + threadIdx.x;
2491 const int iy = blockIdx.y * blockDim.y + threadIdx.y;
2492
2493 if (ix >= dstROI.width || iy >= dstROI.height)
2494 {
2495 return;
2496 }
2497
2498 const float dx = 1.0f / srcROI.width;
2499 const float dy = 1.0f / srcROI.height;
2500
2501 float rx = (float) srcROI.x;
2502 float ry = (float) srcROI.y;
2503
2504 float rw = (float) srcROI.width;
2505 float rh = (float) srcROI.height;
2506
2507 float x = scaleX * (float) ix;
2508 float y = scaleY * (float) iy;
2509
2510 // sampling range, clamped to the source ROI
2511 // (so the effective border handling inside the kernel is clamp)
2512 float xmin = fmax (ceilf (x - 2.0f), 0.0f);
2513 float xmax = fmin (floorf (x + 2.0f), rw - 1.0f);
2514
2515 float ymin = fmax (ceilf (y - 2.0f), 0.0f);
2516 float ymax = fmin (floorf (y + 2.0f), rh - 1.0f);
2517
2518 // shift data window to match ROI
2519 rx += 0.5f;
2520 ry += 0.5f;
2521
2522 x += rx;
2523 y += ry;
2524
2525 xmin += rx;
2526 xmax += rx;
2527 ymin += ry;
2528 ymax += ry;
2529
2530 float sum = 0.0f;
2531 float wsum = 0.0f;
2532
2533 for (float cy = ymin; cy <= ymax; cy += 1.0f)
2534 {
2535 for (float cx = xmin; cx <= xmax; cx += 1.0f)
2536 {
2537 float xDist = x - cx;
2538 float yDist = y - cy;
2539 float wx = bicubicCoeff (xDist);
2540 float wy = bicubicCoeff (yDist);
2541 wx *= wy;
2542 sum += wx * tex2D (texSrc2D, cx * dx, cy * dy);
2543 wsum += wx;
2544 }
2545 }
2546 dst[(ix + dstROI.x)+ (iy + dstROI.y) * dstStep] = (!wsum)? 0 : sum / wsum;
2547 }
2548
2549
2550 NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
2551 NcvSize32u srcSize,
2552 Ncv32u nSrcStep,
2553 NcvRect32u srcROI,
2554 Ncv32f *pDst,
2555 NcvSize32u dstSize,
2556 Ncv32u nDstStep,
2557 NcvRect32u dstROI,
2558 Ncv32f xFactor,
2559 Ncv32f yFactor,
2560 NppStInterpMode interpolation)
2561 {
2562 NCVStatus status = NPPST_SUCCESS;
2563
2564 ncvAssertReturn (pSrc != NULL && pDst != NULL, NPPST_NULL_POINTER_ERROR);
2565 ncvAssertReturn (xFactor != 0.0 && yFactor != 0.0, NPPST_INVALID_SCALE);
2566
2567 ncvAssertReturn (nSrcStep >= sizeof (Ncv32f) * (Ncv32u) srcSize.width &&
2568 nDstStep >= sizeof (Ncv32f) * (Ncv32u) dstSize.width,
2569 NPPST_INVALID_STEP);
2570
2571 Ncv32u srcStep = nSrcStep / sizeof (Ncv32f);
2572 Ncv32u dstStep = nDstStep / sizeof (Ncv32f);
2573
2574 // TODO: preprocess ROI to prevent out of bounds access
2575
2576 if (interpolation == nppStSupersample)
2577 {
2578 // bind texture
2579 cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
2580 // invoke kernel
2581 dim3 ctaSize (32, 6);
2582 dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x,
2583 (dstROI.height + ctaSize.y - 1) / ctaSize.y);
2584
2585 resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
2586 (srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
2587 }
2588 else if (interpolation == nppStBicubic)
2589 {
2590 texSrc2D.addressMode[0] = cudaAddressModeMirror;
2591 texSrc2D.addressMode[1] = cudaAddressModeMirror;
2592 texSrc2D.normalized = true;
2593
2594 cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
2595
2596 cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
2597 nSrcStep);
2598
2599 dim3 ctaSize (32, 6);
2600 dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x,
2601 (dstSize.height + ctaSize.y - 1) / ctaSize.y);
2602
2603 resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
2604 (srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
2605 }
2606 else
2607 {
2608 status = NPPST_ERROR;
2609 }
2610
2611 ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
2612
2613 return status;
2614 }
2615
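// Illustrative sketch, not part of the original NPP_staging API: shrinking a tightly
// packed single-channel 32f image to half its size with the supersampling path above.
// The device pointers and the packed steps (width * sizeof(Ncv32f) bytes) are
// assumptions of this example.
static NCVStatus exampleHalfSizeResize(const Ncv32f *d_src, NcvSize32u srcSize, Ncv32f *d_dst)
{
    NcvSize32u dstSize(srcSize.width / 2, srcSize.height / 2);

    NcvRect32u srcROI(0, 0, srcSize.width, srcSize.height);
    NcvRect32u dstROI(0, 0, dstSize.width, dstSize.height);

    return nppiStResize_32f_C1R(d_src, srcSize, (Ncv32u)(srcSize.width * sizeof(Ncv32f)), srcROI,
                                d_dst, dstSize, (Ncv32u)(dstSize.width * sizeof(Ncv32f)), dstROI,
                                0.5f /*xFactor*/, 0.5f /*yFactor*/, nppStSupersample);
}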
2616 #endif /* CUDA_DISABLER */
2617