/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include <stdio.h>
#include <cuda_runtime.h>

#include "opencv2/core/cuda/common.hpp"

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NCVPyramid.hpp"

#include "NCVAlg.hpp"
#include "NCVPixelOperations.hpp"

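// Channel-count dispatch for averaging a 2x2 block of pixels. The generic integer
// specializations accumulate each channel in Ncv32s and add 2 before dividing by 4
// (round to nearest); the float/double specializations average without rounding.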
template<typename T, Ncv32u CN> struct __average4_CN {static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);};

template<typename T> struct __average4_CN<T, 1> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float1, 1> {
static __host__ __device__ float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11)
{
    float1 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    return out;
}};

template<> struct __average4_CN<double1, 1> {
static __host__ __device__ double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11)
{
    double1 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    return out;
}};

template<typename T> struct __average4_CN<T, 3> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
    out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float3, 3> {
static __host__ __device__ float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11)
{
    float3 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    return out;
}};

template<> struct __average4_CN<double3, 3> {
static __host__ __device__ double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11)
{
    double3 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    return out;
}};

template<typename T> struct __average4_CN<T, 4> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
    out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
    out.w = ((Ncv32s)p00.w + p01.w + p10.w + p11.w + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float4, 4> {
static __host__ __device__ float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11)
{
    float4 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
    return out;
}};

template<> struct __average4_CN<double4, 4> {
static __host__ __device__ double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11)
{
    double4 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
    return out;
}};

template<typename T> static __host__ __device__ T _average4(const T &p00, const T &p01, const T &p10, const T &p11)
{
    return __average4_CN<T, NC(T)>::_average4_CN(p00, p01, p10, p11);
}

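// Channel-count dispatch for linear interpolation between two pixels: each channel
// is computed as b*d + a*(1-d) and converted to the base type of the output pixel Tout.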
template<typename Tin, typename Tout, Ncv32u CN> struct __lerp_CN {static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)));
}};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 3> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)),
                    TB(b.y * d + a.y * (1 - d)),
                    TB(b.z * d + a.z * (1 - d)));
}};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 4> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)),
                    TB(b.y * d + a.y * (1 - d)),
                    TB(b.z * d + a.z * (1 - d)),
                    TB(b.w * d + a.w * (1 - d)));
}};

template<typename Tin, typename Tout> static __host__ __device__ Tout _lerp(const Tin &a, const Tin &b, Ncv32f d)
{
    return __lerp_CN<Tin, Tout, NC(Tin)>::_lerp_CN(a, b, d);
}

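// Downsamples the source image by a factor of two: one thread per destination pixel,
// each averaging the corresponding 2x2 block of source pixels with _average4.
// Pitches are given in bytes, so row pointers are computed via Ncv8u arithmetic.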
template<typename T>
__global__ void kernelDownsampleX2(T *d_src,
                                   Ncv32u srcPitch,
                                   T *d_dst,
                                   Ncv32u dstPitch,
                                   NcvSize32u dstRoi)
{
    Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
    Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < dstRoi.height && j < dstRoi.width)
    {
        T *d_src_line1 = (T *)((Ncv8u *)d_src + (2 * i + 0) * srcPitch);
        T *d_src_line2 = (T *)((Ncv8u *)d_src + (2 * i + 1) * srcPitch);
        T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);

        T p00 = d_src_line1[2*j+0];
        T p01 = d_src_line1[2*j+1];
        T p10 = d_src_line2[2*j+0];
        T p11 = d_src_line2[2*j+1];

        d_dst_line[j] = _average4(p00, p01, p10, p11);
    }
}

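// Host-side wrappers: the *_gpu launchers use 16x8 thread blocks and synchronize when
// called on the default stream. downsampleX2 dispatches through a [depth][channels]
// table (rows CV_8U..CV_32F); entries left as 0 are unsupported and fail the CV_Assert.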
namespace cv { namespace cuda { namespace device
{
    namespace pyramid
    {
        template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(src.cols, bDim.x), divUp(src.rows, bDim.y));

            kernelDownsampleX2<<<gDim, bDim, 0, stream>>>((T*)src.data, static_cast<Ncv32u>(src.step),
                (T*)dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void downsampleX2(PtrStepSzb src, PtrStepSzb dst, int depth, int cn, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

            static const func_t funcs[6][4] =
            {
                {kernelDownsampleX2_gpu<uchar1>       , 0 /*kernelDownsampleX2_gpu<uchar2>*/ , kernelDownsampleX2_gpu<uchar3>      , kernelDownsampleX2_gpu<uchar4>      },
                {0 /*kernelDownsampleX2_gpu<char1>*/  , 0 /*kernelDownsampleX2_gpu<char2>*/  , 0 /*kernelDownsampleX2_gpu<char3>*/ , 0 /*kernelDownsampleX2_gpu<char4>*/ },
                {kernelDownsampleX2_gpu<ushort1>      , 0 /*kernelDownsampleX2_gpu<ushort2>*/, kernelDownsampleX2_gpu<ushort3>     , kernelDownsampleX2_gpu<ushort4>     },
                {0 /*kernelDownsampleX2_gpu<short1>*/ , 0 /*kernelDownsampleX2_gpu<short2>*/ , 0 /*kernelDownsampleX2_gpu<short3>*/, 0 /*kernelDownsampleX2_gpu<short4>*/},
                {0 /*kernelDownsampleX2_gpu<int1>*/   , 0 /*kernelDownsampleX2_gpu<int2>*/   , 0 /*kernelDownsampleX2_gpu<int3>*/  , 0 /*kernelDownsampleX2_gpu<int4>*/  },
                {kernelDownsampleX2_gpu<float1>       , 0 /*kernelDownsampleX2_gpu<float2>*/ , kernelDownsampleX2_gpu<float3>      , kernelDownsampleX2_gpu<float4>      }
            };

            const func_t func = funcs[depth][cn - 1];
            CV_Assert(func != 0);

            func(src, dst, stream);
        }
    }
}}}


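// Resamples the top (reference) layer to an arbitrary destination ROI with bilinear
// interpolation: destination corners map to source corners, samples that would fall
// outside the source ROI are clamped to the nearest valid pixel, and the floating-point
// mixture is demoted back to T with _pixDemoteClampZ.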
template<typename T>
__global__ void kernelInterpolateFrom1(T *d_srcTop,
                                       Ncv32u srcTopPitch,
                                       NcvSize32u szTopRoi,
                                       T *d_dst,
                                       Ncv32u dstPitch,
                                       NcvSize32u dstRoi)
{
    Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
    Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < dstRoi.height && j < dstRoi.width)
    {
        Ncv32f ptTopX = 1.0f * (szTopRoi.width - 1) * j / (dstRoi.width - 1);
        Ncv32f ptTopY = 1.0f * (szTopRoi.height - 1) * i / (dstRoi.height - 1);
        Ncv32u xl = (Ncv32u)ptTopX;
        Ncv32u xh = xl+1;
        Ncv32f dx = ptTopX - xl;
        Ncv32u yl = (Ncv32u)ptTopY;
        Ncv32u yh = yl+1;
        Ncv32f dy = ptTopY - yl;

        T *d_src_line1 = (T *)((Ncv8u *)d_srcTop + yl * srcTopPitch);
        T *d_src_line2 = (T *)((Ncv8u *)d_srcTop + yh * srcTopPitch);
        T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);

        T p00, p01, p10, p11;
        p00 = d_src_line1[xl];
        p01 = xh < szTopRoi.width ? d_src_line1[xh] : p00;
        p10 = yh < szTopRoi.height ? d_src_line2[xl] : p00;
        p11 = (xh < szTopRoi.width && yh < szTopRoi.height) ? d_src_line2[xh] : p00;
        typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
        TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
        TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
        TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
        T outPix = _pixDemoteClampZ<TVFlt, T>(mixture);

        d_dst_line[j] = outPix;
    }
}
namespace cv { namespace cuda { namespace device
{
    namespace pyramid
    {
        template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(dst.cols, bDim.x), divUp(dst.rows, bDim.y));

            kernelInterpolateFrom1<<<gDim, bDim, 0, stream>>>((T*) src.data, static_cast<Ncv32u>(src.step), NcvSize32u(src.cols, src.rows),
                (T*) dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void interpolateFrom1(PtrStepSzb src, PtrStepSzb dst, int depth, int cn, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

            static const func_t funcs[6][4] =
            {
                {kernelInterpolateFrom1_gpu<uchar1>      , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3>      , kernelInterpolateFrom1_gpu<uchar4>      },
                {0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/  , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
                {kernelInterpolateFrom1_gpu<ushort1>     , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3>     , kernelInterpolateFrom1_gpu<ushort4>     },
                {0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
                {0 /*kernelInterpolateFrom1_gpu<int1>*/  , 0 /*kernelInterpolateFrom1_gpu<int2>*/   , 0 /*kernelInterpolateFrom1_gpu<int3>*/  , 0 /*kernelInterpolateFrom1_gpu<int4>*/  },
                {kernelInterpolateFrom1_gpu<float1>      , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3>      , kernelInterpolateFrom1_gpu<float4>      }
            };

            const func_t func = funcs[depth][cn - 1];
            CV_Assert(func != 0);

            func(src, dst, stream);
        }
    }
}}}
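
/* Illustrative host-side sketch (not part of this file), assuming the caller already has
 * device images as cv::cuda::GpuMat (which converts implicitly to PtrStepSzb) and an
 * 8-bit, 3-channel input; the variable names below are hypothetical:
 *
 *   cv::cuda::GpuMat img;                                    // CV_8UC3, already on the device
 *   cv::cuda::GpuMat half(img.rows / 2, img.cols / 2, img.type());
 *   cv::cuda::GpuMat restored(img.size(), img.type());
 *   cudaStream_t s = 0;                                      // default stream: the wrappers synchronize
 *
 *   cv::cuda::device::pyramid::downsampleX2(img, half, img.depth(), img.channels(), s);
 *   cv::cuda::device::pyramid::interpolateFrom1(half, restored, img.depth(), img.channels(), s);
 */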


#if 0 //def _WIN32

template<typename T>
static T _interpLinear(const T &a, const T &b, Ncv32f d)
{
    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
    TVFlt tmp = _lerp<T, TVFlt>(a, b, d);
    return _pixDemoteClampZ<TVFlt, T>(tmp);
}


template<typename T>
static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
{
    Ncv32u xl = (Ncv32u)x;
    Ncv32u xh = xl+1;
    Ncv32f dx = x - xl;
    Ncv32u yl = (Ncv32u)y;
    Ncv32u yh = yl+1;
    Ncv32f dy = y - yl;
    T p00, p01, p10, p11;
    p00 = refLayer.at(xl, yl);
    p01 = xh < refLayer.width() ? refLayer.at(xh, yl) : p00;
    p10 = yh < refLayer.height() ? refLayer.at(xl, yh) : p00;
    p11 = (xh < refLayer.width() && yh < refLayer.height()) ? refLayer.at(xh, yh) : p00;
    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
    TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
    TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
    TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
    return _pixDemoteClampZ<TVFlt, T>(mixture);
}

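// Builds the pyramid by repeatedly halving the previous layer (2x2 averaging) until either
// numLayers levels exist or a dimension reaches zero. On device memory it runs
// kernelDownsampleX2, otherwise the equivalent CPU loop; SELF_CHECK_GPU cross-checks the
// kernel output against the CPU reference.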
template <class T>
NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
                                    Ncv8u numLayers,
                                    INCVMemAllocator &alloc,
                                    cudaStream_t cuStream)
{
    this->_isInitialized = false;
    ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid::ctor error", );

    this->layer0 = &img;
    NcvSize32u szLastLayer(img.width(), img.height());
    this->nLayers = 1;

    NCV_SET_SKIP_COND(alloc.isCounting());
    NcvBool bDeviceCode = alloc.memType() == NCVMemoryTypeDevice;

    if (numLayers == 0)
    {
        numLayers = 255; //the loop below stops once any dimension drops below 1
    }

#ifdef SELF_CHECK_GPU
    NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif

    for (Ncv32u i=0; i<(Ncv32u)numLayers-1; i++)
    {
        NcvSize32u szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
        if (szCurLayer.width == 0 || szCurLayer.height == 0)
        {
            break;
        }

        this->pyramid.push_back(new NCVMatrixAlloc<T>(alloc, szCurLayer.width, szCurLayer.height));
        ncvAssertPrintReturn(((NCVMatrixAlloc<T> *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid::ctor error", );
        this->nLayers++;

        //fill in the layer
        NCV_SKIP_COND_BEGIN

        const NCVMatrix<T> *prevLayer = i == 0 ? this->layer0 : this->pyramid[i-1];
        NCVMatrix<T> *curLayer = this->pyramid[i];

        if (bDeviceCode)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(szCurLayer.width, bDim.x), divUp(szCurLayer.height, bDim.y));
            kernelDownsampleX2<<<gDim, bDim, 0, cuStream>>>(prevLayer->ptr(),
                                                            prevLayer->pitch(),
                                                            curLayer->ptr(),
                                                            curLayer->pitch(),
                                                            szCurLayer);
            ncvAssertPrintReturn(cudaSuccess == cudaGetLastError(), "NCVImagePyramid::ctor error", );

#ifdef SELF_CHECK_GPU
            NCVMatrixAlloc<T> h_prevLayer(allocCPU, prevLayer->width(), prevLayer->height());
            ncvAssertPrintReturn(h_prevLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
            NCVMatrixAlloc<T> h_curLayer(allocCPU, curLayer->width(), curLayer->height());
            ncvAssertPrintReturn(h_curLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(NCV_SUCCESS == prevLayer->copy2D(h_prevLayer, prevLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(NCV_SUCCESS == curLayer->copy2D(h_curLayer, curLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(cudaSuccess == cudaStreamSynchronize(cuStream), "Validation failure in NCVImagePyramid::ctor", );
            for (Ncv32u i=0; i<szCurLayer.height; i++)
            {
                for (Ncv32u j=0; j<szCurLayer.width; j++)
                {
                    T p00 = h_prevLayer.at(2*j+0, 2*i+0);
                    T p01 = h_prevLayer.at(2*j+1, 2*i+0);
                    T p10 = h_prevLayer.at(2*j+0, 2*i+1);
                    T p11 = h_prevLayer.at(2*j+1, 2*i+1);
                    T outGold = _average4(p00, p01, p10, p11);
                    T outGPU = h_curLayer.at(j, i);
                    ncvAssertPrintReturn(0 == memcmp(&outGold, &outGPU, sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelDownsampleX2", );
                }
            }
#endif
        }
        else
        {
            for (Ncv32u i=0; i<szCurLayer.height; i++)
            {
                for (Ncv32u j=0; j<szCurLayer.width; j++)
                {
                    T p00 = prevLayer->at(2*j+0, 2*i+0);
                    T p01 = prevLayer->at(2*j+1, 2*i+0);
                    T p10 = prevLayer->at(2*j+0, 2*i+1);
                    T p11 = prevLayer->at(2*j+1, 2*i+1);
                    curLayer->at(j, i) = _average4(p00, p01, p10, p11);
                }
            }
        }

        NCV_SKIP_COND_END

        szLastLayer = szCurLayer;
    }

    this->_isInitialized = true;
}


template <class T>
NCVImagePyramid<T>::~NCVImagePyramid()
{
}


template <class T>
NcvBool NCVImagePyramid<T>::isInitialized() const
{
    return this->_isInitialized;
}


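// Returns the pyramid resampled to outRoi: an exact-size match is copied directly;
// otherwise the nearest larger layer is bilinearly interpolated down to outRoi, and on the
// CPU path bTrilinear additionally blends it with the next-smaller layer. The device path
// only supports the single-reference case (bUse2Refs must be false).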
template <class T>
NCVStatus NCVImagePyramid<T>::getLayer(NCVMatrix<T> &outImg,
                                       NcvSize32u outRoi,
                                       NcvBool bTrilinear,
                                       cudaStream_t cuStream) const
{
    ncvAssertReturn(this->isInitialized(), NCV_UNKNOWN_ERROR);
    ncvAssertReturn(outImg.memType() == this->layer0->memType(), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(outRoi.width <= this->layer0->width() && outRoi.height <= this->layer0->height() &&
                    outRoi.width > 0 && outRoi.height > 0, NCV_DIMENSIONS_INVALID);

    if (outRoi.width == this->layer0->width() && outRoi.height == this->layer0->height())
    {
        ncvAssertReturnNcvStat(this->layer0->copy2D(outImg, NcvSize32u(this->layer0->width(), this->layer0->height()), cuStream));
        return NCV_SUCCESS;
    }

    Ncv32f lastScale = 1.0f;
    Ncv32f curScale;
    const NCVMatrix<T> *lastLayer = this->layer0;
    const NCVMatrix<T> *curLayer = NULL;
    NcvBool bUse2Refs = false;

    for (Ncv32u i=0; i<this->nLayers-1; i++)
    {
        curScale = lastScale * 0.5f;
        curLayer = this->pyramid[i];

        if (outRoi.width == curLayer->width() && outRoi.height == curLayer->height())
        {
            ncvAssertReturnNcvStat(this->pyramid[i]->copy2D(outImg, NcvSize32u(this->pyramid[i]->width(), this->pyramid[i]->height()), cuStream));
            return NCV_SUCCESS;
        }

        if (outRoi.width >= curLayer->width() && outRoi.height >= curLayer->height())
        {
            if (outRoi.width < lastLayer->width() && outRoi.height < lastLayer->height())
            {
                bUse2Refs = true;
            }
            break;
        }

        lastScale = curScale;
        lastLayer = curLayer;
    }

    bUse2Refs = bUse2Refs && bTrilinear;

    NCV_SET_SKIP_COND(outImg.memType() == NCVMemoryTypeNone);
    NcvBool bDeviceCode = this->layer0->memType() == NCVMemoryTypeDevice;

#ifdef SELF_CHECK_GPU
    NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif

    NCV_SKIP_COND_BEGIN

    if (bDeviceCode)
    {
        ncvAssertReturn(bUse2Refs == false, NCV_NOT_IMPLEMENTED);

        dim3 bDim(16, 8);
        dim3 gDim(divUp(outRoi.width, bDim.x), divUp(outRoi.height, bDim.y));
        kernelInterpolateFrom1<<<gDim, bDim, 0, cuStream>>>(lastLayer->ptr(),
                                                            lastLayer->pitch(),
                                                            lastLayer->size(),
                                                            outImg.ptr(),
                                                            outImg.pitch(),
                                                            outRoi);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

#ifdef SELF_CHECK_GPU
        ncvSafeMatAlloc(h_lastLayer, T, allocCPU, lastLayer->width(), lastLayer->height(), NCV_ALLOCATOR_BAD_ALLOC);
        ncvSafeMatAlloc(h_outImg, T, allocCPU, outImg.width(), outImg.height(), NCV_ALLOCATOR_BAD_ALLOC);
        ncvAssertReturnNcvStat(lastLayer->copy2D(h_lastLayer, lastLayer->size(), cuStream));
        ncvAssertReturnNcvStat(outImg.copy2D(h_outImg, outRoi, cuStream));
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

        for (Ncv32u i=0; i<outRoi.height; i++)
        {
            for (Ncv32u j=0; j<outRoi.width; j++)
            {
                NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
                Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
                Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
                T outGold = _interpBilinear(h_lastLayer, ptTopX, ptTopY);
                ncvAssertPrintReturn(0 == memcmp(&outGold, &h_outImg.at(j,i), sizeof(T)), "Validation failure in NCVImagePyramid::getLayer with kernelInterpolateFrom1", NCV_UNKNOWN_ERROR);
            }
        }
#endif
    }
    else
    {
        for (Ncv32u i=0; i<outRoi.height; i++)
        {
            for (Ncv32u j=0; j<outRoi.width; j++)
            {
                //top layer pixel (always exists)
                NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
                Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
                Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
                T topPix = _interpBilinear(*lastLayer, ptTopX, ptTopY);
                T trilinearPix = topPix;

                if (bUse2Refs)
                {
                    //bottom layer pixel (exists only if the requested scale is greater than the smallest layer scale)
                    NcvSize32u szBottomLayer(curLayer->width(), curLayer->height());
                    Ncv32f ptBottomX = 1.0f * (szBottomLayer.width - 1) * j / (outRoi.width - 1);
                    Ncv32f ptBottomY = 1.0f * (szBottomLayer.height - 1) * i / (outRoi.height - 1);
                    T bottomPix = _interpBilinear(*curLayer, ptBottomX, ptBottomY);

                    Ncv32f scale = (1.0f * outRoi.width / layer0->width() + 1.0f * outRoi.height / layer0->height()) / 2;
                    Ncv32f dl = (scale - curScale) / (lastScale - curScale);
                    dl = CLAMP(dl, 0.0f, 1.0f);
                    trilinearPix = _interpLinear(bottomPix, topPix, dl);
                }

                outImg.at(j, i) = trilinearPix;
            }
        }
    }

    NCV_SKIP_COND_END

    return NCV_SUCCESS;
}


template class NCVImagePyramid<uchar1>;
template class NCVImagePyramid<uchar3>;
template class NCVImagePyramid<uchar4>;
template class NCVImagePyramid<ushort1>;
template class NCVImagePyramid<ushort3>;
template class NCVImagePyramid<ushort4>;
template class NCVImagePyramid<uint1>;
template class NCVImagePyramid<uint3>;
template class NCVImagePyramid<uint4>;
template class NCVImagePyramid<float1>;
template class NCVImagePyramid<float3>;
template class NCVImagePyramid<float4>;

#endif //_WIN32
