/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include <stdio.h>
#include <cuda_runtime.h>

#include "opencv2/core/cuda/common.hpp"

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NCVPyramid.hpp"

#include "NCVAlg.hpp"
#include "NCVPixelOperations.hpp"

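// __average4_CN: per-channel average of a 2x2 pixel block, specialized on the channel
// count CN. The generic (integer) specializations accumulate in Ncv32s and add 2 before
// dividing by 4 (round-to-nearest); the float/double specializations take the plain mean.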
template<typename T, Ncv32u CN> struct __average4_CN {static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11);};

template<typename T> struct __average4_CN<T, 1> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float1, 1> {
static __host__ __device__ float1 _average4_CN(const float1 &p00, const float1 &p01, const float1 &p10, const float1 &p11)
{
    float1 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    return out;
}};

template<> struct __average4_CN<double1, 1> {
static __host__ __device__ double1 _average4_CN(const double1 &p00, const double1 &p01, const double1 &p10, const double1 &p11)
{
    double1 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    return out;
}};

template<typename T> struct __average4_CN<T, 3> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
    out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float3, 3> {
static __host__ __device__ float3 _average4_CN(const float3 &p00, const float3 &p01, const float3 &p10, const float3 &p11)
{
    float3 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    return out;
}};

template<> struct __average4_CN<double3, 3> {
static __host__ __device__ double3 _average4_CN(const double3 &p00, const double3 &p01, const double3 &p10, const double3 &p11)
{
    double3 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    return out;
}};

template<typename T> struct __average4_CN<T, 4> {
static __host__ __device__ T _average4_CN(const T &p00, const T &p01, const T &p10, const T &p11)
{
    T out;
    out.x = ((Ncv32s)p00.x + p01.x + p10.x + p11.x + 2) / 4;
    out.y = ((Ncv32s)p00.y + p01.y + p10.y + p11.y + 2) / 4;
    out.z = ((Ncv32s)p00.z + p01.z + p10.z + p11.z + 2) / 4;
    out.w = ((Ncv32s)p00.w + p01.w + p10.w + p11.w + 2) / 4;
    return out;
}};

template<> struct __average4_CN<float4, 4> {
static __host__ __device__ float4 _average4_CN(const float4 &p00, const float4 &p01, const float4 &p10, const float4 &p11)
{
    float4 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
    return out;
}};

template<> struct __average4_CN<double4, 4> {
static __host__ __device__ double4 _average4_CN(const double4 &p00, const double4 &p01, const double4 &p10, const double4 &p11)
{
    double4 out;
    out.x = (p00.x + p01.x + p10.x + p11.x) / 4;
    out.y = (p00.y + p01.y + p10.y + p11.y) / 4;
    out.z = (p00.z + p01.z + p10.z + p11.z) / 4;
    out.w = (p00.w + p01.w + p10.w + p11.w) / 4;
    return out;
}};

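// _average4: dispatches on the channel count NC(T) to the matching __average4_CN
// specialization above.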
template<typename T> static __host__ __device__ T _average4(const T &p00, const T &p01, const T &p10, const T &p11)
{
    return __average4_CN<T, NC(T)>::_average4_CN(p00, p01, p10, p11);
}

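// __lerp_CN: per-channel linear interpolation between pixels a and b with weight d
// (each channel computes b*d + a*(1-d)), specialized on channel count. Channels are
// converted to the base type of the output vector Tout and repacked via _pixMake.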
template<typename Tin, typename Tout, Ncv32u CN> struct __lerp_CN {static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d);};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 1> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)));
}};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 3> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)),
                    TB(b.y * d + a.y * (1 - d)),
                    TB(b.z * d + a.z * (1 - d)));
}};

template<typename Tin, typename Tout> struct __lerp_CN<Tin, Tout, 4> {
static __host__ __device__ Tout _lerp_CN(const Tin &a, const Tin &b, Ncv32f d)
{
    typedef typename TConvVec2Base<Tout>::TBase TB;
    return _pixMake(TB(b.x * d + a.x * (1 - d)),
                    TB(b.y * d + a.y * (1 - d)),
                    TB(b.z * d + a.z * (1 - d)),
                    TB(b.w * d + a.w * (1 - d)));
}};

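// _lerp: dispatches on the channel count of the input type NC(Tin).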
template<typename Tin, typename Tout> static __host__ __device__ Tout _lerp(const Tin &a, const Tin &b, Ncv32f d)
{
    return __lerp_CN<Tin, Tout, NC(Tin)>::_lerp_CN(a, b, d);
}


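// kernelDownsampleX2: one thread per destination pixel. Output pixel (i, j) is the
// average of the 2x2 source block starting at (2*i, 2*j). srcPitch and dstPitch are
// row strides in bytes, hence the Ncv8u pointer arithmetic.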
template<typename T>
__global__ void kernelDownsampleX2(T *d_src,
                                   Ncv32u srcPitch,
                                   T *d_dst,
                                   Ncv32u dstPitch,
                                   NcvSize32u dstRoi)
{
    Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
    Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < dstRoi.height && j < dstRoi.width)
    {
        T *d_src_line1 = (T *)((Ncv8u *)d_src + (2 * i + 0) * srcPitch);
        T *d_src_line2 = (T *)((Ncv8u *)d_src + (2 * i + 1) * srcPitch);
        T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);

        T p00 = d_src_line1[2*j+0];
        T p01 = d_src_line1[2*j+1];
        T p10 = d_src_line2[2*j+0];
        T p11 = d_src_line2[2*j+1];

        d_dst_line[j] = _average4(p00, p01, p10, p11);
    }
}

namespace cv { namespace cuda { namespace device
{
    namespace pyramid
    {
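        // Host-side launcher: 16x8 thread blocks. Note the grid is sized from the
        // source dimensions, so threads outside the destination ROI simply exit on the
        // bounds check. On the default stream the call synchronizes the device.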
        template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(src.cols, bDim.x), divUp(src.rows, bDim.y));

            kernelDownsampleX2<<<gDim, bDim, 0, stream>>>((T*)src.data, static_cast<Ncv32u>(src.step),
                (T*)dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

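        // downsampleX2: dispatch table indexed by [depth][cn - 1]. Depth rows follow the
        // OpenCV ordering CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F; null entries
        // (signed, 2-channel and 32S combinations) are unsupported and trip CV_Assert.
        // A hypothetical caller (sketch, not from this file): with device buffers where
        // dst is half the size of src,
        //     cv::cuda::device::pyramid::downsampleX2(src, dst, CV_8U, 3, stream);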
        void downsampleX2(PtrStepSzb src, PtrStepSzb dst, int depth, int cn, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

            static const func_t funcs[6][4] =
            {
                {kernelDownsampleX2_gpu<uchar1>       , 0 /*kernelDownsampleX2_gpu<uchar2>*/ , kernelDownsampleX2_gpu<uchar3>      , kernelDownsampleX2_gpu<uchar4>      },
                {0 /*kernelDownsampleX2_gpu<char1>*/  , 0 /*kernelDownsampleX2_gpu<char2>*/  , 0 /*kernelDownsampleX2_gpu<char3>*/ , 0 /*kernelDownsampleX2_gpu<char4>*/ },
                {kernelDownsampleX2_gpu<ushort1>      , 0 /*kernelDownsampleX2_gpu<ushort2>*/, kernelDownsampleX2_gpu<ushort3>     , kernelDownsampleX2_gpu<ushort4>     },
                {0 /*kernelDownsampleX2_gpu<short1>*/ , 0 /*kernelDownsampleX2_gpu<short2>*/ , 0 /*kernelDownsampleX2_gpu<short3>*/, 0 /*kernelDownsampleX2_gpu<short4>*/},
                {0 /*kernelDownsampleX2_gpu<int1>*/   , 0 /*kernelDownsampleX2_gpu<int2>*/   , 0 /*kernelDownsampleX2_gpu<int3>*/  , 0 /*kernelDownsampleX2_gpu<int4>*/  },
                {kernelDownsampleX2_gpu<float1>       , 0 /*kernelDownsampleX2_gpu<float2>*/ , kernelDownsampleX2_gpu<float3>      , kernelDownsampleX2_gpu<float4>      }
            };

            const func_t func = funcs[depth][cn - 1];
            CV_Assert(func != 0);

            func(src, dst, stream);
        }
    }
}}}




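// kernelInterpolateFrom1: one thread per destination pixel. Maps (i, j) in the
// destination ROI to fractional coordinates in the source ("top") layer, gathers up to
// four neighbouring pixels (clamped at the right/bottom borders), blends them bilinearly
// in Ncv32f and demotes the result back to T with clamping.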
template<typename T>
__global__ void kernelInterpolateFrom1(T *d_srcTop,
                                       Ncv32u srcTopPitch,
                                       NcvSize32u szTopRoi,
                                       T *d_dst,
                                       Ncv32u dstPitch,
                                       NcvSize32u dstRoi)
{
    Ncv32u i = blockIdx.y * blockDim.y + threadIdx.y;
    Ncv32u j = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < dstRoi.height && j < dstRoi.width)
    {
        Ncv32f ptTopX = 1.0f * (szTopRoi.width - 1) * j / (dstRoi.width - 1);
        Ncv32f ptTopY = 1.0f * (szTopRoi.height - 1) * i / (dstRoi.height - 1);
        Ncv32u xl = (Ncv32u)ptTopX;
        Ncv32u xh = xl+1;
        Ncv32f dx = ptTopX - xl;
        Ncv32u yl = (Ncv32u)ptTopY;
        Ncv32u yh = yl+1;
        Ncv32f dy = ptTopY - yl;

        T *d_src_line1 = (T *)((Ncv8u *)d_srcTop + yl * srcTopPitch);
        T *d_src_line2 = (T *)((Ncv8u *)d_srcTop + yh * srcTopPitch);
        T *d_dst_line = (T *)((Ncv8u *)d_dst + i * dstPitch);

        T p00, p01, p10, p11;
        p00 = d_src_line1[xl];
        p01 = xh < szTopRoi.width ? d_src_line1[xh] : p00;
        p10 = yh < szTopRoi.height ? d_src_line2[xl] : p00;
        p11 = (xh < szTopRoi.width && yh < szTopRoi.height) ? d_src_line2[xh] : p00;
        typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
        TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
        TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
        TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
        T outPix = _pixDemoteClampZ<TVFlt, T>(mixture);

        d_dst_line[j] = outPix;
    }
}
namespace cv { namespace cuda { namespace device
{
    namespace pyramid
    {
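        // Host-side launcher: 16x8 thread blocks, grid sized from the destination;
        // synchronizes the device when launched on the default stream.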
        template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(dst.cols, bDim.x), divUp(dst.rows, bDim.y));

            kernelInterpolateFrom1<<<gDim, bDim, 0, stream>>>((T*) src.data, static_cast<Ncv32u>(src.step), NcvSize32u(src.cols, src.rows),
                (T*) dst.data, static_cast<Ncv32u>(dst.step), NcvSize32u(dst.cols, dst.rows));

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

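        // interpolateFrom1: same [depth][cn - 1] dispatch scheme as downsampleX2;
        // bilinearly resamples src to the size of dst for the supported unsigned
        // 8/16-bit and 32-bit float 1-, 3- and 4-channel types.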
        void interpolateFrom1(PtrStepSzb src, PtrStepSzb dst, int depth, int cn, cudaStream_t stream)
        {
            typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

            static const func_t funcs[6][4] =
            {
                {kernelInterpolateFrom1_gpu<uchar1>      , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3>      , kernelInterpolateFrom1_gpu<uchar4>      },
                {0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/  , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
                {kernelInterpolateFrom1_gpu<ushort1>     , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3>     , kernelInterpolateFrom1_gpu<ushort4>     },
                {0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
                {0 /*kernelInterpolateFrom1_gpu<int1>*/  , 0 /*kernelInterpolateFrom1_gpu<int2>*/   , 0 /*kernelInterpolateFrom1_gpu<int3>*/  , 0 /*kernelInterpolateFrom1_gpu<int4>*/  },
                {kernelInterpolateFrom1_gpu<float1>      , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3>      , kernelInterpolateFrom1_gpu<float4>      }
            };

            const func_t func = funcs[depth][cn - 1];
            CV_Assert(func != 0);

            func(src, dst, stream);
        }
    }
}}}


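// The remainder of the file is a host-side reference implementation of the
// NCVImagePyramid class (originally guarded by _WIN32, as the "//def _WIN32" comment
// suggests). It is currently compiled out by the "#if 0" below.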
#if 0 //def _WIN32

template<typename T>
static T _interpLinear(const T &a, const T &b, Ncv32f d)
{
    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
    TVFlt tmp = _lerp<T, TVFlt>(a, b, d);
    return _pixDemoteClampZ<TVFlt, T>(tmp);
}


template<typename T>
static T _interpBilinear(const NCVMatrix<T> &refLayer, Ncv32f x, Ncv32f y)
{
    Ncv32u xl = (Ncv32u)x;
    Ncv32u xh = xl+1;
    Ncv32f dx = x - xl;
    Ncv32u yl = (Ncv32u)y;
    Ncv32u yh = yl+1;
    Ncv32f dy = y - yl;
    T p00, p01, p10, p11;
    p00 = refLayer.at(xl, yl);
    p01 = xh < refLayer.width() ? refLayer.at(xh, yl) : p00;
    p10 = yh < refLayer.height() ? refLayer.at(xl, yh) : p00;
    p11 = (xh < refLayer.width() && yh < refLayer.height()) ? refLayer.at(xh, yh) : p00;
    typedef typename TConvBase2Vec<Ncv32f, NC(T)>::TVec TVFlt;
    TVFlt m_00_01 = _lerp<T, TVFlt>(p00, p01, dx);
    TVFlt m_10_11 = _lerp<T, TVFlt>(p10, p11, dx);
    TVFlt mixture = _lerp<TVFlt, TVFlt>(m_00_01, m_10_11, dy);
    return _pixDemoteClampZ<TVFlt, T>(mixture);
}

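// NCVImagePyramid constructor: builds up to numLayers levels by repeated 2x
// downsampling of the previous layer (numLayers == 0 means "downsample until a
// dimension runs out"). Device-resident pyramids use kernelDownsampleX2, otherwise a
// CPU loop fills each layer; with SELF_CHECK_GPU defined, the GPU result is copied
// back and compared against _average4 on the host.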
template <class T>
NCVImagePyramid<T>::NCVImagePyramid(const NCVMatrix<T> &img,
                                    Ncv8u numLayers,
                                    INCVMemAllocator &alloc,
                                    cudaStream_t cuStream)
{
    this->_isInitialized = false;
    ncvAssertPrintReturn(img.memType() == alloc.memType(), "NCVImagePyramid::ctor error", );

    this->layer0 = &img;
    NcvSize32u szLastLayer(img.width(), img.height());
    this->nLayers = 1;

    NCV_SET_SKIP_COND(alloc.isCounting());
    NcvBool bDeviceCode = alloc.memType() == NCVMemoryTypeDevice;

    if (numLayers == 0)
    {
        numLayers = 255; //it will cut-off when any of the dimensions goes 1
    }

#ifdef SELF_CHECK_GPU
    NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif

    for (Ncv32u i=0; i<(Ncv32u)numLayers-1; i++)
    {
        NcvSize32u szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
        if (szCurLayer.width == 0 || szCurLayer.height == 0)
        {
            break;
        }

        this->pyramid.push_back(new NCVMatrixAlloc<T>(alloc, szCurLayer.width, szCurLayer.height));
        ncvAssertPrintReturn(((NCVMatrixAlloc<T> *)(this->pyramid[i]))->isMemAllocated(), "NCVImagePyramid::ctor error", );
        this->nLayers++;

        //fill in the layer
        NCV_SKIP_COND_BEGIN

        const NCVMatrix<T> *prevLayer = i == 0 ? this->layer0 : this->pyramid[i-1];
        NCVMatrix<T> *curLayer = this->pyramid[i];

        if (bDeviceCode)
        {
            dim3 bDim(16, 8);
            dim3 gDim(divUp(szCurLayer.width, bDim.x), divUp(szCurLayer.height, bDim.y));
            kernelDownsampleX2<<<gDim, bDim, 0, cuStream>>>(prevLayer->ptr(),
                                                            prevLayer->pitch(),
                                                            curLayer->ptr(),
                                                            curLayer->pitch(),
                                                            szCurLayer);
            ncvAssertPrintReturn(cudaSuccess == cudaGetLastError(), "NCVImagePyramid::ctor error", );

#ifdef SELF_CHECK_GPU
            NCVMatrixAlloc<T> h_prevLayer(allocCPU, prevLayer->width(), prevLayer->height());
            ncvAssertPrintReturn(h_prevLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
            NCVMatrixAlloc<T> h_curLayer(allocCPU, curLayer->width(), curLayer->height());
            ncvAssertPrintReturn(h_curLayer.isMemAllocated(), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(NCV_SUCCESS == prevLayer->copy2D(h_prevLayer, prevLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(NCV_SUCCESS == curLayer->copy2D(h_curLayer, curLayer->size(), cuStream), "Validation failure in NCVImagePyramid::ctor", );
            ncvAssertPrintReturn(cudaSuccess == cudaStreamSynchronize(cuStream), "Validation failure in NCVImagePyramid::ctor", );
            for (Ncv32u i=0; i<szCurLayer.height; i++)
            {
                for (Ncv32u j=0; j<szCurLayer.width; j++)
                {
                    T p00 = h_prevLayer.at(2*j+0, 2*i+0);
                    T p01 = h_prevLayer.at(2*j+1, 2*i+0);
                    T p10 = h_prevLayer.at(2*j+0, 2*i+1);
                    T p11 = h_prevLayer.at(2*j+1, 2*i+1);
                    T outGold = _average4(p00, p01, p10, p11);
                    T outGPU = h_curLayer.at(j, i);
                    ncvAssertPrintReturn(0 == memcmp(&outGold, &outGPU, sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelDownsampleX2", );
                }
            }
#endif
        }
        else
        {
            for (Ncv32u i=0; i<szCurLayer.height; i++)
            {
                for (Ncv32u j=0; j<szCurLayer.width; j++)
                {
                    T p00 = prevLayer->at(2*j+0, 2*i+0);
                    T p01 = prevLayer->at(2*j+1, 2*i+0);
                    T p10 = prevLayer->at(2*j+0, 2*i+1);
                    T p11 = prevLayer->at(2*j+1, 2*i+1);
                    curLayer->at(j, i) = _average4(p00, p01, p10, p11);
                }
            }
        }

        NCV_SKIP_COND_END

        szLastLayer = szCurLayer;
    }

    this->_isInitialized = true;
}


template <class T>
NCVImagePyramid<T>::~NCVImagePyramid()
{
}


template <class T>
NcvBool NCVImagePyramid<T>::isInitialized() const
{
    return this->_isInitialized;
}

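// getLayer: resamples the pyramid to outRoi. An exact size match is a plain 2D copy;
// otherwise the nearest layer still larger than outRoi is bilinearly resampled into the
// output. With bTrilinear, the CPU path can also blend in the next smaller layer
// (trilinear filtering); the device path only implements the single-reference case.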
template <class T>
NCVStatus NCVImagePyramid<T>::getLayer(NCVMatrix<T> &outImg,
                                       NcvSize32u outRoi,
                                       NcvBool bTrilinear,
                                       cudaStream_t cuStream) const
{
    ncvAssertReturn(this->isInitialized(), NCV_UNKNOWN_ERROR);
    ncvAssertReturn(outImg.memType() == this->layer0->memType(), NCV_MEM_RESIDENCE_ERROR);
    ncvAssertReturn(outRoi.width <= this->layer0->width() && outRoi.height <= this->layer0->height() &&
                    outRoi.width > 0 && outRoi.height > 0, NCV_DIMENSIONS_INVALID);

    if (outRoi.width == this->layer0->width() && outRoi.height == this->layer0->height())
    {
        ncvAssertReturnNcvStat(this->layer0->copy2D(outImg, NcvSize32u(this->layer0->width(), this->layer0->height()), cuStream));
        return NCV_SUCCESS;
    }

    Ncv32f lastScale = 1.0f;
    Ncv32f curScale;
    const NCVMatrix<T> *lastLayer = this->layer0;
    const NCVMatrix<T> *curLayer = NULL;
    NcvBool bUse2Refs = false;

    for (Ncv32u i=0; i<this->nLayers-1; i++)
    {
        curScale = lastScale * 0.5f;
        curLayer = this->pyramid[i];

        if (outRoi.width == curLayer->width() && outRoi.height == curLayer->height())
        {
            ncvAssertReturnNcvStat(this->pyramid[i]->copy2D(outImg, NcvSize32u(this->pyramid[i]->width(), this->pyramid[i]->height()), cuStream));
            return NCV_SUCCESS;
        }

        if (outRoi.width >= curLayer->width() && outRoi.height >= curLayer->height())
        {
            if (outRoi.width < lastLayer->width() && outRoi.height < lastLayer->height())
            {
                bUse2Refs = true;
            }
            break;
        }

        lastScale = curScale;
        lastLayer = curLayer;
    }

    bUse2Refs = bUse2Refs && bTrilinear;

    NCV_SET_SKIP_COND(outImg.memType() == NCVMemoryTypeNone);
    NcvBool bDeviceCode = this->layer0->memType() == NCVMemoryTypeDevice;

#ifdef SELF_CHECK_GPU
    NCVMemNativeAllocator allocCPU(NCVMemoryTypeHostPinned, 512);
#endif

    NCV_SKIP_COND_BEGIN

    if (bDeviceCode)
    {
        ncvAssertReturn(bUse2Refs == false, NCV_NOT_IMPLEMENTED);

        dim3 bDim(16, 8);
        dim3 gDim(divUp(outRoi.width, bDim.x), divUp(outRoi.height, bDim.y));
        kernelInterpolateFrom1<<<gDim, bDim, 0, cuStream>>>(lastLayer->ptr(),
                                                            lastLayer->pitch(),
                                                            lastLayer->size(),
                                                            outImg.ptr(),
                                                            outImg.pitch(),
                                                            outRoi);
        ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

#ifdef SELF_CHECK_GPU
        ncvSafeMatAlloc(h_lastLayer, T, allocCPU, lastLayer->width(), lastLayer->height(), NCV_ALLOCATOR_BAD_ALLOC);
        ncvSafeMatAlloc(h_outImg, T, allocCPU, outImg.width(), outImg.height(), NCV_ALLOCATOR_BAD_ALLOC);
        ncvAssertReturnNcvStat(lastLayer->copy2D(h_lastLayer, lastLayer->size(), cuStream));
        ncvAssertReturnNcvStat(outImg.copy2D(h_outImg, outRoi, cuStream));
        ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);

        for (Ncv32u i=0; i<outRoi.height; i++)
        {
            for (Ncv32u j=0; j<outRoi.width; j++)
            {
                NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
                Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
                Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
                T outGold = _interpBilinear(h_lastLayer, ptTopX, ptTopY);
                ncvAssertPrintReturn(0 == memcmp(&outGold, &h_outImg.at(j,i), sizeof(T)), "Validation failure in NCVImagePyramid::ctor with kernelInterpolateFrom1", NCV_UNKNOWN_ERROR);
            }
        }
#endif
    }
    else
    {
        for (Ncv32u i=0; i<outRoi.height; i++)
        {
            for (Ncv32u j=0; j<outRoi.width; j++)
            {
                //top layer pixel (always exists)
                NcvSize32u szTopLayer(lastLayer->width(), lastLayer->height());
                Ncv32f ptTopX = 1.0f * (szTopLayer.width - 1) * j / (outRoi.width - 1);
                Ncv32f ptTopY = 1.0f * (szTopLayer.height - 1) * i / (outRoi.height - 1);
                T topPix = _interpBilinear(*lastLayer, ptTopX, ptTopY);
                T trilinearPix = topPix;

                if (bUse2Refs)
                {
                    //bottom layer pixel (exists only if the requested scale is greater than the smallest layer scale)
                    NcvSize32u szBottomLayer(curLayer->width(), curLayer->height());
                    Ncv32f ptBottomX = 1.0f * (szBottomLayer.width - 1) * j / (outRoi.width - 1);
                    Ncv32f ptBottomY = 1.0f * (szBottomLayer.height - 1) * i / (outRoi.height - 1);
                    T bottomPix = _interpBilinear(*curLayer, ptBottomX, ptBottomY);

                    Ncv32f scale = (1.0f * outRoi.width / layer0->width() + 1.0f * outRoi.height / layer0->height()) / 2;
                    Ncv32f dl = (scale - curScale) / (lastScale - curScale);
                    dl = CLAMP(dl, 0.0f, 1.0f);
                    trilinearPix = _interpLinear(bottomPix, topPix, dl);
                }

                outImg.at(j, i) = trilinearPix;
            }
        }
    }

    NCV_SKIP_COND_END

    return NCV_SUCCESS;
}


template class NCVImagePyramid<uchar1>;
template class NCVImagePyramid<uchar3>;
template class NCVImagePyramid<uchar4>;
template class NCVImagePyramid<ushort1>;
template class NCVImagePyramid<ushort3>;
template class NCVImagePyramid<ushort4>;
template class NCVImagePyramid<uint1>;
template class NCVImagePyramid<uint3>;
template class NCVImagePyramid<uint4>;
template class NCVImagePyramid<float1>;
template class NCVImagePyramid<float3>;
template class NCVImagePyramid<float4>;

#endif //_WIN32