1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #if !defined CUDA_DISABLER
44 
45 #include "opencv2/core/cuda/common.hpp"
46 #include "opencv2/core/cuda/vec_math.hpp"
47 #include "opencv2/core/cuda/limits.hpp"
48 #include "opencv2/core/cuda/utility.hpp"
49 #include "opencv2/core/cuda/reduce.hpp"
50 #include "opencv2/core/cuda/functional.hpp"
51 #include "fgd.hpp"
52 
53 using namespace cv::cuda;
54 using namespace cv::cuda::device;
55 
56 namespace fgd
57 {
58     ////////////////////////////////////////////////////////////////////////////
59     // calcDiffHistogram
60 
61     const unsigned int UINT_BITS = 32U;
62     const int LOG_WARP_SIZE = 5;
63     const int WARP_SIZE = 1 << LOG_WARP_SIZE;
64 #if (__CUDA_ARCH__ < 120)
65     const unsigned int TAG_MASK = (1U << (UINT_BITS - LOG_WARP_SIZE)) - 1U;
66 #endif
67 
68     const int MERGE_THREADBLOCK_SIZE = 256;
69 
addByte(unsigned int * s_WarpHist_,unsigned int data,unsigned int threadTag)70     __device__ __forceinline__ void addByte(unsigned int* s_WarpHist_, unsigned int data, unsigned int threadTag)
71     {
72         #if (__CUDA_ARCH__ < 120)
73             volatile unsigned int* s_WarpHist = s_WarpHist_;
74             unsigned int count;
75             do
76             {
77                 count = s_WarpHist[data] & TAG_MASK;
78                 count = threadTag | (count + 1);
79                 s_WarpHist[data] = count;
80             } while (s_WarpHist[data] != count);
81         #else
82             atomicInc(s_WarpHist_ + data, (unsigned int)(-1));
83         #endif
84     }
85 
86 
87     template <typename PT, typename CT>
calcPartialHistogram(const PtrStepSz<PT> prevFrame,const PtrStep<CT> curFrame,unsigned int * partialBuf0,unsigned int * partialBuf1,unsigned int * partialBuf2)88     __global__ void calcPartialHistogram(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
89     {
90 #if (__CUDA_ARCH__ < 200)
91         const int HISTOGRAM_WARP_COUNT = 4;
92 #else
93         const int HISTOGRAM_WARP_COUNT = 6;
94 #endif
95         const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
96         const int HISTOGRAM_THREADBLOCK_MEMORY = HISTOGRAM_WARP_COUNT * HISTOGRAM_BIN_COUNT;
97 
98         //Per-warp subhistogram storage
99         __shared__ unsigned int s_Hist0[HISTOGRAM_THREADBLOCK_MEMORY];
100         __shared__ unsigned int s_Hist1[HISTOGRAM_THREADBLOCK_MEMORY];
101         __shared__ unsigned int s_Hist2[HISTOGRAM_THREADBLOCK_MEMORY];
102 
103         //Clear shared memory storage for current threadblock before processing
104         #pragma unroll
105         for (int i = 0; i < (HISTOGRAM_THREADBLOCK_MEMORY / HISTOGRAM_THREADBLOCK_SIZE); ++i)
106         {
107            s_Hist0[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
108            s_Hist1[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
109            s_Hist2[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
110         }
111         __syncthreads();
112 
113         const unsigned int warpId = threadIdx.x >> LOG_WARP_SIZE;
114 
115         unsigned int* s_WarpHist0 = s_Hist0 + warpId * HISTOGRAM_BIN_COUNT;
116         unsigned int* s_WarpHist1 = s_Hist1 + warpId * HISTOGRAM_BIN_COUNT;
117         unsigned int* s_WarpHist2 = s_Hist2 + warpId * HISTOGRAM_BIN_COUNT;
118 
119         const unsigned int tag = threadIdx.x << (UINT_BITS - LOG_WARP_SIZE);
120         const int dataCount = prevFrame.rows * prevFrame.cols;
121         for (unsigned int pos = blockIdx.x * HISTOGRAM_THREADBLOCK_SIZE + threadIdx.x; pos < dataCount; pos += HISTOGRAM_THREADBLOCK_SIZE * PARTIAL_HISTOGRAM_COUNT)
122         {
123             const unsigned int y = pos / prevFrame.cols;
124             const unsigned int x = pos % prevFrame.cols;
125 
126             PT prevVal = prevFrame(y, x);
127             CT curVal = curFrame(y, x);
128 
129             int3 diff = make_int3(
130                 ::abs(curVal.x - prevVal.x),
131                 ::abs(curVal.y - prevVal.y),
132                 ::abs(curVal.z - prevVal.z)
133             );
134 
135             addByte(s_WarpHist0, diff.x, tag);
136             addByte(s_WarpHist1, diff.y, tag);
137             addByte(s_WarpHist2, diff.z, tag);
138         }
139         __syncthreads();
140 
141         //Merge per-warp histograms into per-block and write to global memory
142         for (unsigned int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += HISTOGRAM_THREADBLOCK_SIZE)
143         {
144             unsigned int sum0 = 0;
145             unsigned int sum1 = 0;
146             unsigned int sum2 = 0;
147 
148             #pragma unroll
149             for (int i = 0; i < HISTOGRAM_WARP_COUNT; ++i)
150             {
151                 #if (__CUDA_ARCH__ < 120)
152                     sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
153                     sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
154                     sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
155                 #else
156                     sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT];
157                     sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT];
158                     sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT];
159                 #endif
160             }
161 
162             partialBuf0[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum0;
163             partialBuf1[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum1;
164             partialBuf2[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum2;
165         }
166     }
167 
mergeHistogram(const unsigned int * partialBuf0,const unsigned int * partialBuf1,const unsigned int * partialBuf2,unsigned int * hist0,unsigned int * hist1,unsigned int * hist2)168     __global__ void mergeHistogram(const unsigned int* partialBuf0, const unsigned int* partialBuf1, const unsigned int* partialBuf2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2)
169     {
170         unsigned int sum0 = 0;
171         unsigned int sum1 = 0;
172         unsigned int sum2 = 0;
173 
174         #pragma unroll
175         for (unsigned int i = threadIdx.x; i < PARTIAL_HISTOGRAM_COUNT; i += MERGE_THREADBLOCK_SIZE)
176         {
177             sum0 += partialBuf0[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
178             sum1 += partialBuf1[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
179             sum2 += partialBuf2[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
180         }
181 
182         __shared__ unsigned int data0[MERGE_THREADBLOCK_SIZE];
183         __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
184         __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
185 
186         plus<unsigned int> op;
187         reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
188 
189         if(threadIdx.x == 0)
190         {
191             hist0[blockIdx.x] = sum0;
192             hist1[blockIdx.x] = sum1;
193             hist2[blockIdx.x] = sum2;
194         }
195     }
196 
197     template <typename PT, typename CT>
calcDiffHistogram_gpu(PtrStepSzb prevFrame,PtrStepSzb curFrame,unsigned int * hist0,unsigned int * hist1,unsigned int * hist2,unsigned int * partialBuf0,unsigned int * partialBuf1,unsigned int * partialBuf2,bool cc20,cudaStream_t stream)198     void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
199                                unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
200                                unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
201                                bool cc20, cudaStream_t stream)
202     {
203         const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
204         const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
205 
206         calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
207                 (PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
208         cudaSafeCall( cudaGetLastError() );
209 
210         mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
211         cudaSafeCall( cudaGetLastError() );
212 
213         if (stream == 0)
214             cudaSafeCall( cudaDeviceSynchronize() );
215     }
216 
217     template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
218     template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
219     template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
220     template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
221 
222     /////////////////////////////////////////////////////////////////////////
223     // calcDiffThreshMask
224 
225     template <typename PT, typename CT>
calcDiffThreshMask(const PtrStepSz<PT> prevFrame,const PtrStep<CT> curFrame,uchar3 bestThres,PtrStepb changeMask)226     __global__ void calcDiffThreshMask(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
227     {
228         const int y = blockIdx.y * blockDim.y + threadIdx.y;
229         const int x = blockIdx.x * blockDim.x + threadIdx.x;
230 
231         if (y > prevFrame.rows || x > prevFrame.cols)
232             return;
233 
234         PT prevVal = prevFrame(y, x);
235         CT curVal = curFrame(y, x);
236 
237         int3 diff = make_int3(
238             ::abs(curVal.x - prevVal.x),
239             ::abs(curVal.y - prevVal.y),
240             ::abs(curVal.z - prevVal.z)
241         );
242 
243         if (diff.x > bestThres.x || diff.y > bestThres.y || diff.z > bestThres.z)
244             changeMask(y, x) = 255;
245     }
246 
247     template <typename PT, typename CT>
calcDiffThreshMask_gpu(PtrStepSzb prevFrame,PtrStepSzb curFrame,uchar3 bestThres,PtrStepSzb changeMask,cudaStream_t stream)248     void calcDiffThreshMask_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream)
249     {
250         dim3 block(32, 8);
251         dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
252 
253         calcDiffThreshMask<PT, CT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, bestThres, changeMask);
254         cudaSafeCall( cudaGetLastError() );
255 
256         if (stream == 0)
257             cudaSafeCall( cudaDeviceSynchronize() );
258     }
259 
260     template void calcDiffThreshMask_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
261     template void calcDiffThreshMask_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
262     template void calcDiffThreshMask_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
263     template void calcDiffThreshMask_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
264 
265     /////////////////////////////////////////////////////////////////////////
266     // bgfgClassification
267 
268     __constant__ BGPixelStat c_stat;
269 
setBGPixelStat(const BGPixelStat & stat)270     void setBGPixelStat(const BGPixelStat& stat)
271     {
272         cudaSafeCall( cudaMemcpyToSymbol(c_stat, &stat, sizeof(BGPixelStat)) );
273     }
274 
275     template <typename T> struct Output;
276     template <> struct Output<uchar3>
277     {
makefgd::Output278         static __device__ __forceinline__ uchar3 make(uchar v0, uchar v1, uchar v2)
279         {
280             return make_uchar3(v0, v1, v2);
281         }
282     };
283     template <> struct Output<uchar4>
284     {
makefgd::Output285         static __device__ __forceinline__ uchar4 make(uchar v0, uchar v1, uchar v2)
286         {
287             return make_uchar4(v0, v1, v2, 255);
288         }
289     };
290 
291     template <typename PT, typename CT, typename OT>
bgfgClassification(const PtrStepSz<PT> prevFrame,const PtrStep<CT> curFrame,const PtrStepb Ftd,const PtrStepb Fbd,PtrStepb foreground,int deltaC,int deltaCC,float alpha2,int N1c,int N1cc)292     __global__ void bgfgClassification(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame,
293                                        const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
294                                        int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
295     {
296         const int i = blockIdx.y * blockDim.y + threadIdx.y;
297         const int j = blockIdx.x * blockDim.x + threadIdx.x;
298 
299         if (i > prevFrame.rows || j > prevFrame.cols)
300             return;
301 
302         if (Fbd(i, j) || Ftd(i, j))
303         {
304             float Pb  = 0.0f;
305             float Pv  = 0.0f;
306             float Pvb = 0.0f;
307 
308             int val = 0;
309 
310             // Is it a motion pixel?
311             if (Ftd(i, j))
312             {
313                 if (!c_stat.is_trained_dyn_model(i, j))
314                     val = 1;
315                 else
316                 {
317                     PT prevVal = prevFrame(i, j);
318                     CT curVal = curFrame(i, j);
319 
320                     // Compare with stored CCt vectors:
321                     for (int k = 0; k < N1cc && c_stat.PV_CC(i, j, k) > alpha2; ++k)
322                     {
323                         OT v1 = c_stat.V1_CC<OT>(i, j, k);
324                         OT v2 = c_stat.V2_CC<OT>(i, j, k);
325 
326                         if (::abs(v1.x - prevVal.x) <= deltaCC &&
327                             ::abs(v1.y - prevVal.y) <= deltaCC &&
328                             ::abs(v1.z - prevVal.z) <= deltaCC &&
329                             ::abs(v2.x - curVal.x) <= deltaCC &&
330                             ::abs(v2.y - curVal.y) <= deltaCC &&
331                             ::abs(v2.z - curVal.z) <= deltaCC)
332                         {
333                             Pv += c_stat.PV_CC(i, j, k);
334                             Pvb += c_stat.PVB_CC(i, j, k);
335                         }
336                     }
337 
338                     Pb = c_stat.Pbcc(i, j);
339                     if (2 * Pvb * Pb <= Pv)
340                         val = 1;
341                 }
342             }
343             else if(c_stat.is_trained_st_model(i, j))
344             {
345                 CT curVal = curFrame(i, j);
346 
347                 // Compare with stored Ct vectors:
348                 for (int k = 0; k < N1c && c_stat.PV_C(i, j, k) > alpha2; ++k)
349                 {
350                     OT v = c_stat.V_C<OT>(i, j, k);
351 
352                     if (::abs(v.x - curVal.x) <= deltaC &&
353                         ::abs(v.y - curVal.y) <= deltaC &&
354                         ::abs(v.z - curVal.z) <= deltaC)
355                     {
356                         Pv += c_stat.PV_C(i, j, k);
357                         Pvb += c_stat.PVB_C(i, j, k);
358                     }
359                 }
360                 Pb = c_stat.Pbc(i, j);
361                 if (2 * Pvb * Pb <= Pv)
362                     val = 1;
363             }
364 
365             // Update foreground:
366             foreground(i, j) = static_cast<uchar>(val);
367         } // end if( change detection...
368     }
369 
370     template <typename PT, typename CT, typename OT>
bgfgClassification_gpu(PtrStepSzb prevFrame,PtrStepSzb curFrame,PtrStepSzb Ftd,PtrStepSzb Fbd,PtrStepSzb foreground,int deltaC,int deltaCC,float alpha2,int N1c,int N1cc,cudaStream_t stream)371     void bgfgClassification_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
372                                 int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
373     {
374         dim3 block(32, 8);
375         dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
376 
377         cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );
378 
379         bgfgClassification<PT, CT, OT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame,
380                                                                    Ftd, Fbd, foreground,
381                                                                    deltaC, deltaCC, alpha2, N1c, N1cc);
382         cudaSafeCall( cudaGetLastError() );
383 
384         if (stream == 0)
385             cudaSafeCall( cudaDeviceSynchronize() );
386     }
387 
388     template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
389     template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
390     template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
391     template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
392     template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
393     template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
394     template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
395     template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
396 
397     ////////////////////////////////////////////////////////////////////////////
398     // updateBackgroundModel
399 
400     template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
updateBackgroundModel(int cols,int rows,const PrevFramePtr2D prevFrame,const CurFramePtr2D curFrame,const FtdPtr2D Ftd,const FbdPtr2D Fbd,PtrStepb foreground,PtrStep<OT> background,int deltaC,int deltaCC,float alpha1,float alpha2,float alpha3,int N1c,int N1cc,int N2c,int N2cc,float T)401     __global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
402                                           PtrStepb foreground, PtrStep<OT> background,
403                                           int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
404     {
405         const int i = blockIdx.y * blockDim.y + threadIdx.y;
406         const int j = blockIdx.x * blockDim.x + threadIdx.x;
407 
408         if (i > rows || j > cols)
409             return;
410 
411         const float MIN_PV = 1e-10f;
412 
413         const uchar is_trained_dyn_model = c_stat.is_trained_dyn_model(i, j);
414         if (Ftd(i, j) || !is_trained_dyn_model)
415         {
416             const float alpha = is_trained_dyn_model ? alpha2 : alpha3;
417 
418             float Pbcc = c_stat.Pbcc(i, j);
419 
420             //update Pb
421             Pbcc *= (1.0f - alpha);
422             if (!foreground(i, j))
423             {
424                 Pbcc += alpha;
425             }
426 
427             int min_dist = numeric_limits<int>::max();
428             int indx = -1;
429 
430             PT prevVal = prevFrame(i, j);
431             CT curVal = curFrame(i, j);
432 
433             // Find best Vi match:
434             for (int k = 0; k < N2cc; ++k)
435             {
436                 float PV_CC = c_stat.PV_CC(i, j, k);
437                 if (!PV_CC)
438                     break;
439 
440                 if (PV_CC < MIN_PV)
441                 {
442                     c_stat.PV_CC(i, j, k) = 0;
443                     c_stat.PVB_CC(i, j, k) = 0;
444                     continue;
445                 }
446 
447                 c_stat.PV_CC(i, j, k) = PV_CC * (1.0f - alpha);
448                 c_stat.PVB_CC(i, j, k) = c_stat.PVB_CC(i, j, k) * (1.0f - alpha);
449 
450                 OT v1 = c_stat.V1_CC<OT>(i, j, k);
451 
452                 int3 val1 = make_int3(
453                     ::abs(v1.x - prevVal.x),
454                     ::abs(v1.y - prevVal.y),
455                     ::abs(v1.z - prevVal.z)
456                 );
457 
458                 OT v2 = c_stat.V2_CC<OT>(i, j, k);
459 
460                 int3 val2 = make_int3(
461                     ::abs(v2.x - curVal.x),
462                     ::abs(v2.y - curVal.y),
463                     ::abs(v2.z - curVal.z)
464                 );
465 
466                 int dist = val1.x + val1.y + val1.z + val2.x + val2.y + val2.z;
467 
468                 if (dist < min_dist &&
469                     val1.x <= deltaCC && val1.y <= deltaCC && val1.z <= deltaCC &&
470                     val2.x <= deltaCC && val2.y <= deltaCC && val2.z <= deltaCC)
471                 {
472                     min_dist = dist;
473                     indx = k;
474                 }
475             }
476 
477             if (indx < 0)
478             {
479                 // Replace N2th elem in the table by new feature:
480                 indx = N2cc - 1;
481                 c_stat.PV_CC(i, j, indx) = alpha;
482                 c_stat.PVB_CC(i, j, indx) = alpha;
483 
484                 //udate Vt
485                 c_stat.V1_CC<OT>(i, j, indx) = Output<OT>::make(prevVal.x, prevVal.y, prevVal.z);
486                 c_stat.V2_CC<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
487             }
488             else
489             {
490                 // Update:
491                 c_stat.PV_CC(i, j, indx) += alpha;
492 
493                 if (!foreground(i, j))
494                 {
495                     c_stat.PVB_CC(i, j, indx) += alpha;
496                 }
497             }
498 
499             //re-sort CCt table by Pv
500             const float PV_CC_indx = c_stat.PV_CC(i, j, indx);
501             const float PVB_CC_indx = c_stat.PVB_CC(i, j, indx);
502             const OT V1_CC_indx = c_stat.V1_CC<OT>(i, j, indx);
503             const OT V2_CC_indx = c_stat.V2_CC<OT>(i, j, indx);
504             for (int k = 0; k < indx; ++k)
505             {
506                 if (c_stat.PV_CC(i, j, k) <= PV_CC_indx)
507                 {
508                     //shift elements
509                     float Pv_tmp1;
510                     float Pv_tmp2 = PV_CC_indx;
511 
512                     float Pvb_tmp1;
513                     float Pvb_tmp2 = PVB_CC_indx;
514 
515                     OT v1_tmp1;
516                     OT v1_tmp2 = V1_CC_indx;
517 
518                     OT v2_tmp1;
519                     OT v2_tmp2 = V2_CC_indx;
520 
521                     for (int l = k; l <= indx; ++l)
522                     {
523                         Pv_tmp1 = c_stat.PV_CC(i, j, l);
524                         c_stat.PV_CC(i, j, l) = Pv_tmp2;
525                         Pv_tmp2 = Pv_tmp1;
526 
527                         Pvb_tmp1 = c_stat.PVB_CC(i, j, l);
528                         c_stat.PVB_CC(i, j, l) = Pvb_tmp2;
529                         Pvb_tmp2 = Pvb_tmp1;
530 
531                         v1_tmp1 = c_stat.V1_CC<OT>(i, j, l);
532                         c_stat.V1_CC<OT>(i, j, l) = v1_tmp2;
533                         v1_tmp2 = v1_tmp1;
534 
535                         v2_tmp1 = c_stat.V2_CC<OT>(i, j, l);
536                         c_stat.V2_CC<OT>(i, j, l) = v2_tmp2;
537                         v2_tmp2 = v2_tmp1;
538                     }
539 
540                     break;
541                 }
542             }
543 
544             float sum1 = 0.0f;
545             float sum2 = 0.0f;
546 
547             //check "once-off" changes
548             for (int k = 0; k < N1cc; ++k)
549             {
550                 const float PV_CC = c_stat.PV_CC(i, j, k);
551                 if (!PV_CC)
552                     break;
553 
554                 sum1 += PV_CC;
555                 sum2 += c_stat.PVB_CC(i, j, k);
556             }
557 
558             if (sum1 > T)
559                 c_stat.is_trained_dyn_model(i, j) = 1;
560 
561             float diff = sum1 - Pbcc * sum2;
562 
563             // Update stat table:
564             if (diff > T)
565             {
566                 //new BG features are discovered
567                 for (int k = 0; k < N1cc; ++k)
568                 {
569                     const float PV_CC = c_stat.PV_CC(i, j, k);
570                     if (!PV_CC)
571                         break;
572 
573                     c_stat.PVB_CC(i, j, k) = (PV_CC - Pbcc * c_stat.PVB_CC(i, j, k)) / (1.0f - Pbcc);
574                 }
575             }
576 
577             c_stat.Pbcc(i, j) = Pbcc;
578         }
579 
580         // Handle "stationary" pixel:
581         if (!Ftd(i, j))
582         {
583             const float alpha = c_stat.is_trained_st_model(i, j) ? alpha2 : alpha3;
584 
585             float Pbc = c_stat.Pbc(i, j);
586 
587             //update Pb
588             Pbc *= (1.0f - alpha);
589             if (!foreground(i, j))
590             {
591                 Pbc += alpha;
592             }
593 
594             int min_dist = numeric_limits<int>::max();
595             int indx = -1;
596 
597             CT curVal = curFrame(i, j);
598 
599             //find best Vi match
600             for (int k = 0; k < N2c; ++k)
601             {
602                 float PV_C = c_stat.PV_C(i, j, k);
603 
604                 if (PV_C < MIN_PV)
605                 {
606                     c_stat.PV_C(i, j, k) = 0;
607                     c_stat.PVB_C(i, j, k) = 0;
608                     continue;
609                 }
610 
611                 // Exponential decay of memory
612                 c_stat.PV_C(i, j, k) = PV_C * (1.0f - alpha);
613                 c_stat.PVB_C(i, j, k) = c_stat.PVB_C(i, j, k) * (1.0f - alpha);
614 
615                 OT v = c_stat.V_C<OT>(i, j, k);
616                 int3 val = make_int3(
617                     ::abs(v.x - curVal.x),
618                     ::abs(v.y - curVal.y),
619                     ::abs(v.z - curVal.z)
620                 );
621 
622                 int dist = val.x + val.y + val.z;
623 
624                 if (dist < min_dist && val.x <= deltaC && val.y <= deltaC && val.z <= deltaC)
625                 {
626                     min_dist = dist;
627                     indx = k;
628                 }
629             }
630 
631             if (indx < 0)
632             {
633                 //N2th elem in the table is replaced by a new features
634                 indx = N2c - 1;
635 
636                 c_stat.PV_C(i, j, indx) = alpha;
637                 c_stat.PVB_C(i, j, indx) = alpha;
638 
639                 //udate Vt
640                 c_stat.V_C<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
641             }
642             else
643             {
644                 //update
645                 c_stat.PV_C(i, j, indx) += alpha;
646 
647                 if (!foreground(i, j))
648                 {
649                     c_stat.PVB_C(i, j, indx) += alpha;
650                 }
651             }
652 
653             //re-sort Ct table by Pv
654             const float PV_C_indx = c_stat.PV_C(i, j, indx);
655             const float PVB_C_indx = c_stat.PVB_C(i, j, indx);
656             OT V_C_indx = c_stat.V_C<OT>(i, j, indx);
657             for (int k = 0; k < indx; ++k)
658             {
659                 if (c_stat.PV_C(i, j, k) <= PV_C_indx)
660                 {
661                     //shift elements
662                     float Pv_tmp1;
663                     float Pv_tmp2 = PV_C_indx;
664 
665                     float Pvb_tmp1;
666                     float Pvb_tmp2 = PVB_C_indx;
667 
668                     OT v_tmp1;
669                     OT v_tmp2 = V_C_indx;
670 
671                     for (int l = k; l <= indx; ++l)
672                     {
673                         Pv_tmp1 = c_stat.PV_C(i, j, l);
674                         c_stat.PV_C(i, j, l) = Pv_tmp2;
675                         Pv_tmp2 = Pv_tmp1;
676 
677                         Pvb_tmp1 = c_stat.PVB_C(i, j, l);
678                         c_stat.PVB_C(i, j, l) = Pvb_tmp2;
679                         Pvb_tmp2 = Pvb_tmp1;
680 
681                         v_tmp1 = c_stat.V_C<OT>(i, j, l);
682                         c_stat.V_C<OT>(i, j, l) = v_tmp2;
683                         v_tmp2 = v_tmp1;
684                     }
685 
686                     break;
687                 }
688             }
689 
690             // Check "once-off" changes:
691             float sum1 = 0.0f;
692             float sum2 = 0.0f;
693             for (int k = 0; k < N1c; ++k)
694             {
695                 const float PV_C = c_stat.PV_C(i, j, k);
696                 if (!PV_C)
697                     break;
698 
699                 sum1 += PV_C;
700                 sum2 += c_stat.PVB_C(i, j, k);
701             }
702 
703             if (sum1 > T)
704                 c_stat.is_trained_st_model(i, j) = 1;
705 
706             float diff = sum1 - Pbc * sum2;
707 
708             // Update stat table:
709             if (diff > T)
710             {
711                 //new BG features are discovered
712                 for (int k = 0; k < N1c; ++k)
713                 {
714                     const float PV_C = c_stat.PV_C(i, j, k);
715                     if (!PV_C)
716                         break;
717 
718                     c_stat.PVB_C(i, j, k) = (PV_C - Pbc * c_stat.PVB_C(i, j, k)) / (1.0f - Pbc);
719                 }
720 
721                 c_stat.Pbc(i, j) = 1.0f - Pbc;
722             }
723             else
724             {
725                 c_stat.Pbc(i, j) = Pbc;
726             }
727         } // if !(change detection) at pixel (i,j)
728 
729         // Update the reference BG image:
730         if (!foreground(i, j))
731         {
732             CT curVal = curFrame(i, j);
733 
734             if (!Ftd(i, j) && !Fbd(i, j))
735             {
736                 // Apply IIR filter:
737                 OT oldVal = background(i, j);
738 
739                 int3 newVal = make_int3(
740                     __float2int_rn(oldVal.x * (1.0f - alpha1) + curVal.x * alpha1),
741                     __float2int_rn(oldVal.y * (1.0f - alpha1) + curVal.y * alpha1),
742                     __float2int_rn(oldVal.z * (1.0f - alpha1) + curVal.z * alpha1)
743                 );
744 
745                 background(i, j) = Output<OT>::make(
746                     static_cast<uchar>(newVal.x),
747                     static_cast<uchar>(newVal.y),
748                     static_cast<uchar>(newVal.z)
749                 );
750             }
751             else
752             {
753                 background(i, j) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
754             }
755         }
756     }
757 
758     template <typename PT, typename CT, typename OT>
759     struct UpdateBackgroundModel
760     {
callfgd::UpdateBackgroundModel761         static void call(PtrStepSz<PT> prevFrame, PtrStepSz<CT> curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSz<OT> background,
762                          int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
763                          cudaStream_t stream)
764         {
765             dim3 block(32, 8);
766             dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
767 
768             cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
769 
770             updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb><<<grid, block, 0, stream>>>(
771                 prevFrame.cols, prevFrame.rows,
772                 prevFrame, curFrame,
773                 Ftd, Fbd, foreground, background,
774                 deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T);
775             cudaSafeCall( cudaGetLastError() );
776 
777             if (stream == 0)
778                 cudaSafeCall( cudaDeviceSynchronize() );
779         }
780     };
781 
782     template <typename PT, typename CT, typename OT>
updateBackgroundModel_gpu(PtrStepSzb prevFrame,PtrStepSzb curFrame,PtrStepSzb Ftd,PtrStepSzb Fbd,PtrStepSzb foreground,PtrStepSzb background,int deltaC,int deltaCC,float alpha1,float alpha2,float alpha3,int N1c,int N1cc,int N2c,int N2cc,float T,cudaStream_t stream)783     void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background,
784                                    int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
785                                    cudaStream_t stream)
786     {
787         UpdateBackgroundModel<PT, CT, OT>::call(PtrStepSz<PT>(prevFrame), PtrStepSz<CT>(curFrame), Ftd, Fbd, foreground, PtrStepSz<OT>(background),
788                                                 deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
789     }
790 
791     template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
792     template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
793     template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
794     template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
795     template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
796     template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
797     template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
798     template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
799 }
800 
801 #endif /* CUDA_DISABLER */
802