1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistributions of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistributions in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 ////////////////////////////////////////////////////////////////////////////////
44 //
45 // NVIDIA CUDA implementation of the Brox et al. optical flow algorithm
46 //
47 // Algorithm is explained in the original paper:
48 //      T. Brox, A. Bruhn, N. Papenberg, J. Weickert:
49 //      High accuracy optical flow estimation based on a theory for warping.
50 //      ECCV 2004.
51 //
52 // Implementation by Mikhail Smirnov
53 // email: msmirnov@nvidia.com, devsupport@nvidia.com
54 //
55 // Credits for help with the code to:
56 // Alexey Mendelenko, Anton Obukhov, and Alexander Kharlamov.
57 //
58 ////////////////////////////////////////////////////////////////////////////////
59 
60 #include <iostream>
61 #include <vector>
62 #include <memory>
63 
64 #include "opencv2/core/cuda/utility.hpp"
65 
66 #include "opencv2/cudalegacy/NPP_staging.hpp"
67 #include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
68 
69 
70 typedef NCVVectorAlloc<Ncv32f> FloatVector;
71 
72 /////////////////////////////////////////////////////////////////////////////////////////
73 // Implementation specific constants
74 /////////////////////////////////////////////////////////////////////////////////////////
75 __device__ const float eps2 = 1e-6f;
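// Note: eps2 is the regularization constant added under the square roots below,
// so the robust penalty derivative psi'(s^2) = 1 / (2 * sqrt(s^2 + eps2)),
// used for both the data term and the smoothness diffusivities, stays finite
// when its argument approaches zero.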
76 
77 /////////////////////////////////////////////////////////////////////////////////////////
78 // Additional defines
79 /////////////////////////////////////////////////////////////////////////////////////////
80 
81 // rounded up division
82 inline int iDivUp(int a, int b)
83 {
84     return (a + b - 1)/b;
85 }
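// Example: iDivUp(1000, 256) == 4, i.e. four 256-thread blocks cover 1000 elements.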
86 
87 /////////////////////////////////////////////////////////////////////////////////////////
88 // Texture references
89 /////////////////////////////////////////////////////////////////////////////////////////
90 
91 texture<float, 2, cudaReadModeElementType> tex_coarse;
92 texture<float, 2, cudaReadModeElementType> tex_fine;
93 
94 texture<float, 2, cudaReadModeElementType> tex_I1;
95 texture<float, 2, cudaReadModeElementType> tex_I0;
96 
97 texture<float, 2, cudaReadModeElementType> tex_Ix;
98 texture<float, 2, cudaReadModeElementType> tex_Ixx;
99 texture<float, 2, cudaReadModeElementType> tex_Ix0;
100 
101 texture<float, 2, cudaReadModeElementType> tex_Iy;
102 texture<float, 2, cudaReadModeElementType> tex_Iyy;
103 texture<float, 2, cudaReadModeElementType> tex_Iy0;
104 
105 texture<float, 2, cudaReadModeElementType> tex_Ixy;
106 
107 texture<float, 1, cudaReadModeElementType> tex_u;
108 texture<float, 1, cudaReadModeElementType> tex_v;
109 texture<float, 1, cudaReadModeElementType> tex_du;
110 texture<float, 1, cudaReadModeElementType> tex_dv;
111 texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
112 texture<float, 1, cudaReadModeElementType> tex_numerator_u;
113 texture<float, 1, cudaReadModeElementType> tex_numerator_v;
114 texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
115 texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
116 texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
117 texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
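// The 2D textures above hold the two frames, their derivatives and the
// resampling sources; initTexture2D() below configures them with normalized
// coordinates, mirror addressing and bilinear filtering, so kernels can sample
// them at warped sub-pixel positions. The 1D textures are plain element-indexed
// views over the flow fields and the solver coefficient buffers
// (see initTexture1D()).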
118 
119 
120 /////////////////////////////////////////////////////////////////////////////////////////
121 // SUPPLEMENTARY FUNCTIONS
122 /////////////////////////////////////////////////////////////////////////////////////////
123 
124 ///////////////////////////////////////////////////////////////////////////////
125 /// \brief performs pointwise summation of two vectors stored in device memory
126 /// \param d_res    - pointer to resulting vector (device memory)
127 /// \param d_op1    - term #1 (device memory)
128 /// \param d_op2    - term #2 (device memory)
129 /// \param len    - vector size
130 ///////////////////////////////////////////////////////////////////////////////
131 __global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
132 {
133     const int pos = blockIdx.x*blockDim.x + threadIdx.x;
134 
135     if(pos >= len) return;
136 
137     d_res[pos] = d_op1[pos] + d_op2[pos];
138 }
139 
140 ///////////////////////////////////////////////////////////////////////////////
141 /// \brief wrapper for summation kernel.
142 ///  Computes \b op1 + \b op2 and stores result to \b res
143 /// \param res   array, containing op1 + op2 (device memory)
144 /// \param op1   term #1 (device memory)
145 /// \param op2   term #2 (device memory)
146 /// \param count vector size
147 ///////////////////////////////////////////////////////////////////////////////
148 static void add(float *res, const float *op1, const float *op2, const int count, cudaStream_t stream)
149 {
150     dim3 threads(256);
151     dim3 blocks(iDivUp(count, threads.x));
152 
153     pointwise_add<<<blocks, threads, 0, stream>>>(res, op1, op2, count);
154 }
155 
156 ///////////////////////////////////////////////////////////////////////////////
157 /// \brief wrapper for summation kernel.
158 /// Increments \b res by \b rhs
159 /// \param res   initial vector, will be replaced with result (device memory)
160 /// \param rhs   increment (device memory)
161 /// \param count vector size
162 ///////////////////////////////////////////////////////////////////////////////
163 static void add(float *res, const float *rhs, const int count, cudaStream_t stream)
164 {
165     add(res, res, rhs, count, stream);
166 }
167 
168 ///////////////////////////////////////////////////////////////////////////////
169 /// \brief kernel for scaling vector by scalar
170 /// \param d_res  scaled vector (device memory)
171 /// \param d_src  source vector (device memory)
172 /// \param scale  scalar to scale by
173 /// \param len    vector size (number of elements)
174 ///////////////////////////////////////////////////////////////////////////////
175 __global__ void scaleVector(float *d_res, const float *d_src, float scale, const int len)
176 {
177     const int pos = blockIdx.x * blockDim.x + threadIdx.x;
178 
179     if (pos >= len) return;
180 
181     d_res[pos] = d_src[pos] * scale;
182 }
183 
184 ///////////////////////////////////////////////////////////////////////////////
185 /// \brief scale vector by scalar
186 ///
187 /// kernel wrapper
188 /// \param d_res  scaled vector (device memory)
189 /// \param d_src  source vector (device memory)
190 /// \param scale  scalar to scale by
191 /// \param len    vector size (number of elements)
192 /// \param stream CUDA stream
193 ///////////////////////////////////////////////////////////////////////////////
194 static void ScaleVector(float *d_res, const float *d_src, float scale, const int len, cudaStream_t stream)
195 {
196     dim3 threads(256);
197     dim3 blocks(iDivUp(len, threads.x));
198 
199     scaleVector<<<blocks, threads, 0, stream>>>(d_res, d_src, scale, len);
200 }
201 
202 const int SOR_TILE_WIDTH = 32;
203 const int SOR_TILE_HEIGHT = 6;
204 const int PSOR_TILE_WIDTH = 32;
205 const int PSOR_TILE_HEIGHT = 6;
206 const int PSOR_PITCH = PSOR_TILE_WIDTH + 4;
207 const int PSOR_HEIGHT = PSOR_TILE_HEIGHT + 4;
208 
209 ///////////////////////////////////////////////////////////////////////////////
210 ///\brief Utility function. Compute smooth term diffusivity along x axis
211 ///\param s (out) pointer to memory location for result (diffusivity)
212 ///\param pos (in) position within shared memory array containing \b u
213 ///\param u (in) shared memory array containing \b u
214 ///\param v (in) shared memory array containing \b v
215 ///\param du (in) shared memory array containing \b du
216 ///\param dv (in) shared memory array containing \b dv
217 ///////////////////////////////////////////////////////////////////////////////
218 __forceinline__ __device__ void diffusivity_along_x(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
219 {
220     //x derivative between pixels (i,j) and (i-1,j)
221     const int left = pos-1;
222     float u_x = u[pos] + du[pos] - u[left] - du[left];
223     float v_x = v[pos] + dv[pos] - v[left] - dv[left];
224     const int up        = pos + PSOR_PITCH;
225     const int down      = pos - PSOR_PITCH;
226     const int up_left   = up - 1;
227     const int down_left = down-1;
228     //y derivative between pixels (i,j) and (i-1,j)
229     float u_y = 0.25f*(u[up] + du[up] + u[up_left] + du[up_left] - u[down] - du[down] - u[down_left] - du[down_left]);
230     float v_y = 0.25f*(v[up] + dv[up] + v[up_left] + dv[up_left] - v[down] - dv[down] - v[down_left] - dv[down_left]);
231     *s = 0.5f / sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
232 }
233 
234 ///////////////////////////////////////////////////////////////////////////////
235 ///\brief Utility function. Compute smooth term diffusivity along y axis
236 ///\param s (out) pointer to memory location for result (diffusivity)
237 ///\param pos (in) position within shared memory array containing \b u
238 ///\param u (in) shared memory array containing \b u
239 ///\param v (in) shared memory array containing \b v
240 ///\param du (in) shared memory array containing \b du
241 ///\param dv (in) shared memory array containing \b dv
242 ///////////////////////////////////////////////////////////////////////////////
243 __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
244 {
245     //y derivative between pixels (i,j) and (i,j-1)
246     const int down = pos-PSOR_PITCH;
247     float u_y = u[pos] + du[pos] - u[down] - du[down];
248     float v_y = v[pos] + dv[pos] - v[down] - dv[down];
249     const int right      = pos + 1;
250     const int left       = pos - 1;
251     const int down_right = down + 1;
252     const int down_left  = down - 1;
253     //x derivative between pixels (i,j) and (i,j-1);
254     float u_x = 0.25f*(u[right] + u[down_right] + du[right] + du[down_right] - u[left] - u[down_left] - du[left] - du[down_left]);
255     float v_x = 0.25f*(v[right] + v[down_right] + dv[right] + dv[down_right] - v[left] - v[down_left] - dv[left] - dv[down_left]);
256     *s = 0.5f/sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
257 }
258 
259 ///////////////////////////////////////////////////////////////////////////////
260 ///\brief Utility function. Load element of 2D global memory to shared memory
261 ///\param smem pointer to shared memory array
262 ///\param is shared memory array column
263 ///\param js shared memory array row
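///\param i global memory array column
///\param j global memory array row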
264 ///\param w number of columns in global memory array
265 ///\param h number of rows in global memory array
266 ///\param p global memory array pitch in floats
267 ///////////////////////////////////////////////////////////////////////////////
268 template<int tex_id>
269 __forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
270 {
271     //position within shared memory array
272     const int ijs = js * PSOR_PITCH + is;
273     //mirror reflection across borders
274     i = max(i, -i-1);
275     i = min(i, w-i+w-1);
276     j = max(j, -j-1);
277     j = min(j, h-j+h-1);
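    // e.g. i = -1 maps to 0, i = -2 maps to 1, i = w maps to w-1, i = w+1 maps to w-2,
    // so out-of-range reads are reflected back into [0, w) x [0, h).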
278     const int pos = j * p + i;
279     switch(tex_id){
280         case 0:
281             smem[ijs] = tex1Dfetch(tex_u, pos);
282             break;
283         case 1:
284             smem[ijs] = tex1Dfetch(tex_v, pos);
285             break;
286         case 2:
287             smem[ijs] = tex1Dfetch(tex_du, pos);
288             break;
289         case 3:
290             smem[ijs] = tex1Dfetch(tex_dv, pos);
291             break;
292     }
293 }
294 
295 ///////////////////////////////////////////////////////////////////////////////
296 ///\brief Utility function. Load part (tile) of 2D global memory to shared memory
297 ///\param smem pointer to target shared memory array
298 ///\param ig column number within source
299 ///\param jg row number within source
300 ///\param w number of columns in global memory array
301 ///\param h number of rows in global memory array
302 ///\param p global memory array pitch in floats
303 ///////////////////////////////////////////////////////////////////////////////
304 template<int tex>
305 __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
306 {
307     const int i = threadIdx.x + 2;
308     const int j = threadIdx.y + 2;
309     load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
310     __syncthreads();
311     if(threadIdx.y < 2)
312     {
313         //load bottom shadow elements
314         load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
315         if(threadIdx.x < 2)
316         {
317             //load bottom right shadow elements
318             load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
319             //load middle right shadow elements
320             load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
321         }
322         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
323         {
324             //load bottom left shadow elements
325             load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
326             //load middle left shadow elements
327             load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
328         }
329     }
330     else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
331     {
332         //load upper shadow elements
333         load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
334         if(threadIdx.x < 2)
335         {
336             //load upper right shadow elements
337             load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
338             //load middle right shadow elements
339             load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
340         }
341         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
342         {
343             //load upper left shadow elements
344             load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
345             //load middle left shadow elements
346             load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
347         }
348     }
349     else
350     {
351         //load middle shadow elements
352         if(threadIdx.x < 2)
353         {
354             //load middle right shadow elements
355             load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
356         }
357         else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
358         {
359             //load middle left shadow elements
360             load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
361         }
362     }
363     __syncthreads();
364 }
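// In total each block stages a (PSOR_TILE_WIDTH + 4) x (PSOR_TILE_HEIGHT + 4)
// window (i.e. PSOR_PITCH x PSOR_HEIGHT elements): every thread loads its own
// pixel at offset (+2, +2), and the threads nearest the tile borders also fill
// the two-pixel halo needed by the diffusivity stencils.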
365 
366 ///////////////////////////////////////////////////////////////////////////////
367 /// \brief computes matrix of linearised system for \c du, \c dv
368 /// Computed values reside in GPU memory. \n
369 /// Matrix computation is divided into two steps. This kernel performs the first step\n
370 /// - compute smoothness term diffusivity between pixels - psi dash smooth
371 /// - compute robustness factor in the data term - psi dash data
372 /// \param diffusivity_x (in/out) diffusivity between pixels along x axis in smoothness term
373 /// \param diffusivity_y (in/out) diffusivity between pixels along y axis in smoothness term
374 /// \param denominator_u (in/out) precomputed part of expression for new du value in SOR iteration
375 /// \param denominator_v (in/out) precomputed part of expression for new dv value in SOR iteration
376 /// \param numerator_dudv (in/out) precomputed part of expression for new du and dv value in SOR iteration
377 /// \param numerator_u (in/out) precomputed part of expression for new du value in SOR iteration
378 /// \param numerator_v (in/out) precomputed part of expression for new dv value in SOR iteration
379 /// \param w (in) frame width
380 /// \param h (in) frame height
381 /// \param pitch (in) pitch in floats
382 /// \param alpha (in) alpha in Brox model (flow smoothness)
383 /// \param gamma (in) gamma in Brox model (edge importance)
384 ///////////////////////////////////////////////////////////////////////////////
385 
386 __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
387                                                         float *denominator_u, float *denominator_v,
388                                                         float *numerator_dudv,
389                                                         float *numerator_u, float *numerator_v,
390                                                         int w, int h, int s,
391                                                         float alpha, float gamma)
392 {
393     __shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
394     __shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
395     __shared__ float du[PSOR_PITCH * PSOR_HEIGHT];
396     __shared__ float dv[PSOR_PITCH * PSOR_HEIGHT];
397 
398     //position within tile
399     const int i = threadIdx.x;
400     const int j = threadIdx.y;
401     //position within smem arrays
402     const int ijs = (j+2) * PSOR_PITCH + i + 2;
403     //position within global memory
404     const int ig  = blockIdx.x * blockDim.x + threadIdx.x;
405     const int jg  = blockIdx.y * blockDim.y + threadIdx.y;
406     const int ijg = jg * s + ig;
407     //position within texture
408     float x = (float)ig + 0.5f;
409     float y = (float)jg + 0.5f;
410     //load u, v, du and dv to smem
411     load_array<0>(u, ig, jg, w, h, s);
412     load_array<1>(v, ig, jg, w, h, s);
413     load_array<2>(du, ig, jg, w, h, s);
414     load_array<3>(dv, ig, jg, w, h, s);
415     //warped position
416     float wx = (x + u[ijs])/(float)w;
417     float wy = (y + v[ijs])/(float)h;
418     x /= (float)w;
419     y /= (float)h;
420     //compute image derivatives
421     const float Iz  = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y);
422     const float Ix  = tex2D(tex_Ix, wx, wy);
423     const float Ixz = Ix - tex2D(tex_Ix0, x, y);
424     const float Ixy = tex2D(tex_Ixy, wx, wy);
425     const float Ixx = tex2D(tex_Ixx, wx, wy);
426     const float Iy  = tex2D(tex_Iy, wx, wy);
427     const float Iyz = Iy - tex2D(tex_Iy0, x, y);
428     const float Iyy = tex2D(tex_Iyy, wx, wy);
429     //compute data term
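    // q0 is the linearized brightness-constancy residual I_z + I_x*du + I_y*dv;
    // q1 and q2 are the corresponding gradient-constancy residuals for I_x and I_y.
    // The robust factor below is psi'(q0^2 + gamma*(q1^2 + q2^2)) = 1/(2*sqrt(... + eps2)),
    // subsequently divided by alpha (the data term is scaled by 1/alpha rather than
    // scaling the smoothness term by alpha).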
430     float q0, q1, q2;
431     q0 = Iz  + Ix  * du[ijs] + Iy  * dv[ijs];
432     q1 = Ixz + Ixx * du[ijs] + Ixy * dv[ijs];
433     q2 = Iyz + Ixy * du[ijs] + Iyy * dv[ijs];
434     float data_term = 0.5f * rsqrtf(q0*q0 + gamma*(q1*q1 + q2*q2) + eps2);
435     //scale data term by 1/alpha
436     data_term /= alpha;
437     //compute smoothness term (diffusivity)
438     float sx, sy;
439 
440     if(ig >= w || jg >= h) return;
441 
442     diffusivity_along_x(&sx, ijs, u, v, du, dv);
443     diffusivity_along_y(&sy, ijs, u, v, du, dv);
444 
445     if(ig == 0) sx = 0.0f;
446     if(jg == 0) sy = 0.0f;
447 
448     numerator_dudv[ijg] = data_term * (Ix*Iy + gamma * Ixy*(Ixx + Iyy));
449     numerator_u[ijg]    = data_term * (Ix*Iz + gamma * (Ixx*Ixz + Ixy*Iyz));
450     numerator_v[ijg]    = data_term * (Iy*Iz + gamma * (Iyy*Iyz + Ixy*Ixz));
451     denominator_u[ijg]  = data_term * (Ix*Ix + gamma * (Ixy*Ixy + Ixx*Ixx));
452     denominator_v[ijg]  = data_term * (Iy*Iy + gamma * (Ixy*Ixy + Iyy*Iyy));
453     diffusivity_x[ijg]  = sx;
454     diffusivity_y[ijg]  = sy;
455 }
456 
457 ///////////////////////////////////////////////////////////////////////////////
458 ///\brief computes matrix of linearised system for \c du, \c dv (second step: add the smoothness weights to the denominators and invert them)
459 ///\param inv_denominator_u (in/out) data term denominator for du; overwritten with its reciprocal including the smoothness weights
460 ///\param inv_denominator_v (in/out) data term denominator for dv; overwritten with its reciprocal including the smoothness weights
461 ///\param w frame width
462 ///\param h frame height
463 ///\param s pitch in floats
464 ///////////////////////////////////////////////////////////////////////////////
465 __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v,
466                                     int w, int h, int s)
467 {
468     __shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
469     __shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
470     //position within tile
471     const int i = threadIdx.x;
472     const int j = threadIdx.y;
473     //position within smem arrays
474     const int ijs = j*(PSOR_TILE_WIDTH+1) + i;
475     //position within global memory
476     const int ig  = blockIdx.x * blockDim.x + threadIdx.x;
477     const int jg  = blockIdx.y * blockDim.y + threadIdx.y;
478     const int ijg = jg*s + ig;
479     int inside = ig < w && jg < h;
480     float denom_u;
481     float denom_v;
482     if(inside)
483     {
484         denom_u = inv_denominator_u[ijg];
485         denom_v = inv_denominator_v[ijg];
486     }
487     if(inside)
488     {
489         sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg);
490         sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg);
491     }
492     else
493     {
494         sx[ijs] = 0.0f;
495         sy[ijs] = 0.0f;
496     }
497     int up = ijs+PSOR_TILE_WIDTH+1;
498     if(j == PSOR_TILE_HEIGHT-1)
499     {
500         if(jg < h-1 && inside)
501         {
502             sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
503         }
504         else
505         {
506             sy[up] = 0.0f;
507         }
508     }
509     int right = ijs + 1;
510     if(threadIdx.x == PSOR_TILE_WIDTH-1)
511     {
512         if(ig < w-1 && inside)
513         {
514             sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
515         }
516         else
517         {
518             sx[right] = 0.0f;
519         }
520     }
521     __syncthreads();
522     float diffusivity_sum;
523     diffusivity_sum = sx[ijs] + sx[ijs+1] + sy[ijs] + sy[ijs+PSOR_TILE_WIDTH+1];
524     if(inside)
525     {
526         denom_u += diffusivity_sum;
527         denom_v += diffusivity_sum;
528         inv_denominator_u[ijg] = 1.0f/denom_u;
529         inv_denominator_v[ijg] = 1.0f/denom_v;
530     }
531 }
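// Stage 2 gathers, for every pixel, the four neighbouring smoothness weights
// (the extra shared-memory column/row is filled by the boundary threads above),
// adds them to the data-term denominators and stores the reciprocal, so the SOR
// kernel can multiply by 1/denominator instead of dividing.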
532 
533 /////////////////////////////////////////////////////////////////////////////////////////
534 // Red-Black SOR
535 /////////////////////////////////////////////////////////////////////////////////////////
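// Pixels are partitioned by the parity of (i + j): sor_pass<0> updates one
// colour while reading only values of the other colour (plus its own old value),
// then sor_pass<1> updates the complementary colour. This gives a Gauss-Seidel
// style sweep that is safe to run in parallel; omega is the over-relaxation
// factor supplied by the host (1.99 in the solver loop below).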
536 
537 template<int isBlack> __global__ void sor_pass(float *new_du,
538                                                float *new_dv,
539                                                const float *g_inv_denominator_u,
540                                                const float *g_inv_denominator_v,
541                                                const float *g_numerator_u,
542                                                const float *g_numerator_v,
543                                                const float *g_numerator_dudv,
544                                                float omega,
545                                                int width,
546                                                int height,
547                                                int stride)
548 {
549     int i = blockIdx.x * blockDim.x + threadIdx.x;
550     int j = blockIdx.y * blockDim.y + threadIdx.y;
551 
552     if(i >= width || j >= height)
553         return;
554 
555     const int pos = j * stride + i;
556     const int pos_r = i < width - 1 ? pos + 1 : pos;
557     const int pos_u = j < height - 1 ? pos + stride : pos;
558     const int pos_d = j > 0 ? pos - stride : pos;
559     const int pos_l = i > 0 ? pos - 1 : pos;
560 
561     //load smooth term
562     float s_up, s_left, s_right, s_down;
563     s_left = tex1Dfetch(tex_diffusivity_x, pos);
564     s_down = tex1Dfetch(tex_diffusivity_y, pos);
565     if(i < width-1)
566         s_right = tex1Dfetch(tex_diffusivity_x, pos_r);
567     else
568         s_right = 0.0f; //Neumann BC
569     if(j < height-1)
570         s_up = tex1Dfetch(tex_diffusivity_y, pos_u);
571     else
572         s_up = 0.0f; //Neumann BC
573 
574     //load u, v and du, dv
575     float u_up, u_left, u_right, u_down, u;
576     float v_up, v_left, v_right, v_down, v;
577     float du_up, du_left, du_right, du_down, du;
578     float dv_up, dv_left, dv_right, dv_down, dv;
579 
580     u_left  = tex1Dfetch(tex_u, pos_l);
581     u_right = tex1Dfetch(tex_u, pos_r);
582     u_down  = tex1Dfetch(tex_u, pos_d);
583     u_up    = tex1Dfetch(tex_u, pos_u);
584     u       = tex1Dfetch(tex_u, pos);
585 
586     v_left  = tex1Dfetch(tex_v, pos_l);
587     v_right = tex1Dfetch(tex_v, pos_r);
588     v_down  = tex1Dfetch(tex_v, pos_d);
589     v       = tex1Dfetch(tex_v, pos);
590     v_up    = tex1Dfetch(tex_v, pos_u);
591 
592     du       = tex1Dfetch(tex_du, pos);
593     du_left  = tex1Dfetch(tex_du, pos_l);
594     du_right = tex1Dfetch(tex_du, pos_r);
595     du_down  = tex1Dfetch(tex_du, pos_d);
596     du_up    = tex1Dfetch(tex_du, pos_u);
597 
598     dv       = tex1Dfetch(tex_dv, pos);
599     dv_left  = tex1Dfetch(tex_dv, pos_l);
600     dv_right = tex1Dfetch(tex_dv, pos_r);
601     dv_down  = tex1Dfetch(tex_dv, pos_d);
602     dv_up    = tex1Dfetch(tex_dv, pos_u);
603 
604     float numerator_dudv    = g_numerator_dudv[pos];
605 
606     if((i+j)%2 == isBlack)
607     {
608         // update du
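        // du <- (1 - omega) * du + omega * inv_denom_u *
        //       ( sum_n s_n*(u_n + du_n) - u * sum_n s_n - numerator_u - numerator_dudv * dv )
        // where n runs over the four neighbours and s_n are the smoothness weights.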
609         float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
610                              u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);
611 
612         du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;
613 
614         // update dv
615         float numerator_v = (s_left*(v_left + dv_left) + s_up*(v_up + dv_up) + s_right*(v_right + dv_right) + s_down*(v_down + dv_down) -
616                              v * (s_left + s_right + s_up + s_down) - g_numerator_v[pos] - numerator_dudv*du);
617 
618         dv = (1.0f - omega) * dv + omega * g_inv_denominator_v[pos] * numerator_v;
619     }
620     new_du[pos] = du;
621     new_dv[pos] = dv;
622 }
623 
624 ///////////////////////////////////////////////////////////////////////////////
625 // utility functions
626 ///////////////////////////////////////////////////////////////////////////////
627 
628 void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
629 {
630     tex.addressMode[0] = cudaAddressModeClamp;
631     tex.filterMode = cudaFilterModePoint;
632     tex.normalized = false;
633 }
634 
635 void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
636 {
637     tex.addressMode[0] = cudaAddressModeMirror;
638     tex.addressMode[1] = cudaAddressModeMirror;
639     tex.filterMode = cudaFilterModeLinear;
640     tex.normalized = true;
641 }
642 
643 void InitTextures()
644 {
645     initTexture2D(tex_I0);
646     initTexture2D(tex_I1);
647     initTexture2D(tex_fine);      // for downsampling
648     initTexture2D(tex_coarse);    // for prolongation
649 
650     initTexture2D(tex_Ix);
651     initTexture2D(tex_Ixx);
652     initTexture2D(tex_Ix0);
653 
654     initTexture2D(tex_Iy);
655     initTexture2D(tex_Iyy);
656     initTexture2D(tex_Iy0);
657 
658     initTexture2D(tex_Ixy);
659 
660     initTexture1D(tex_u);
661     initTexture1D(tex_v);
662     initTexture1D(tex_du);
663     initTexture1D(tex_dv);
664     initTexture1D(tex_diffusivity_x);
665     initTexture1D(tex_diffusivity_y);
666     initTexture1D(tex_inv_denominator_u);
667     initTexture1D(tex_inv_denominator_v);
668     initTexture1D(tex_numerator_dudv);
669     initTexture1D(tex_numerator_u);
670     initTexture1D(tex_numerator_v);
671 }
672 
673 namespace
674 {
675     struct ImagePyramid
676     {
677         std::vector<FloatVector*> img0;
678         std::vector<FloatVector*> img1;
679 
680         std::vector<Ncv32u> w;
681         std::vector<Ncv32u> h;
682 
683         explicit ImagePyramid(int outer_iterations)
684         {
685             img0.reserve(outer_iterations);
686             img1.reserve(outer_iterations);
687 
688             w.reserve(outer_iterations);
689             h.reserve(outer_iterations);
690         }
691 
692         ~ImagePyramid()
693         {
694             w.clear();
695             h.clear();
696 
697             for (int i = static_cast<int>(img0.size()) - 1; i >= 0; --i)
698             {
699                 delete img1[i];
700                 delete img0[i];
701             }
702 
703             img0.clear();
704             img1.clear();
705         }
706     };
707 }
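// ImagePyramid owns the per-level frame buffers through raw pointers released
// from the smart pointers below; the destructor frees them in reverse order of
// construction (presumably what the NCV stack-style allocators expect, though
// that is an assumption rather than something stated in this file).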
708 
709 /////////////////////////////////////////////////////////////////////////////////////////
710 // MAIN FUNCTION
711 /////////////////////////////////////////////////////////////////////////////////////////
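// A minimal usage sketch (illustrative only: the allocator and matrix setup are
// assumed to follow the usual NCV conventions, and the parameter values are
// arbitrary, not recommendations):
//
//     NCVBroxOpticalFlowDescriptor desc;
//     desc.alpha = 0.197f;                       // flow smoothness weight
//     desc.gamma = 50.0f;                        // gradient constancy weight
//     desc.scale_factor = 0.8f;                  // pyramid downscale per level
//     desc.number_of_outer_iterations  = 77;     // warping iterations (also caps pyramid depth)
//     desc.number_of_inner_iterations  = 10;     // lagged nonlinearity iterations
//     desc.number_of_solver_iterations = 10;     // red-black SOR sweeps
//
//     NCVStatus status = NCVBroxOpticalFlow(desc, allocator, frame0, frame1, u, v, stream);
//
// The function builds a scale pyramid of both frames and, from coarse to fine,
// alternates warping (outer), lagged-nonlinearity (inner) and red-black SOR
// (solver) iterations, finally copying the flow into uOut / vOut.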
712 NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
713                              INCVMemAllocator &gpu_mem_allocator,
714                              const NCVMatrix<Ncv32f> &frame0,
715                              const NCVMatrix<Ncv32f> &frame1,
716                              NCVMatrix<Ncv32f> &uOut,
717                              NCVMatrix<Ncv32f> &vOut,
718                              cudaStream_t stream)
719 {
720     ncvAssertPrintReturn(desc.alpha > 0.0f                   , "Invalid alpha"                      , NCV_INCONSISTENT_INPUT);
721     ncvAssertPrintReturn(desc.gamma >= 0.0f                  , "Invalid gamma"                      , NCV_INCONSISTENT_INPUT);
722     ncvAssertPrintReturn(desc.number_of_inner_iterations > 0 , "Invalid number of inner iterations" , NCV_INCONSISTENT_INPUT);
723     ncvAssertPrintReturn(desc.number_of_outer_iterations > 0 , "Invalid number of outer iterations" , NCV_INCONSISTENT_INPUT);
724     ncvAssertPrintReturn(desc.number_of_solver_iterations > 0, "Invalid number of solver iterations", NCV_INCONSISTENT_INPUT);
725 
726     const Ncv32u kSourceWidth  = frame0.width();
727     const Ncv32u kSourceHeight = frame0.height();
728 
729     ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
730     ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
731         uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);
732 
733     ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
734 
735     bool kSkipProcessing = gpu_mem_allocator.isCounting();
736 
737     int cuda_device;
738     ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR);
739 
740     cudaDeviceProp device_props;
741     ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR);
742 
743     Ncv32u alignmentValue = gpu_mem_allocator.alignment ();
744 
745     const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float);
746     const Ncv32u kSourcePitch = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
747 
748     const Ncv32f scale_factor = desc.scale_factor;
749     const Ncv32f alpha = desc.alpha;
750     const Ncv32f gamma = desc.gamma;
751 
752     const Ncv32u kSizeInPixelsAligned = alignUp(kSourceWidth, kStrideAlignmentFloat)*kSourceHeight;
753 
754 #if defined SAFE_VECTOR_DECL
755 #undef SAFE_VECTOR_DECL
756 #endif
757 #define SAFE_VECTOR_DECL(name, allocator, size) \
758     FloatVector name((allocator), (size)); \
759     ncvAssertReturn(name.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
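// SAFE_VECTOR_DECL declares a device float buffer through the NCV allocator and
// bails out with NCV_ALLOCATOR_BAD_ALLOC if the allocation failed. When the
// allocator is only counting (kSkipProcessing), the declarations presumably still
// run so that the required amount of device memory is tallied.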
760 
761     // matrix elements
762     SAFE_VECTOR_DECL(diffusivity_x,  gpu_mem_allocator, kSizeInPixelsAligned);
763     SAFE_VECTOR_DECL(diffusivity_y,  gpu_mem_allocator, kSizeInPixelsAligned);
764     SAFE_VECTOR_DECL(denom_u,  gpu_mem_allocator, kSizeInPixelsAligned);
765     SAFE_VECTOR_DECL(denom_v,  gpu_mem_allocator, kSizeInPixelsAligned);
766     SAFE_VECTOR_DECL(num_dudv, gpu_mem_allocator, kSizeInPixelsAligned);
767     SAFE_VECTOR_DECL(num_u,    gpu_mem_allocator, kSizeInPixelsAligned);
768     SAFE_VECTOR_DECL(num_v,    gpu_mem_allocator, kSizeInPixelsAligned);
769 
770     // flow components
771     SAFE_VECTOR_DECL(u, gpu_mem_allocator, kSizeInPixelsAligned);
772     SAFE_VECTOR_DECL(v, gpu_mem_allocator, kSizeInPixelsAligned);
773 
774     SAFE_VECTOR_DECL(u_new, gpu_mem_allocator, kSizeInPixelsAligned);
775     SAFE_VECTOR_DECL(v_new, gpu_mem_allocator, kSizeInPixelsAligned);
776 
777     // flow increments
778     SAFE_VECTOR_DECL(du, gpu_mem_allocator, kSizeInPixelsAligned);
779     SAFE_VECTOR_DECL(dv, gpu_mem_allocator, kSizeInPixelsAligned);
780 
781     SAFE_VECTOR_DECL(du_new, gpu_mem_allocator, kSizeInPixelsAligned);
782     SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);
783 
784     // temporary storage
785     SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
786         alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));
787 
788     // image derivatives
789     SAFE_VECTOR_DECL(Ix,  gpu_mem_allocator, kSizeInPixelsAligned);
790     SAFE_VECTOR_DECL(Ixx, gpu_mem_allocator, kSizeInPixelsAligned);
791     SAFE_VECTOR_DECL(Ix0, gpu_mem_allocator, kSizeInPixelsAligned);
792     SAFE_VECTOR_DECL(Iy,  gpu_mem_allocator, kSizeInPixelsAligned);
793     SAFE_VECTOR_DECL(Iyy, gpu_mem_allocator, kSizeInPixelsAligned);
794     SAFE_VECTOR_DECL(Iy0, gpu_mem_allocator, kSizeInPixelsAligned);
795     SAFE_VECTOR_DECL(Ixy, gpu_mem_allocator, kSizeInPixelsAligned);
796 
797     // spatial derivative filter size
798     const int kDFilterSize = 5;
799     SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize);
800 
801     if (!kSkipProcessing)
802     {
803         const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};
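        // {1, -8, 0, 8, -1} together with the 1/12 multiplier passed to the NPP
        // row/column filters below is the standard five-tap, fourth-order
        // central-difference derivative stencil.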
804 
805         ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
806             cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
807 
808         InitTextures();
809     }
810 
811     //prepare image pyramid
812     ImagePyramid pyr(desc.number_of_outer_iterations);
813 
814     cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
815 
816     float scale = 1.0f;
817 
818     //cuda arrays for frames
819     std::unique_ptr<FloatVector> pI0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
820     ncvAssertReturn(pI0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
821 
822     std::unique_ptr<FloatVector> pI1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
823     ncvAssertReturn(pI1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
824 
825     if (!kSkipProcessing)
826     {
827         //copy frame data to device
828         size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
829         size_t src_width_in_bytes = kSourceWidth * sizeof(float);
830         size_t src_pitch_in_bytes = frame0.pitch();
831 
832         ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(),
833             src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
834 
835         ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(),
836             src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
837     }
838 
839     FloatVector* I0 = pI0.release();
840     FloatVector* I1 = pI1.release();
841 
842     //prepare pyramid
843     pyr.img0.push_back(I0);
844     pyr.img1.push_back(I1);
845 
846     pyr.w.push_back(kSourceWidth);
847     pyr.h.push_back(kSourceHeight);
848 
849     scale *= scale_factor;
850 
851     Ncv32u prev_level_width  = kSourceWidth;
852     Ncv32u prev_level_height = kSourceHeight;
853     while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(pyr.img0.size()) < desc.number_of_outer_iterations))
854     {
855         //current resolution
856         Ncv32u level_width  = static_cast<Ncv32u>(ceilf(kSourceWidth  * scale));
857         Ncv32u level_height = static_cast<Ncv32u>(ceilf(kSourceHeight * scale));
858 
859         Ncv32u level_width_aligned  = alignUp(level_width,  kStrideAlignmentFloat);
860 
861         Ncv32u buffer_size = alignUp(level_width, kStrideAlignmentFloat) * level_height; // buffer size in floats
862 
863         Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float);
864 
865         std::unique_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
866         ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
867 
868         std::unique_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
869         ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
870 
871         if (!kSkipProcessing)
872         {
873             ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
874 
875             NcvSize32u srcSize (prev_level_width, prev_level_height);
876             NcvSize32u dstSize (level_width, level_height);
877             NcvRect32u srcROI (0, 0, prev_level_width, prev_level_height);
878             NcvRect32u dstROI (0, 0, level_width, level_height);
879 
880             // frame 0
881             ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
882                 level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
883 
884             // frame 1
885             ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
886                 level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
887         }
888 
889         I0 = level_frame0.release();
890         I1 = level_frame1.release();
891 
892         //store pointers
893         pyr.img0.push_back(I0);
894         pyr.img1.push_back(I1);
895 
896         pyr.w.push_back(level_width);
897         pyr.h.push_back(level_height);
898 
899         scale *= scale_factor;
900 
901         prev_level_width  = level_width;
902         prev_level_height = level_height;
903     }
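    // Levels are added while both dimensions of the previous level exceed 15 pixels
    // and fewer than desc.number_of_outer_iterations levels exist; each level is
    // produced from the previous one by supersampled resizing with scale_factor.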
904 
905     if (!kSkipProcessing)
906     {
907         //initial values for the flow are 0
908         ncvAssertCUDAReturn(cudaMemsetAsync(u.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
909         ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
910 
911         //select images with lowest resolution
912         size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
913         ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
914         ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
915         ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
916 
917         FloatVector* ptrU = &u;
918         FloatVector* ptrV = &v;
919         FloatVector* ptrUNew = &u_new;
920         FloatVector* ptrVNew = &v_new;
921 
922         std::vector<FloatVector*>::const_reverse_iterator img0Iter = pyr.img0.rbegin();
923         std::vector<FloatVector*>::const_reverse_iterator img1Iter = pyr.img1.rbegin();
924 
925         //outer loop
926         //warping fixed point iteration
927         while(!pyr.w.empty())
928         {
929             //current grid dimensions
930             const Ncv32u kLevelWidth  = pyr.w.back();
931             const Ncv32u kLevelHeight = pyr.h.back();
932             const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);
933 
934             //size of current image in bytes
935             const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);
936 
937             //number of points at current resolution
938             const int kLevelSizeInPixels = kLevelStride * kLevelHeight;
939 
940             //initial guess for du and dv
941             ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
942             ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
943 
944             //texture format descriptor
945             cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();
946 
947             I0 = *img0Iter;
948             I1 = *img1Iter;
949 
950             ++img0Iter;
951             ++img1Iter;
952 
953             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
954             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
955 
956             //compute derivatives
957             dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
958             dim3 dThreads(32, 6);
959 
960             const int kPitchTex = kLevelStride * sizeof(float);
961 
962             NcvSize32u srcSize(kLevelWidth, kLevelHeight);
963             Ncv32u nSrcStep = kLevelStride * sizeof(float);
964             NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);
965 
966             // Ix0
967             ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
968                 nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
969 
970             // Iy0
971             ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
972                 nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
973 
974             // Ix
975             ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Ix.ptr(), srcSize, nSrcStep, oROI,
976                 nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
977 
978             // Iy
979             ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Iy.ptr(), srcSize, nSrcStep, oROI,
980                 nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
981 
982             // Ixx
983             ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Ix.ptr(), srcSize, nSrcStep, Ixx.ptr(), srcSize, nSrcStep, oROI,
984                 nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
985 
986             // Iyy
987             ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Iyy.ptr(), srcSize, nSrcStep, oROI,
988                 nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
989 
990             // Ixy
991             ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
992                 nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
993 
994             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix,  Ix.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
995             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
996             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
997             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy,  Iy.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
998             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
999             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
1000             ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
1001 
1002             //    flow
1003             ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1004             ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1005             //    flow increments
1006             ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1007             ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1008 
1009             dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
1010             dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
1011 
1012             dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
1013             dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);
1014 
1015             // inner loop
1016             // lagged nonlinearity fixed point iteration
1017             ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
1018             for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
1019             {
1020                 //compute coefficients
1021                 prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
1022                     (diffusivity_x.ptr(),
1023                      diffusivity_y.ptr(),
1024                      denom_u.ptr(),
1025                      denom_v.ptr(),
1026                      num_dudv.ptr(),
1027                      num_u.ptr(),
1028                      num_v.ptr(),
1029                      kLevelWidth,
1030                      kLevelHeight,
1031                      kLevelStride,
1032                      alpha,
1033                      gamma);
1034 
1035                 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
1036 
1037                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1038                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1039 
1040                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1041 
1042                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1043                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1044 
1045                 prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
1046 
1047                 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
1048 
1049                 //    linear system coefficients
1050                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1051                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1052 
1053                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1054 
1055                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1056                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1057 
1058                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1059                 ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1060 
1061                 //solve linear system
1062                 for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
1063                 {
1064                     float omega = 1.99f;
1065 
1066                     ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1067                     ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1068 
1069                     sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
1070                         (du_new.ptr(),
1071                         dv_new.ptr(),
1072                         denom_u.ptr(),
1073                         denom_v.ptr(),
1074                         num_u.ptr(),
1075                         num_v.ptr(),
1076                         num_dudv.ptr(),
1077                         omega,
1078                         kLevelWidth,
1079                         kLevelHeight,
1080                         kLevelStride);
1081 
1082                     ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
1083 
1084                     ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1085                     ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1086 
1087                     sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
1088                         (du.ptr(),
1089                         dv.ptr(),
1090                         denom_u.ptr(),
1091                         denom_v.ptr(),
1092                         num_u.ptr(),
1093                         num_v.ptr(),
1094                         num_dudv.ptr(),
1095                         omega,
1096                         kLevelWidth,
1097                         kLevelHeight,
1098                         kLevelStride);
1099 
1100                     ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
1101 
1102                     ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1103                     ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
1104                 }//end of solver loop
1105             }// end of inner loop
1106 
1107             //update u and v
1108             add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
1109             ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
1110             add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
1111             ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
1112 
1113             //prolongate using texture
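            // The coarse-level flow is upsampled to the next finer grid with
            // bicubic resampling and then multiplied by 1/scale_factor, so the
            // displacement vectors are expressed in finer-grid pixels.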
1114             pyr.w.pop_back();
1115             pyr.h.pop_back();
1116             if (!pyr.w.empty())
1117             {
1118                 //compute new image size
1119                 Ncv32u nw = pyr.w.back();
1120                 Ncv32u nh = pyr.h.back();
1121                 Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);
1122 
1123                 dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
1124                 dim3 p_threads(32, 8);
1125 
1126                 NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);
1127                 NcvSize32u dstSize (nw, nh);
1128                 NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
1129                 NcvRect32u dstROI (0, 0, nw, nh);
1130 
1131                 ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
1132                     ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
1133 
1134                 ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
1135                 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
1136 
1137                 ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
1138                     ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
1139 
1140                 ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
1141                 ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
1142 
1143                 cv::cuda::device::swap<FloatVector*>(ptrU, ptrUNew);
1144                 cv::cuda::device::swap<FloatVector*>(ptrV, ptrVNew);
1145             }
1146             scale /= scale_factor;
1147         }
1148 
1149         // end of warping iterations
1150         ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
1151 
1152         ncvAssertCUDAReturn( cudaMemcpy2DAsync
1153             (uOut.ptr(), uOut.pitch(), ptrU->ptr(),
1154             kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
1155 
1156         ncvAssertCUDAReturn( cudaMemcpy2DAsync
1157             (vOut.ptr(), vOut.pitch(), ptrV->ptr(),
1158             kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
1159 
1160         ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
1161     }
1162 
1163     return NCV_SUCCESS;
1164 }
1165