1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 // * Redistribution's of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
23 //
24 // * Redistribution's in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
27 //
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 #pragma once
45
46 #ifndef __OPENCV_CUDEV_WARP_SHUFFLE_HPP__
47 #define __OPENCV_CUDEV_WARP_SHUFFLE_HPP__
48
49 #include "../common.hpp"
50 #include "../util/vec_traits.hpp"
51
52 namespace cv { namespace cudev {
53
54 //! @addtogroup cudev
55 //! @{
56
57 #if CV_CUDEV_ARCH >= 300
58
59 // shfl
60
shfl(uchar val,int srcLane,int width=warpSize)61 __device__ __forceinline__ uchar shfl(uchar val, int srcLane, int width = warpSize)
62 {
63 return (uchar) __shfl((int) val, srcLane, width);
64 }
65
shfl(schar val,int srcLane,int width=warpSize)66 __device__ __forceinline__ schar shfl(schar val, int srcLane, int width = warpSize)
67 {
68 return (schar) __shfl((int) val, srcLane, width);
69 }
70
shfl(ushort val,int srcLane,int width=warpSize)71 __device__ __forceinline__ ushort shfl(ushort val, int srcLane, int width = warpSize)
72 {
73 return (ushort) __shfl((int) val, srcLane, width);
74 }
75
shfl(short val,int srcLane,int width=warpSize)76 __device__ __forceinline__ short shfl(short val, int srcLane, int width = warpSize)
77 {
78 return (short) __shfl((int) val, srcLane, width);
79 }
80
shfl(int val,int srcLane,int width=warpSize)81 __device__ __forceinline__ int shfl(int val, int srcLane, int width = warpSize)
82 {
83 return __shfl(val, srcLane, width);
84 }
85
shfl(uint val,int srcLane,int width=warpSize)86 __device__ __forceinline__ uint shfl(uint val, int srcLane, int width = warpSize)
87 {
88 return (uint) __shfl((int) val, srcLane, width);
89 }
90
shfl(float val,int srcLane,int width=warpSize)91 __device__ __forceinline__ float shfl(float val, int srcLane, int width = warpSize)
92 {
93 return __shfl(val, srcLane, width);
94 }
95
shfl(double val,int srcLane,int width=warpSize)96 __device__ double shfl(double val, int srcLane, int width = warpSize)
97 {
98 int lo = __double2loint(val);
99 int hi = __double2hiint(val);
100
101 lo = __shfl(lo, srcLane, width);
102 hi = __shfl(hi, srcLane, width);
103
104 return __hiloint2double(hi, lo);
105 }
106
107 #define CV_CUDEV_SHFL_VEC_INST(input_type) \
108 __device__ __forceinline__ input_type ## 1 shfl(const input_type ## 1 & val, int srcLane, int width = warpSize) \
109 { \
110 return VecTraits<input_type ## 1>::make( \
111 shfl(val.x, srcLane, width) \
112 ); \
113 } \
114 __device__ __forceinline__ input_type ## 2 shfl(const input_type ## 2 & val, int srcLane, int width = warpSize) \
115 { \
116 return VecTraits<input_type ## 2>::make( \
117 shfl(val.x, srcLane, width), \
118 shfl(val.y, srcLane, width) \
119 ); \
120 } \
121 __device__ __forceinline__ input_type ## 3 shfl(const input_type ## 3 & val, int srcLane, int width = warpSize) \
122 { \
123 return VecTraits<input_type ## 3>::make( \
124 shfl(val.x, srcLane, width), \
125 shfl(val.y, srcLane, width), \
126 shfl(val.z, srcLane, width) \
127 ); \
128 } \
129 __device__ __forceinline__ input_type ## 4 shfl(const input_type ## 4 & val, int srcLane, int width = warpSize) \
130 { \
131 return VecTraits<input_type ## 4>::make( \
132 shfl(val.x, srcLane, width), \
133 shfl(val.y, srcLane, width), \
134 shfl(val.z, srcLane, width), \
135 shfl(val.w, srcLane, width) \
136 ); \
137 }
138
139 CV_CUDEV_SHFL_VEC_INST(uchar)
CV_CUDEV_SHFL_VEC_INST(char)140 CV_CUDEV_SHFL_VEC_INST(char)
141 CV_CUDEV_SHFL_VEC_INST(ushort)
142 CV_CUDEV_SHFL_VEC_INST(short)
143 CV_CUDEV_SHFL_VEC_INST(uint)
144 CV_CUDEV_SHFL_VEC_INST(int)
145 CV_CUDEV_SHFL_VEC_INST(float)
146 CV_CUDEV_SHFL_VEC_INST(double)
147
148 #undef CV_CUDEV_SHFL_VEC_INST
149
150 // shfl_up
151
152 __device__ __forceinline__ uchar shfl_up(uchar val, uint delta, int width = warpSize)
153 {
154 return (uchar) __shfl_up((int) val, delta, width);
155 }
156
shfl_up(schar val,uint delta,int width=warpSize)157 __device__ __forceinline__ schar shfl_up(schar val, uint delta, int width = warpSize)
158 {
159 return (schar) __shfl_up((int) val, delta, width);
160 }
161
shfl_up(ushort val,uint delta,int width=warpSize)162 __device__ __forceinline__ ushort shfl_up(ushort val, uint delta, int width = warpSize)
163 {
164 return (ushort) __shfl_up((int) val, delta, width);
165 }
166
shfl_up(short val,uint delta,int width=warpSize)167 __device__ __forceinline__ short shfl_up(short val, uint delta, int width = warpSize)
168 {
169 return (short) __shfl_up((int) val, delta, width);
170 }
171
shfl_up(int val,uint delta,int width=warpSize)172 __device__ __forceinline__ int shfl_up(int val, uint delta, int width = warpSize)
173 {
174 return __shfl_up(val, delta, width);
175 }
176
shfl_up(uint val,uint delta,int width=warpSize)177 __device__ __forceinline__ uint shfl_up(uint val, uint delta, int width = warpSize)
178 {
179 return (uint) __shfl_up((int) val, delta, width);
180 }
181
shfl_up(float val,uint delta,int width=warpSize)182 __device__ __forceinline__ float shfl_up(float val, uint delta, int width = warpSize)
183 {
184 return __shfl_up(val, delta, width);
185 }
186
shfl_up(double val,uint delta,int width=warpSize)187 __device__ double shfl_up(double val, uint delta, int width = warpSize)
188 {
189 int lo = __double2loint(val);
190 int hi = __double2hiint(val);
191
192 lo = __shfl_up(lo, delta, width);
193 hi = __shfl_up(hi, delta, width);
194
195 return __hiloint2double(hi, lo);
196 }
197
198 #define CV_CUDEV_SHFL_UP_VEC_INST(input_type) \
199 __device__ __forceinline__ input_type ## 1 shfl_up(const input_type ## 1 & val, uint delta, int width = warpSize) \
200 { \
201 return VecTraits<input_type ## 1>::make( \
202 shfl_up(val.x, delta, width) \
203 ); \
204 } \
205 __device__ __forceinline__ input_type ## 2 shfl_up(const input_type ## 2 & val, uint delta, int width = warpSize) \
206 { \
207 return VecTraits<input_type ## 2>::make( \
208 shfl_up(val.x, delta, width), \
209 shfl_up(val.y, delta, width) \
210 ); \
211 } \
212 __device__ __forceinline__ input_type ## 3 shfl_up(const input_type ## 3 & val, uint delta, int width = warpSize) \
213 { \
214 return VecTraits<input_type ## 3>::make( \
215 shfl_up(val.x, delta, width), \
216 shfl_up(val.y, delta, width), \
217 shfl_up(val.z, delta, width) \
218 ); \
219 } \
220 __device__ __forceinline__ input_type ## 4 shfl_up(const input_type ## 4 & val, uint delta, int width = warpSize) \
221 { \
222 return VecTraits<input_type ## 4>::make( \
223 shfl_up(val.x, delta, width), \
224 shfl_up(val.y, delta, width), \
225 shfl_up(val.z, delta, width), \
226 shfl_up(val.w, delta, width) \
227 ); \
228 }
229
230 CV_CUDEV_SHFL_UP_VEC_INST(uchar)
CV_CUDEV_SHFL_UP_VEC_INST(char)231 CV_CUDEV_SHFL_UP_VEC_INST(char)
232 CV_CUDEV_SHFL_UP_VEC_INST(ushort)
233 CV_CUDEV_SHFL_UP_VEC_INST(short)
234 CV_CUDEV_SHFL_UP_VEC_INST(uint)
235 CV_CUDEV_SHFL_UP_VEC_INST(int)
236 CV_CUDEV_SHFL_UP_VEC_INST(float)
237 CV_CUDEV_SHFL_UP_VEC_INST(double)
238
239 #undef CV_CUDEV_SHFL_UP_VEC_INST
240
241 // shfl_down
242
243 __device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)
244 {
245 return (uchar) __shfl_down((int) val, delta, width);
246 }
247
shfl_down(schar val,uint delta,int width=warpSize)248 __device__ __forceinline__ schar shfl_down(schar val, uint delta, int width = warpSize)
249 {
250 return (schar) __shfl_down((int) val, delta, width);
251 }
252
shfl_down(ushort val,uint delta,int width=warpSize)253 __device__ __forceinline__ ushort shfl_down(ushort val, uint delta, int width = warpSize)
254 {
255 return (ushort) __shfl_down((int) val, delta, width);
256 }
257
shfl_down(short val,uint delta,int width=warpSize)258 __device__ __forceinline__ short shfl_down(short val, uint delta, int width = warpSize)
259 {
260 return (short) __shfl_down((int) val, delta, width);
261 }
262
shfl_down(int val,uint delta,int width=warpSize)263 __device__ __forceinline__ int shfl_down(int val, uint delta, int width = warpSize)
264 {
265 return __shfl_down(val, delta, width);
266 }
267
shfl_down(uint val,uint delta,int width=warpSize)268 __device__ __forceinline__ uint shfl_down(uint val, uint delta, int width = warpSize)
269 {
270 return (uint) __shfl_down((int) val, delta, width);
271 }
272
shfl_down(float val,uint delta,int width=warpSize)273 __device__ __forceinline__ float shfl_down(float val, uint delta, int width = warpSize)
274 {
275 return __shfl_down(val, delta, width);
276 }
277
shfl_down(double val,uint delta,int width=warpSize)278 __device__ double shfl_down(double val, uint delta, int width = warpSize)
279 {
280 int lo = __double2loint(val);
281 int hi = __double2hiint(val);
282
283 lo = __shfl_down(lo, delta, width);
284 hi = __shfl_down(hi, delta, width);
285
286 return __hiloint2double(hi, lo);
287 }
288
289 #define CV_CUDEV_SHFL_DOWN_VEC_INST(input_type) \
290 __device__ __forceinline__ input_type ## 1 shfl_down(const input_type ## 1 & val, uint delta, int width = warpSize) \
291 { \
292 return VecTraits<input_type ## 1>::make( \
293 shfl_down(val.x, delta, width) \
294 ); \
295 } \
296 __device__ __forceinline__ input_type ## 2 shfl_down(const input_type ## 2 & val, uint delta, int width = warpSize) \
297 { \
298 return VecTraits<input_type ## 2>::make( \
299 shfl_down(val.x, delta, width), \
300 shfl_down(val.y, delta, width) \
301 ); \
302 } \
303 __device__ __forceinline__ input_type ## 3 shfl_down(const input_type ## 3 & val, uint delta, int width = warpSize) \
304 { \
305 return VecTraits<input_type ## 3>::make( \
306 shfl_down(val.x, delta, width), \
307 shfl_down(val.y, delta, width), \
308 shfl_down(val.z, delta, width) \
309 ); \
310 } \
311 __device__ __forceinline__ input_type ## 4 shfl_down(const input_type ## 4 & val, uint delta, int width = warpSize) \
312 { \
313 return VecTraits<input_type ## 4>::make( \
314 shfl_down(val.x, delta, width), \
315 shfl_down(val.y, delta, width), \
316 shfl_down(val.z, delta, width), \
317 shfl_down(val.w, delta, width) \
318 ); \
319 }
320
321 CV_CUDEV_SHFL_DOWN_VEC_INST(uchar)
CV_CUDEV_SHFL_DOWN_VEC_INST(char)322 CV_CUDEV_SHFL_DOWN_VEC_INST(char)
323 CV_CUDEV_SHFL_DOWN_VEC_INST(ushort)
324 CV_CUDEV_SHFL_DOWN_VEC_INST(short)
325 CV_CUDEV_SHFL_DOWN_VEC_INST(uint)
326 CV_CUDEV_SHFL_DOWN_VEC_INST(int)
327 CV_CUDEV_SHFL_DOWN_VEC_INST(float)
328 CV_CUDEV_SHFL_DOWN_VEC_INST(double)
329
330 #undef CV_CUDEV_SHFL_DOWN_VEC_INST
331
332 // shfl_xor
333
334 __device__ __forceinline__ uchar shfl_xor(uchar val, int laneMask, int width = warpSize)
335 {
336 return (uchar) __shfl_xor((int) val, laneMask, width);
337 }
338
shfl_xor(schar val,int laneMask,int width=warpSize)339 __device__ __forceinline__ schar shfl_xor(schar val, int laneMask, int width = warpSize)
340 {
341 return (schar) __shfl_xor((int) val, laneMask, width);
342 }
343
shfl_xor(ushort val,int laneMask,int width=warpSize)344 __device__ __forceinline__ ushort shfl_xor(ushort val, int laneMask, int width = warpSize)
345 {
346 return (ushort) __shfl_xor((int) val, laneMask, width);
347 }
348
shfl_xor(short val,int laneMask,int width=warpSize)349 __device__ __forceinline__ short shfl_xor(short val, int laneMask, int width = warpSize)
350 {
351 return (short) __shfl_xor((int) val, laneMask, width);
352 }
353
shfl_xor(int val,int laneMask,int width=warpSize)354 __device__ __forceinline__ int shfl_xor(int val, int laneMask, int width = warpSize)
355 {
356 return __shfl_xor(val, laneMask, width);
357 }
358
shfl_xor(uint val,int laneMask,int width=warpSize)359 __device__ __forceinline__ uint shfl_xor(uint val, int laneMask, int width = warpSize)
360 {
361 return (uint) __shfl_xor((int) val, laneMask, width);
362 }
363
shfl_xor(float val,int laneMask,int width=warpSize)364 __device__ __forceinline__ float shfl_xor(float val, int laneMask, int width = warpSize)
365 {
366 return __shfl_xor(val, laneMask, width);
367 }
368
shfl_xor(double val,int laneMask,int width=warpSize)369 __device__ double shfl_xor(double val, int laneMask, int width = warpSize)
370 {
371 int lo = __double2loint(val);
372 int hi = __double2hiint(val);
373
374 lo = __shfl_xor(lo, laneMask, width);
375 hi = __shfl_xor(hi, laneMask, width);
376
377 return __hiloint2double(hi, lo);
378 }
379
380 #define CV_CUDEV_SHFL_XOR_VEC_INST(input_type) \
381 __device__ __forceinline__ input_type ## 1 shfl_xor(const input_type ## 1 & val, int laneMask, int width = warpSize) \
382 { \
383 return VecTraits<input_type ## 1>::make( \
384 shfl_xor(val.x, laneMask, width) \
385 ); \
386 } \
387 __device__ __forceinline__ input_type ## 2 shfl_xor(const input_type ## 2 & val, int laneMask, int width = warpSize) \
388 { \
389 return VecTraits<input_type ## 2>::make( \
390 shfl_xor(val.x, laneMask, width), \
391 shfl_xor(val.y, laneMask, width) \
392 ); \
393 } \
394 __device__ __forceinline__ input_type ## 3 shfl_xor(const input_type ## 3 & val, int laneMask, int width = warpSize) \
395 { \
396 return VecTraits<input_type ## 3>::make( \
397 shfl_xor(val.x, laneMask, width), \
398 shfl_xor(val.y, laneMask, width), \
399 shfl_xor(val.z, laneMask, width) \
400 ); \
401 } \
402 __device__ __forceinline__ input_type ## 4 shfl_xor(const input_type ## 4 & val, int laneMask, int width = warpSize) \
403 { \
404 return VecTraits<input_type ## 4>::make( \
405 shfl_xor(val.x, laneMask, width), \
406 shfl_xor(val.y, laneMask, width), \
407 shfl_xor(val.z, laneMask, width), \
408 shfl_xor(val.w, laneMask, width) \
409 ); \
410 }
411
412 CV_CUDEV_SHFL_XOR_VEC_INST(uchar)
413 CV_CUDEV_SHFL_XOR_VEC_INST(char)
414 CV_CUDEV_SHFL_XOR_VEC_INST(ushort)
415 CV_CUDEV_SHFL_XOR_VEC_INST(short)
416 CV_CUDEV_SHFL_XOR_VEC_INST(uint)
417 CV_CUDEV_SHFL_XOR_VEC_INST(int)
418 CV_CUDEV_SHFL_XOR_VEC_INST(float)
419 CV_CUDEV_SHFL_XOR_VEC_INST(double)
420
421 #undef CV_CUDEV_SHFL_XOR_VEC_INST
422
423 #endif // CV_CUDEV_ARCH >= 300
424
425 //! @}
426
427 }}
428
429 #endif
430