/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#ifndef __OPENCV_CUDEV_UTIL_SIMD_FUNCTIONS_HPP__
#define __OPENCV_CUDEV_UTIL_SIMD_FUNCTIONS_HPP__

#include "../common.hpp"

/*
  This header file contains inline functions that implement intra-word SIMD
  operations that are hardware-accelerated on sm_3x (Kepler) GPUs. Efficient
  emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
  to make the code portable across all GPUs supported by CUDA. The following
  functions are currently implemented:

  vadd2(a,b)      per-halfword unsigned addition, with wrap-around: a + b
  vsub2(a,b)      per-halfword unsigned subtraction, with wrap-around: a - b
  vabsdiff2(a,b)  per-halfword unsigned absolute difference: |a - b|
  vavg2(a,b)      per-halfword unsigned average: (a + b) / 2
  vavrg2(a,b)     per-halfword unsigned rounded average: (a + b + 1) / 2
  vseteq2(a,b)    per-halfword unsigned comparison: a == b ? 1 : 0
  vcmpeq2(a,b)    per-halfword unsigned comparison: a == b ? 0xffff : 0
  vsetge2(a,b)    per-halfword unsigned comparison: a >= b ? 1 : 0
  vcmpge2(a,b)    per-halfword unsigned comparison: a >= b ? 0xffff : 0
  vsetgt2(a,b)    per-halfword unsigned comparison: a > b ? 1 : 0
  vcmpgt2(a,b)    per-halfword unsigned comparison: a > b ? 0xffff : 0
  vsetle2(a,b)    per-halfword unsigned comparison: a <= b ? 1 : 0
  vcmple2(a,b)    per-halfword unsigned comparison: a <= b ? 0xffff : 0
  vsetlt2(a,b)    per-halfword unsigned comparison: a < b ? 1 : 0
  vcmplt2(a,b)    per-halfword unsigned comparison: a < b ? 0xffff : 0
  vsetne2(a,b)    per-halfword unsigned comparison: a != b ? 1 : 0
  vcmpne2(a,b)    per-halfword unsigned comparison: a != b ? 0xffff : 0
  vmax2(a,b)      per-halfword unsigned maximum: max(a, b)
  vmin2(a,b)      per-halfword unsigned minimum: min(a, b)

  vadd4(a,b)      per-byte unsigned addition, with wrap-around: a + b
  vsub4(a,b)      per-byte unsigned subtraction, with wrap-around: a - b
  vabsdiff4(a,b)  per-byte unsigned absolute difference: |a - b|
  vavg4(a,b)      per-byte unsigned average: (a + b) / 2
  vavrg4(a,b)     per-byte unsigned rounded average: (a + b + 1) / 2
  vseteq4(a,b)    per-byte unsigned comparison: a == b ? 1 : 0
  vcmpeq4(a,b)    per-byte unsigned comparison: a == b ? 0xff : 0
  vsetge4(a,b)    per-byte unsigned comparison: a >= b ? 1 : 0
  vcmpge4(a,b)    per-byte unsigned comparison: a >= b ? 0xff : 0
  vsetgt4(a,b)    per-byte unsigned comparison: a > b ? 1 : 0
  vcmpgt4(a,b)    per-byte unsigned comparison: a > b ? 0xff : 0
  vsetle4(a,b)    per-byte unsigned comparison: a <= b ? 1 : 0
  vcmple4(a,b)    per-byte unsigned comparison: a <= b ? 0xff : 0
  vsetlt4(a,b)    per-byte unsigned comparison: a < b ? 1 : 0
  vcmplt4(a,b)    per-byte unsigned comparison: a < b ? 0xff : 0
  vsetne4(a,b)    per-byte unsigned comparison: a != b ? 1 : 0
  vcmpne4(a,b)    per-byte unsigned comparison: a != b ? 0xff : 0
  vmax4(a,b)      per-byte unsigned maximum: max(a, b)
  vmin4(a,b)      per-byte unsigned minimum: min(a, b)
*/
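
/*
  Usage sketch (illustrative only, not part of the library): a hypothetical
  kernel that adds two images of packed 8-bit pixels four bytes at a time.
  The kernel name and argument layout are assumptions made for this example.

    __global__ void addBytesKernel(const uint* a, const uint* b, uint* dst, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            dst[i] = cv::cudev::vadd4(a[i], b[i]); // four independent byte additions per 32-bit word
    }
*/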

namespace cv { namespace cudev {

//! @addtogroup cudev
//! @{

// 2

__device__ __forceinline__ uint vadd2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = a ^ b;          // sum bits
    r = a + b;          // actual sum
    s = s ^ r;          // determine carry-ins for each bit position
    s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
    r = r - s;          // subtract out carry-out from low word
#endif

    return r;
}
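
// Worked example of the emulation path above (illustrative note): for
// a = 0x0000ffff and b = 0x00000001 the plain 32-bit add yields
// r = 0x00010000, i.e. the low-halfword carry leaks into the high halfword.
// s = (a ^ b) ^ r recovers the carry-in bits, masking with 0x00010000 keeps
// only the carry that crossed the halfword boundary, and r - s removes it,
// giving the expected wrap-around result 0x00000000 in both halfwords.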

__device__ __forceinline__ uint vsub2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = a ^ b;          // difference bits
    r = a - b;          // actual difference
    s = s ^ r;          // determine borrow-ins for each bit position
    s = s & 0x00010000; // borrow to high word
    r = r + s;          // compensate for borrow from low word
#endif

    return r;
}

__device__ __forceinline__ uint vabsdiff2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u, v;
    s = a & 0x0000ffff; // extract low halfword
    r = b & 0x0000ffff; // extract low halfword
    u = ::max(r, s);    // maximum of low halfwords
    v = ::min(r, s);    // minimum of low halfwords
    s = a & 0xffff0000; // extract high halfword
    r = b & 0xffff0000; // extract high halfword
    t = ::max(r, s);    // maximum of high halfwords
    s = ::min(r, s);    // minimum of high halfwords
    r = u | t;          // maximum of both halfwords
    s = v | s;          // minimum of both halfwords
    r = r - s;          // |a - b| = max(a,b) - min(a,b);
#endif

    return r;
}

__device__ __forceinline__ uint vavg2(uint a, uint b)
{
    uint r, s;

    // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
    // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
    s = a ^ b;
    r = a & b;
    s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
    s = s >> 1;
    s = r + s;

    return s;
}

__device__ __forceinline__ uint vavrg2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
    // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
    uint s;
    s = a ^ b;
    r = a | b;
    s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
    s = s >> 1;
    r = r - s;
#endif

    return r;
}
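
// Illustrative note on the HAKMEM #23 identity used above: for a single
// halfword with a = 7 and b = 4, (a | b) = 7 and ((a ^ b) >> 1) = 1, so
// (a | b) - ((a ^ b) >> 1) = 6 = (7 + 4 + 1) / 2, the rounded average.
// Masking (a ^ b) with 0xfffefffe before the shift keeps each halfword's
// low bit from sliding into its neighbour, so the two lanes stay independent.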

__device__ __forceinline__ uint vseteq2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpeq2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vseteq2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}
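
// Illustrative note on the bool-to-mask conversion used by the vcmp* helpers:
// given a per-halfword bool in r (0 or 1 per lane), c = r << 16 followed by
// c - r turns each 1 into 0xffff and each 0 into 0x0000. For example,
// r = 0x00000001 gives c = 0x00010000 and c - r = 0x0000ffff, while
// r = 0x00010001 gives c - r = 0xffffffff.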

__device__ __forceinline__ uint vsetge2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpge2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetge2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetgt2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpgt2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetgt2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetle2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmple2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetle2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetlt2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmplt2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetlt2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msb = carry-outs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetne2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpne2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetne2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          // into mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vmax2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = ::max(r, s);    // maximum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = ::max(r, s);    // maximum of high halfwords
    r = t | u;          // combine halfword maximums
#endif

    return r;
}

__device__ __forceinline__ uint vmin2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = ::min(r, s);    // minimum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = ::min(r, s);    // minimum of high halfwords
    r = t | u;          // combine halfword minimums
#endif

    return r;
}

// 4

__device__ __forceinline__ uint vadd4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t;
    s = a ^ b;          // sum bits
    r = a & 0x7f7f7f7f; // clear msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // msb sum bits
    r = r + t;          // add without msbs, record carry-out in msbs
    r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
#endif /* CV_CUDEV_ARCH >= 300 */

    return r;
}

__device__ __forceinline__ uint vsub4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t;
    s = a ^ ~b;         // inverted sum bits
    r = a | 0x80808080; // set msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // inverted msb sum bits
    r = r - t;          // subtract w/o msbs, record inverted borrows in msb
    r = r ^ s;          // combine inverted msb sum bits and borrows
#endif

    return r;
}

__device__ __forceinline__ uint vavg4(uint a, uint b)
{
    uint r, s;

    // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
    // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
    s = a ^ b;
    r = a & b;
    s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
    s = s >> 1;
    s = r + s;

    return s;
}

__device__ __forceinline__ uint vavrg4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
    // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
    uint c;
    c = a ^ b;
    r = a | b;
    c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
    c = c >> 1;
    r = r - c;
#endif

    return r;
}

__device__ __forceinline__ uint vseteq4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r & ~c;         // msb = 1, if r was 0x00
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpeq4(uint a, uint b)
{
    uint r, t;

#if CV_CUDEV_ARCH >= 300
    r = vseteq4(a, b);
    t = r << 8;         // convert bool
    r = t - r;          // to mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    t = a ^ b;          // 0x00 if a == b
    r = t | 0x80808080; // set msbs, to catch carry out
    t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
    r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
    r = t & ~r;         // msb = 1, if t was 0x00
    t = r >> 7;         // build mask
    t = r - t;          // from
    r = t | r;          // msbs
#endif

    return r;
}

__device__ __forceinline__ uint vsetle4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmple4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetle4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          // to mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetlt4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmplt4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetlt4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          // to mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetge4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpge4(uint a, uint b)
{
    uint r, s;

#if CV_CUDEV_ARCH >= 300
    r = vsetge4(a, b);
    s = r << 8;         // convert bool
    r = s - r;          // to mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    r = r & 0x80808080; // msb = carry-outs
    s = r >> 7;         // build mask
    s = r - s;          // from
    r = s | r;          // msbs
#endif

    return r;
}

__device__ __forceinline__ uint vsetgt4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpgt4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetgt4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          // to mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msb = carry-outs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetne4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpne4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetne4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          // to mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert
    r = c - r;          // msbs to
    r = c | r;          // mask
#endif

    return r;
}

__device__ __forceinline__ uint vabsdiff4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = vcmpge4(a, b);  // mask = 0xff if a >= b
    r = a ^ b;          //
    s = (r & s) ^ b;    // select a when a >= b, else select b => max(a,b)
    r = s ^ r;          // select a when b >= a, else select b => min(a,b)
    r = s - r;          // |a - b| = max(a,b) - min(a,b);
#endif

    return r;
}

__device__ __forceinline__ uint vmax4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = vcmpge4(a, b);  // mask = 0xff if a >= b
    r = a & s;          // select a when a >= b
    s = b & ~s;         // select b when a < b
    r = r | s;          // combine byte selections
#endif

    return r;           // byte-wise unsigned maximum
}

__device__ __forceinline__ uint vmin4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = vcmpge4(b, a);  // mask = 0xff if b >= a
    r = a & s;          // select a when b >= a
    s = b & ~s;         // select b when b < a
    r = r | s;          // combine byte selections
#endif

    return r;
}

//! @}

}}

#endif