/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#ifndef __OPENCV_CUDEV_UTIL_SIMD_FUNCTIONS_HPP__
#define __OPENCV_CUDEV_UTIL_SIMD_FUNCTIONS_HPP__

#include "../common.hpp"
/*
  This header file contains inline functions that implement intra-word SIMD
  operations that are hardware-accelerated on sm_3x (Kepler) GPUs. Efficient
  emulation code paths are provided for earlier architectures (sm_1x, sm_2x)
  to make the code portable across all GPUs supported by CUDA. The following
  functions are currently implemented:

  vadd2(a,b)      per-halfword unsigned addition, with wrap-around: a + b
  vsub2(a,b)      per-halfword unsigned subtraction, with wrap-around: a - b
  vabsdiff2(a,b)  per-halfword unsigned absolute difference: |a - b|
  vavg2(a,b)      per-halfword unsigned average: (a + b) / 2
  vavrg2(a,b)     per-halfword unsigned rounded average: (a + b + 1) / 2
  vseteq2(a,b)    per-halfword unsigned comparison: a == b ? 1 : 0
  vcmpeq2(a,b)    per-halfword unsigned comparison: a == b ? 0xffff : 0
  vsetge2(a,b)    per-halfword unsigned comparison: a >= b ? 1 : 0
  vcmpge2(a,b)    per-halfword unsigned comparison: a >= b ? 0xffff : 0
  vsetgt2(a,b)    per-halfword unsigned comparison: a > b ? 1 : 0
  vcmpgt2(a,b)    per-halfword unsigned comparison: a > b ? 0xffff : 0
  vsetle2(a,b)    per-halfword unsigned comparison: a <= b ? 1 : 0
  vcmple2(a,b)    per-halfword unsigned comparison: a <= b ? 0xffff : 0
  vsetlt2(a,b)    per-halfword unsigned comparison: a < b ? 1 : 0
  vcmplt2(a,b)    per-halfword unsigned comparison: a < b ? 0xffff : 0
  vsetne2(a,b)    per-halfword unsigned comparison: a != b ? 1 : 0
  vcmpne2(a,b)    per-halfword unsigned comparison: a != b ? 0xffff : 0
  vmax2(a,b)      per-halfword unsigned maximum: max(a, b)
  vmin2(a,b)      per-halfword unsigned minimum: min(a, b)

  vadd4(a,b)      per-byte unsigned addition, with wrap-around: a + b
  vsub4(a,b)      per-byte unsigned subtraction, with wrap-around: a - b
  vabsdiff4(a,b)  per-byte unsigned absolute difference: |a - b|
  vavg4(a,b)      per-byte unsigned average: (a + b) / 2
  vavrg4(a,b)     per-byte unsigned rounded average: (a + b + 1) / 2
  vseteq4(a,b)    per-byte unsigned comparison: a == b ? 1 : 0
  vcmpeq4(a,b)    per-byte unsigned comparison: a == b ? 0xff : 0
  vsetge4(a,b)    per-byte unsigned comparison: a >= b ? 1 : 0
  vcmpge4(a,b)    per-byte unsigned comparison: a >= b ? 0xff : 0
  vsetgt4(a,b)    per-byte unsigned comparison: a > b ? 1 : 0
  vcmpgt4(a,b)    per-byte unsigned comparison: a > b ? 0xff : 0
  vsetle4(a,b)    per-byte unsigned comparison: a <= b ? 1 : 0
  vcmple4(a,b)    per-byte unsigned comparison: a <= b ? 0xff : 0
  vsetlt4(a,b)    per-byte unsigned comparison: a < b ? 1 : 0
  vcmplt4(a,b)    per-byte unsigned comparison: a < b ? 0xff : 0
  vsetne4(a,b)    per-byte unsigned comparison: a != b ? 1 : 0
  vcmpne4(a,b)    per-byte unsigned comparison: a != b ? 0xff : 0
  vmax4(a,b)      per-byte unsigned maximum: max(a, b)
  vmin4(a,b)      per-byte unsigned minimum: min(a, b)
*/
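
/*
  Usage sketch (illustrative, not part of the library): each function treats a
  32-bit uint as two packed 16-bit lanes or four packed 8-bit lanes. A kernel
  computing a per-byte absolute difference of two packed image rows might look
  like the following; the kernel name and launch configuration are hypothetical.

    __global__ void absdiffRow(const uint* a, const uint* b, uint* out, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = cv::cudev::vabsdiff4(a[i], b[i]); // four byte-wise |a - b| at once
    }
*/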

namespace cv { namespace cudev {

//! @addtogroup cudev
//! @{

// 2

__device__ __forceinline__ uint vadd2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = a ^ b;          // sum bits
    r = a + b;          // actual sum
    s = s ^ r;          // determine carry-ins for each bit position
    s = s & 0x00010000; // carry-in to high halfword (= carry-out from low halfword)
    r = r - s;          // subtract out carry-out from low halfword
#endif

    return r;
}
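
/*
  Worked example for the emulation path above (illustrative): with
  a = 0x0001ffff and b = 0x00010001, the per-halfword result should be
  0x00020000 (the low halfword wraps). s = a ^ b = 0x0000fffe;
  r = a + b = 0x00030000; s ^ r = 0x0003fffe, whose bit 16 flags the carry
  that leaked from the low halfword into the high one; masking with
  0x00010000 and subtracting yields r = 0x00020000, as required.
*/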

__device__ __forceinline__ uint vsub2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = a ^ b;          // difference bits
    r = a - b;          // actual difference
    s = s ^ r;          // determine borrow-ins for each bit position
    s = s & 0x00010000; // borrow-in to high halfword (= borrow-out from low halfword)
    r = r + s;          // compensate for borrow from low halfword
#endif

    return r;
}

__device__ __forceinline__ uint vabsdiff2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u, v;
    s = a & 0x0000ffff; // extract low halfword
    r = b & 0x0000ffff; // extract low halfword
    u = ::max(r, s);    // maximum of low halfwords
    v = ::min(r, s);    // minimum of low halfwords
    s = a & 0xffff0000; // extract high halfword
    r = b & 0xffff0000; // extract high halfword
    t = ::max(r, s);    // maximum of high halfwords
    s = ::min(r, s);    // minimum of high halfwords
    r = u | t;          // maximum of both halfwords
    s = v | s;          // minimum of both halfwords
    r = r - s;          // |a - b| = max(a,b) - min(a,b);
#endif

    return r;
}

__device__ __forceinline__ uint vavg2(uint a, uint b)
{
    uint r, s;

    // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
    // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
    s = a ^ b;
    r = a & b;
    s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
    s = s >> 1;
    s = r + s;

    return s;
}
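
/*
  Worked example for the HAKMEM #23 identity above (illustrative): per 16-bit
  lane, a & b holds the bits set in both operands (counted twice in a + b) and
  a ^ b the bits set in exactly one. For a = 0x0005 and b = 0x0003:
  (a & b) = 0x0001, (a ^ b) = 0x0006, so (a & b) + ((a ^ b) >> 1) = 1 + 3 = 4
  = (5 + 3) / 2. The 0xfffefffe mask keeps bit 16 of the XOR from shifting
  down into bit 15 of the low lane.
*/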

__device__ __forceinline__ uint vavrg2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
    // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
    uint s;
    s = a ^ b;
    r = a | b;
    s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
    s = s >> 1;
    r = r - s;
#endif

    return r;
}

__device__ __forceinline__ uint vseteq2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert to bool
#endif

    return r;
}
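
/*
  Worked example for the Mycroft-style zero test above (illustrative), low
  halfword only. If a == b, r = 0x0000: c = 0x8000, r ^ c = 0x8000,
  c - 1 = 0x7fff, and 0x8000 & ~0x7fff = 0x8000, so the shift yields 1.
  If a != b, say r = 0x0002: c = 0x8002, r ^ c = 0x8000, c - 1 = 0x8001,
  and 0x8000 & ~0x8001 = 0x0000, so the shift yields 0.
*/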

__device__ __forceinline__ uint vcmpeq2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vseteq2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r & ~c;         // msb = 1, if r was 0x0000
    r = c >> 15;        // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetge2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}
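
/*
  Why the average trick works as a comparison (illustrative): a >= b exactly
  when a - b = a + ~b + 1 carries out of the 16-bit lane. vavrg2 computes
  (a + ~b + 1) >> 1 per lane without losing that carry: the would-be bit 16
  lands in bit 15, so the msb of each halfword is precisely the a >= b flag.
  For a = 5, b = 3: 5 + 0xfffc + 1 = 0x10002, halved to 0x8001 (msb set);
  for a = 3, b = 5: 3 + 0xfffa + 1 = 0xfffe, halved to 0x7fff (msb clear).
*/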

__device__ __forceinline__ uint vcmpge2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetge2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetgt2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpgt2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetgt2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetle2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmple2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetle2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetlt2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmplt2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetlt2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80008000; // msbs = carry-outs
    r = c >> 15;        // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetne2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpne2(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetne2(a, b);
    c = r << 16;        // convert bool
    r = c - r;          //  into mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x0000 if a == b
    c = r | 0x80008000; // set msbs, to catch carry out
    c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
    c = r | c;          // msb = 1, if r was not 0x0000
    c = c & 0x80008000; // extract msbs
    r = c >> 15;        // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vmax2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = ::max(r, s);    // maximum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = ::max(r, s);    // maximum of high halfwords
    r = t | u;          // combine halfword maximums
#endif

    return r;
}

__device__ __forceinline__ uint vmin2(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t, u;
    r = a & 0x0000ffff; // extract low halfword
    s = b & 0x0000ffff; // extract low halfword
    t = ::min(r, s);    // minimum of low halfwords
    r = a & 0xffff0000; // extract high halfword
    s = b & 0xffff0000; // extract high halfword
    u = ::min(r, s);    // minimum of high halfwords
    r = t | u;          // combine halfword minimums
#endif

    return r;
}

// 4

__device__ __forceinline__ uint vadd4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t;
    s = a ^ b;          // sum bits
    r = a & 0x7f7f7f7f; // clear msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // msb sum bits
    r = r + t;          // add without msbs, record carry-out in msbs
    r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
#endif

    return r;
}
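
/*
  Worked example for the emulation path above (illustrative), one byte lane:
  clearing the msbs first means the byte-wise adds cannot carry into the next
  lane. For a = 0xff, b = 0x01: (a & 0x7f) + (b & 0x7f) = 0x80, and the msb
  sum bit s = (a ^ b) & 0x80 = 0x80, so r ^ s = 0x00: the lane wraps around
  without disturbing its neighbour.
*/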

__device__ __forceinline__ uint vsub4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s, t;
    s = a ^ ~b;         // inverted sum bits
    r = a | 0x80808080; // set msbs
    t = b & 0x7f7f7f7f; // clear msbs
    s = s & 0x80808080; // inverted msb sum bits
    r = r - t;          // subtract w/o msbs, record inverted borrows in msbs
    r = r ^ s;          // combine inverted msb sum bits and borrows
#endif

    return r;
}

__device__ __forceinline__ uint vavg4(uint a, uint b)
{
    uint r, s;

    // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
    // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
    s = a ^ b;
    r = a & b;
    s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
    s = s >> 1;
    s = r + s;

    return s;
}

__device__ __forceinline__ uint vavrg4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
    // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
    uint c;
    c = a ^ b;
    r = a | b;
    c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
    c = c >> 1;
    r = r - c;
#endif

    return r;
}

__device__ __forceinline__ uint vseteq4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r & ~c;         // msb = 1, if r was 0x00
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpeq4(uint a, uint b)
{
    uint r, t;

#if CV_CUDEV_ARCH >= 300
    r = vseteq4(a, b);
    t = r << 8;         // convert bool
    r = t - r;          //  to mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    t = a ^ b;          // 0x00 if a == b
    r = t | 0x80808080; // set msbs, to catch carry out
    t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
    r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
    r = t & ~r;         // msb = 1, if t was 0x00
    t = r >> 7;         // build mask
    t = r - t;          //  from
    r = t | r;          //   msbs
#endif

    return r;
}

__device__ __forceinline__ uint vsetle4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmple4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetle4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          //  to mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetlt4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmplt4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetlt4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          //  to mask
#else
    asm("not.b32 %0, %0;" : "+r"(a));
    c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetge4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpge4(uint a, uint b)
{
    uint r, s;

#if CV_CUDEV_ARCH >= 300
    r = vsetge4(a, b);
    s = r << 8;         // convert bool
    r = s - r;          //  to mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
    r = r & 0x80808080; // msbs = carry-outs
    s = r >> 7;         // build mask
    s = r - s;          //  from
    r = s | r;          //   msbs
#endif

    return r;
}

__device__ __forceinline__ uint vsetgt4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint c;
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpgt4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetgt4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          //  to mask
#else
    asm("not.b32 %0, %0;" : "+r"(b));
    c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
    c = c & 0x80808080; // msbs = carry-outs
    r = c >> 7;         // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vsetne4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    uint c;
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert to bool
#endif

    return r;
}

__device__ __forceinline__ uint vcmpne4(uint a, uint b)
{
    uint r, c;

#if CV_CUDEV_ARCH >= 300
    r = vsetne4(a, b);
    c = r << 8;         // convert bool
    r = c - r;          //  to mask
#else
    // inspired by Alan Mycroft's null-byte detection algorithm:
    // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
    r = a ^ b;          // 0x00 if a == b
    c = r | 0x80808080; // set msbs, to catch carry out
    c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
    c = r | c;          // msb = 1, if r was not 0x00
    c = c & 0x80808080; // extract msbs
    r = c >> 7;         // convert
    r = c - r;          //  msbs to
    r = c | r;          //   mask
#endif

    return r;
}

__device__ __forceinline__ uint vabsdiff4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = vcmpge4(a, b);  // mask = 0xff if a >= b
    r = a ^ b;
    s = (r & s) ^ b;    // select a when a >= b, else select b => max(a,b)
    r = s ^ r;          // select a when b >= a, else select b => min(a,b)
    r = s - r;          // |a - b| = max(a,b) - min(a,b);
#endif

    return r;
}
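
/*
  Worked example for the mask-select trick above (illustrative), one byte
  lane with a = 7, b = 9: the mask s = vcmpge4(a, b) is 0x00, so
  ((a ^ b) & s) ^ b = b = 9 = max(a, b); XOR-ing the max with a ^ b swaps it
  for the other operand, giving min(a, b) = 7, and 9 - 7 = 2 = |a - b|.
*/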

__device__ __forceinline__ uint vmax4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = vcmpge4(a, b);  // mask = 0xff if a >= b
    r = a & s;          // select a when a >= b
    s = b & ~s;         // select b when a < b
    r = r | s;          // combine byte selections
#endif

    return r;           // byte-wise unsigned maximum
}

__device__ __forceinline__ uint vmin4(uint a, uint b)
{
    uint r = 0;

#if CV_CUDEV_ARCH >= 300
    asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#elif CV_CUDEV_ARCH >= 200
    asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
#else
    uint s;
    s = vcmpge4(b, a);  // mask = 0xff if b >= a
    r = a & s;          // select a when b >= a
    s = b & ~s;         // select b when b < a
    r = r | s;          // combine byte selections
#endif

    return r;           // byte-wise unsigned minimum
}

//! @}

}}

#endif