/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__
#define __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__

#include "common.hpp"

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
    // 2

    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // difference bits
        r = a - b;          // actual difference
        s = s ^ r;          // determine borrow-ins for each bit position
        s = s & 0x00010000; // borrow to high word (= borrow-out from low word)
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }
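
    // Worked example of the HAKMEM #23 identity used above (illustrative note, not part
    // of the original code): for a = 0x00070003 and b = 0x00050001,
    //   a & b = 0x00050001
    //   a ^ b = 0x00020002, masked with 0xfffefffe and shifted right -> 0x00010001
    //   sum   = 0x00060002, i.e. the per-halfword averages (7+5)/2 = 6 and (3+1)/2 = 2,
    // computed without the intermediate a + b overflowing or crossing a halfword boundary.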

    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }
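
    // Convention note (illustrative, not part of the original code): the vset* helpers
    // return a per-lane boolean (0x0001/0x0000 per halfword, 0x01/0x00 per byte), while
    // the vcmp* helpers return a per-lane mask (0xffff or 0xff for a true lane). The idiom
    //   c = r << 16;  r = c - r;
    // multiplies each halfword boolean by 0xffff, e.g. a vseteq2 result of 0x00010001
    // becomes a vcmpeq2 result of 0xffffffff; the byte variants use (r << 8) - r likewise.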

    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          //  into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }

    // 4

    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }
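
    // Worked example of the msb carry-isolation trick in the fallback path above
    // (illustrative note, not part of the original code): for bytes a = 0x80 and b = 0x81,
    // clearing the msbs and adding gives 0x00 + 0x01 = 0x01; the msb sum bit is
    // (0x80 ^ 0x81) & 0x80 = 0x00, so the final xor leaves 0x01, the per-byte result of
    // the emulated path. A plain 32-bit add would instead produce 0x101 and leak the
    // carry into the neighbouring byte lane.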

    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }
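
    // Worked example of the Mycroft-style per-byte equality test above (illustrative
    // note, not part of the original code): for a = 0x12345678 and b = 0x12AB5678,
    //   a ^ b = 0x009E0000 (zero in every byte position where a and b agree),
    // and the set-msbs / subtract / mask steps leave the msb set in exactly those zero
    // bytes, so the final shift yields 0x01000101: a per-byte boolean that is 1 for the
    // three matching bytes and 0 for the mismatching one.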

    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, t;

    #if __CUDA_ARCH__ >= 300
        r = vseteq4(a, b);
        t = r << 8;         // convert bool
        r = t - r;          //  to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        t = a ^ b;          // 0x00 if a == b
        r = t | 0x80808080; // set msbs, to catch carry out
        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
        r = t & ~r;         // msb = 1, if t was 0x00
        t = r >> 7;         // build mask
        t = r - t;          //  from
        r = t | r;          //   msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

    #if __CUDA_ARCH__ >= 300
        r = vsetge4(a, b);
        s = r << 8;         // convert bool
        r = s - r;          //  to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        r = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80808080; // msb = carry-outs
        s = r >> 7;         // build mask
        s = r - s;          //  from
        r = s | r;          //   msbs
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          //  to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          //  msbs to
        r = c | r;          //   mask
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a ^ b;          //
        s = (r &  s) ^ b;   // select a when a >= b, else select b => max(a,b)
        r = s ^ r;          // select a when b >= a, else select b => min(a,b)
        r = s - r;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a & s;          // select a when a >= b
        s = b & ~s;         // select b when a < b
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned maximum
    }

    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(b, a);  // mask = 0xff if b >= a
        r = a & s;          // select a when b >= a
        s = b & ~s;         // select b when b < a
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned minimum
    }
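
    // Usage sketch (illustrative only, not part of the original API): summing the four
    // per-byte absolute differences of two packed-byte words, the inner step of a SAD
    // (sum of absolute differences) computation. The helper name sad4_example is
    // hypothetical.
    static __device__ __forceinline__ unsigned int sad4_example(unsigned int a, unsigned int b)
    {
        unsigned int d = vabsdiff4(a, b);              // per-byte |a - b|
        return (d & 0xff) + ((d >> 8) & 0xff)          // horizontal sum of the
             + ((d >> 16) & 0xff) + (d >> 24);         //  four byte lanes
    }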

}}}

//! @endcond

#endif // __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__