/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
 * Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 *   Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 *   Neither the name of NVIDIA Corporation nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__
#define __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__

#include "common.hpp"

/** @file
 * @deprecated Use @ref cudev instead.
 */

//! @cond IGNORED

// SIMD-within-a-register helpers: each function treats a 32-bit register as
// either two packed 16-bit halfwords ("2" suffix) or four packed 8-bit bytes
// ("4" suffix) and operates on all lanes at once.
//
// Three implementation tiers are selected per compute capability:
//   * __CUDA_ARCH__ >= 300 : native SIMD video instructions (vadd2, vset4, ...)
//   * __CUDA_ARCH__ >= 200 : per-lane scalar video instructions (vadd.u32...)
//   * otherwise            : portable bit-twiddling emulation
// The video instructions require a third source operand; the (zero-initialised)
// result register r is passed there as "%3".

namespace cv { namespace cuda { namespace device
{
    // ---------- operations on two packed 16-bit halfwords ----------

    // Halfword-wise sum: carries do not propagate across the halfword boundary.
    static __device__ __forceinline__ unsigned int vadd2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a + b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // carry-in to high word (= carry-out from low word)
        r = r - s;          // subtract out carry-out from low word
    #endif

        return r;
    }

    // Halfword-wise difference: borrows do not propagate across the halfword boundary.
    static __device__ __forceinline__ unsigned int vsub2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = a ^ b;          // sum bits
        r = a - b;          // actual sum
        s = s ^ r;          // determine carry-ins for each bit position
        s = s & 0x00010000; // borrow to high word
        r = r + s;          // compensate for borrow from low word
    #endif

        return r;
    }

    // Halfword-wise unsigned absolute difference: |a.h - b.h| per lane.
    static __device__ __forceinline__ unsigned int vabsdiff2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff2.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u, v;
        s = a & 0x0000ffff; // extract low halfword
        r = b & 0x0000ffff; // extract low halfword
        u = ::max(r, s);    // maximum of low halfwords
        v = ::min(r, s);    // minimum of low halfwords
        s = a & 0xffff0000; // extract high halfword
        r = b & 0xffff0000; // extract high halfword
        t = ::max(r, s);    // maximum of high halfwords
        s = ::min(r, s);    // minimum of high halfwords
        r = u | t;          // maximum of both halfwords
        s = v | s;          // minimum of both halfwords
        r = r - s;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    // Halfword-wise average rounded down: (a.h + b.h) / 2 per lane.
    static __device__ __forceinline__ unsigned int vavg2(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfffefffe; // ensure shift doesn't cross halfword boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    // Halfword-wise average rounded up: (a.h + b.h + 1) / 2 per lane.
    static __device__ __forceinline__ unsigned int vavrg2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int s;
        s = a ^ b;
        r = a | b;
        s = s & 0xfffefffe; // ensure shift doesn't cross half-word boundaries
        s = s >> 1;
        r = r - s;
    #endif

        return r;
    }

    // Halfword-wise equality test: each lane is 0x0001 if equal, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vseteq2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    // Halfword-wise equality mask: each lane is 0xffff if equal, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpeq2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vseteq2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x8000
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r & ~c;         // msb = 1, if r was 0x0000
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Halfword-wise unsigned a >= b: each lane is 0x0001 if true, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vsetge2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    // Halfword-wise unsigned a >= b: each lane is 0xffff if true, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpge2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetge2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg2(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Halfword-wise unsigned a > b: each lane is 0x0001 if true, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vsetgt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    // Halfword-wise unsigned a > b: each lane is 0xffff if true, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpgt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg2(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80008000; // msbs = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Halfword-wise unsigned a <= b: each lane is 0x0001 if true, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vsetle2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    // Halfword-wise unsigned a <= b: each lane is 0xffff if true, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmple2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg2(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Halfword-wise unsigned a < b: each lane is 0x0001 if true, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vsetlt2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset2.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    // Halfword-wise unsigned a < b: each lane is 0xffff if true, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmplt2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg2(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80008000; // msb = carry-outs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Halfword-wise inequality test: each lane is 0x0001 if different, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vsetne2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm ("vset2.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert to bool
    #endif

        return r;
    }

    // Halfword-wise inequality mask: each lane is 0xffff if different, 0x0000 otherwise.
    static __device__ __forceinline__ unsigned int vcmpne2(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne2(a, b);
        c = r << 16;        // convert bool
        r = c - r;          // into mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x0000 if a == b
        c = r | 0x80008000; // set msbs, to catch carry out
        c = c - 0x00010001; // msb = 0, if r was 0x0000 or 0x8000
        c = r | c;          // msb = 1, if r was not 0x0000
        c = c & 0x80008000; // extract msbs
        r = c >> 15;        // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Halfword-wise unsigned maximum.
    static __device__ __forceinline__ unsigned int vmax2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::max(r, s);    // maximum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::max(r, s);    // maximum of high halfwords
        r = t | u;          // combine halfword maximums
    #endif

        return r;
    }

    // Halfword-wise unsigned minimum.
    static __device__ __forceinline__ unsigned int vmin2(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin2.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.h0, %1.h0, %2.h0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.h1, %1.h1, %2.h1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t, u;
        r = a & 0x0000ffff; // extract low halfword
        s = b & 0x0000ffff; // extract low halfword
        t = ::min(r, s);    // minimum of low halfwords
        r = a & 0xffff0000; // extract high halfword
        s = b & 0xffff0000; // extract high halfword
        u = ::min(r, s);    // minimum of high halfwords
        r = t | u;          // combine halfword minimums
    #endif

        return r;
    }

    // ---------- operations on four packed 8-bit bytes ----------

    // Byte-wise sum: carries do not propagate across byte boundaries.
    static __device__ __forceinline__ unsigned int vadd4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vadd4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vadd.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vadd.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ b;          // sum bits
        r = a & 0x7f7f7f7f; // clear msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // msb sum bits
        r = r + t;          // add without msbs, record carry-out in msbs
        r = r ^ s;          // sum of msb sum and carry-in bits, w/o carry-out
    #endif /* __CUDA_ARCH__ >= 300 */

        return r;
    }

    // Byte-wise difference: borrows do not propagate across byte boundaries.
    static __device__ __forceinline__ unsigned int vsub4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vsub4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vsub.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vsub.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s, t;
        s = a ^ ~b;         // inverted sum bits
        r = a | 0x80808080; // set msbs
        t = b & 0x7f7f7f7f; // clear msbs
        s = s & 0x80808080; // inverted msb sum bits
        r = r - t;          // subtract w/o msbs, record inverted borrows in msb
        r = r ^ s;          // combine inverted msb sum bits and borrows
    #endif

        return r;
    }

    // Byte-wise average rounded down: (a.b + b.b) / 2 per lane.
    static __device__ __forceinline__ unsigned int vavg4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

        // HAKMEM #23: a + b = 2 * (a & b) + (a ^ b) ==>
        // (a + b) / 2 = (a & b) + ((a ^ b) >> 1)
        s = a ^ b;
        r = a & b;
        s = s & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        s = s >> 1;
        s = r + s;

        return s;
    }

    // Byte-wise average rounded up: (a.b + b.b + 1) / 2 per lane.
    static __device__ __forceinline__ unsigned int vavrg4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vavrg4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // HAKMEM #23: a + b = 2 * (a | b) - (a ^ b) ==>
        // (a + b + 1) / 2 = (a | b) - ((a ^ b) >> 1)
        unsigned int c;
        c = a ^ b;
        r = a | b;
        c = c & 0xfefefefe; // ensure following shift doesn't cross byte boundaries
        c = c >> 1;
        r = r - c;
    #endif

        return r;
    }

    // Byte-wise equality test: each lane is 0x01 if equal, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vseteq4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.eq %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        r = r ^ c;          // extract msbs, msb = 1 if r < 0x80
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r & ~c;         // msb = 1, if r was 0x00
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    // Byte-wise equality mask: each lane is 0xff if equal, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vcmpeq4(unsigned int a, unsigned int b)
    {
        unsigned int r, t;

    #if __CUDA_ARCH__ >= 300
        r = vseteq4(a, b);
        t = r << 8;         // convert bool
        r = t - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        t = a ^ b;          // 0x00 if a == b
        r = t | 0x80808080; // set msbs, to catch carry out
        t = t ^ r;          // extract msbs, msb = 1 if t < 0x80
        r = r - 0x01010101; // msb = 0, if t was 0x00 or 0x80
        r = t & ~r;         // msb = 1, if t was 0x00
        t = r >> 7;         // build mask
        t = r - t;          // from
        r = t | r;          // msbs
    #endif

        return r;
    }

    // Byte-wise unsigned a <= b: each lane is 0x01 if true, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vsetle4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.le %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    // Byte-wise unsigned a <= b: each lane is 0xff if true, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vcmple4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetle4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavrg4(a, b);   // (b + ~a + 1) / 2 = (b - a) / 2
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Byte-wise unsigned a < b: each lane is 0x01 if true, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vsetlt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.lt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    // Byte-wise unsigned a < b: each lane is 0xff if true, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vcmplt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetlt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(a));
        c = vavg4(a, b);    // (b + ~a) / 2 = (b - a) / 2 [rounded down]
        c = c & 0x80808080; // msbs = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Byte-wise unsigned a >= b: each lane is 0x01 if true, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vsetge4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ge %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavrg4(a, b);   // (a + ~b + 1) / 2 = (a - b) / 2
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    // Byte-wise unsigned a >= b: each lane is 0xff if true, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vcmpge4(unsigned int a, unsigned int b)
    {
        unsigned int r, s;

    #if __CUDA_ARCH__ >= 300
        r = vsetge4(a, b);
        s = r << 8;         // convert bool
        r = s - r;          // to mask
    #else
        asm ("not.b32 %0,%0;" : "+r"(b));
        r = vavrg4 (a, b);  // (a + ~b + 1) / 2 = (a - b) / 2
        r = r & 0x80808080; // msb = carry-outs
        s = r >> 7;         // build mask
        s = r - s;          // from
        r = s | r;          // msbs
    #endif

        return r;
    }

    // Byte-wise unsigned a > b: each lane is 0x01 if true, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vsetgt4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.gt %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int c;
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    // Byte-wise unsigned a > b: each lane is 0xff if true, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vcmpgt4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetgt4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        asm("not.b32 %0, %0;" : "+r"(b));
        c = vavg4(a, b);    // (a + ~b) / 2 = (a - b) / 2 [rounded down]
        c = c & 0x80808080; // msb = carry-outs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Byte-wise inequality test: each lane is 0x01 if different, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vsetne4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vset4.u32.u32.ne %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        unsigned int c;
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert to bool
    #endif

        return r;
    }

    // Byte-wise inequality mask: each lane is 0xff if different, 0x00 otherwise.
    static __device__ __forceinline__ unsigned int vcmpne4(unsigned int a, unsigned int b)
    {
        unsigned int r, c;

    #if __CUDA_ARCH__ >= 300
        r = vsetne4(a, b);
        c = r << 8;         // convert bool
        r = c - r;          // to mask
    #else
        // inspired by Alan Mycroft's null-byte detection algorithm:
        // null_byte(x) = ((x - 0x01010101) & (~x & 0x80808080))
        r = a ^ b;          // 0x00 if a == b
        c = r | 0x80808080; // set msbs, to catch carry out
        c = c - 0x01010101; // msb = 0, if r was 0x00 or 0x80
        c = r | c;          // msb = 1, if r was not 0x00
        c = c & 0x80808080; // extract msbs
        r = c >> 7;         // convert
        r = c - r;          // msbs to
        r = c | r;          // mask
    #endif

        return r;
    }

    // Byte-wise unsigned absolute difference: |a.b - b.b| per lane.
    static __device__ __forceinline__ unsigned int vabsdiff4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vabsdiff4.u32.u32.u32.sat %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vabsdiff.u32.u32.u32.sat %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vabsdiff.u32.u32.u32.sat %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a ^ b;          //
        s = (r & s) ^ b;    // select a when a >= b, else select b => max(a,b)
        r = s ^ r;          // select a when b >= a, else select b => min(a,b)
        r = s - r;          // |a - b| = max(a,b) - min(a,b);
    #endif

        return r;
    }

    // Byte-wise unsigned maximum.
    static __device__ __forceinline__ unsigned int vmax4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmax.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(a, b);  // mask = 0xff if a >= b
        r = a & s;          // select a when a >= b
        s = b & ~s;         // select b when a < b
        r = r | s;          // combine byte selections
    #endif

        return r;           // byte-wise unsigned maximum
    }

    // Byte-wise unsigned minimum.
    static __device__ __forceinline__ unsigned int vmin4(unsigned int a, unsigned int b)
    {
        unsigned int r = 0;

    #if __CUDA_ARCH__ >= 300
        asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #elif __CUDA_ARCH__ >= 200
        asm("vmin.u32.u32.u32 %0.b0, %1.b0, %2.b0, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
        asm("vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(r));
    #else
        unsigned int s;
        s = vcmpge4(b, a);  // mask = 0xff if b >= a
        r = a & s;          // select a when b >= a
        s = b & ~s;         // select b when b < a
        r = r | s;          // combine byte selections
    #endif

        return r;
    }
}}} // namespace cv { namespace cuda { namespace device

//! @endcond

#endif // __OPENCV_CUDA_SIMD_FUNCTIONS_HPP__