1// This sample demonstrates working on one piece of data using two GPUs. 2// It splits input into two parts and processes them separately on different GPUs. 3 4#ifdef WIN32 5 #define NOMINMAX 6 #include <windows.h> 7#else 8 #include <pthread.h> 9 #include <unistd.h> 10#endif 11 12#include <iostream> 13#include <iomanip> 14 15#include "opencv2/core.hpp" 16#include "opencv2/highgui.hpp" 17#include "opencv2/imgproc.hpp" 18#include "opencv2/cudastereo.hpp" 19 20#include "tick_meter.hpp" 21 22using namespace std; 23using namespace cv; 24using namespace cv::cuda; 25 26/////////////////////////////////////////////////////////// 27// Thread 28// OS-specific wrappers for multi-threading 29 30#ifdef WIN32 31class Thread 32{ 33 struct UserData 34 { 35 void (*func)(void* userData); 36 void* param; 37 }; 38 39 static DWORD WINAPI WinThreadFunction(LPVOID lpParam) 40 { 41 UserData* userData = static_cast<UserData*>(lpParam); 42 43 userData->func(userData->param); 44 45 return 0; 46 } 47 48 UserData userData_; 49 HANDLE thread_; 50 DWORD threadId_; 51 52public: 53 Thread(void (*func)(void* userData), void* userData) 54 { 55 userData_.func = func; 56 userData_.param = userData; 57 58 thread_ = CreateThread( 59 NULL, // default security attributes 60 0, // use default stack size 61 WinThreadFunction, // thread function name 62 &userData_, // argument to thread function 63 0, // use default creation flags 64 &threadId_); // returns the thread identifier 65 } 66 67 ~Thread() 68 { 69 CloseHandle(thread_); 70 } 71 72 void wait() 73 { 74 WaitForSingleObject(thread_, INFINITE); 75 } 76}; 77#else 78class Thread 79{ 80 struct UserData 81 { 82 void (*func)(void* userData); 83 void* param; 84 }; 85 86 static void* PThreadFunction(void* lpParam) 87 { 88 UserData* userData = static_cast<UserData*>(lpParam); 89 90 userData->func(userData->param); 91 92 return 0; 93 } 94 95 pthread_t thread_; 96 UserData userData_; 97 98public: 99 Thread(void (*func)(void* userData), void* userData) 100 { 101 userData_.func = func; 102 userData_.param = userData; 103 104 pthread_create(&thread_, NULL, PThreadFunction, &userData_); 105 } 106 107 ~Thread() 108 { 109 pthread_detach(thread_); 110 } 111 112 void wait() 113 { 114 pthread_join(thread_, NULL); 115 } 116}; 117#endif 118 119/////////////////////////////////////////////////////////// 120// StereoSingleGpu 121// Run Stereo algorithm on single GPU 122 123class StereoSingleGpu 124{ 125public: 126 explicit StereoSingleGpu(int deviceId = 0); 127 ~StereoSingleGpu(); 128 129 void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity); 130 131private: 132 int deviceId_; 133 GpuMat d_leftFrame; 134 GpuMat d_rightFrame; 135 GpuMat d_disparity; 136 Ptr<cuda::StereoBM> d_alg; 137}; 138 139StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId) 140{ 141 cuda::setDevice(deviceId_); 142 d_alg = cuda::createStereoBM(256); 143} 144 145StereoSingleGpu::~StereoSingleGpu() 146{ 147 cuda::setDevice(deviceId_); 148 d_leftFrame.release(); 149 d_rightFrame.release(); 150 d_disparity.release(); 151 d_alg.release(); 152} 153 154void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity) 155{ 156 cuda::setDevice(deviceId_); 157 d_leftFrame.upload(leftFrame); 158 d_rightFrame.upload(rightFrame); 159 d_alg->compute(d_leftFrame, d_rightFrame, d_disparity); 160 d_disparity.download(disparity); 161} 162 163/////////////////////////////////////////////////////////// 164// StereoMultiGpuThread 165// Run Stereo algorithm on two GPUs using different host threads 166 167class StereoMultiGpuThread 168{ 169public: 170 StereoMultiGpuThread(); 171 ~StereoMultiGpuThread(); 172 173 void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity); 174 175private: 176 GpuMat d_leftFrames[2]; 177 GpuMat d_rightFrames[2]; 178 GpuMat d_disparities[2]; 179 Ptr<cuda::StereoBM> d_algs[2]; 180 181 struct StereoLaunchData 182 { 183 int deviceId; 184 Mat leftFrame; 185 Mat rightFrame; 186 Mat disparity; 187 GpuMat* d_leftFrame; 188 GpuMat* d_rightFrame; 189 GpuMat* d_disparity; 190 Ptr<cuda::StereoBM> d_alg; 191 }; 192 193 static void launchGpuStereoAlg(void* userData); 194}; 195 196StereoMultiGpuThread::StereoMultiGpuThread() 197{ 198 cuda::setDevice(0); 199 d_algs[0] = cuda::createStereoBM(256); 200 201 cuda::setDevice(1); 202 d_algs[1] = cuda::createStereoBM(256); 203} 204 205StereoMultiGpuThread::~StereoMultiGpuThread() 206{ 207 cuda::setDevice(0); 208 d_leftFrames[0].release(); 209 d_rightFrames[0].release(); 210 d_disparities[0].release(); 211 d_algs[0].release(); 212 213 cuda::setDevice(1); 214 d_leftFrames[1].release(); 215 d_rightFrames[1].release(); 216 d_disparities[1].release(); 217 d_algs[1].release(); 218} 219 220void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity) 221{ 222 disparity.create(leftFrame.size(), CV_8UC1); 223 224 // Split input data onto two parts for each GPUs. 225 // We add small border for each part, 226 // because original algorithm doesn't calculate disparity on image borders. 227 // With such padding we will get output in the middle of final result. 228 229 StereoLaunchData launchDatas[2]; 230 231 launchDatas[0].deviceId = 0; 232 launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32); 233 launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32); 234 launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2); 235 launchDatas[0].d_leftFrame = &d_leftFrames[0]; 236 launchDatas[0].d_rightFrame = &d_rightFrames[0]; 237 launchDatas[0].d_disparity = &d_disparities[0]; 238 launchDatas[0].d_alg = d_algs[0]; 239 240 launchDatas[1].deviceId = 1; 241 launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows); 242 launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows); 243 launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows); 244 launchDatas[1].d_leftFrame = &d_leftFrames[1]; 245 launchDatas[1].d_rightFrame = &d_rightFrames[1]; 246 launchDatas[1].d_disparity = &d_disparities[1]; 247 launchDatas[1].d_alg = d_algs[1]; 248 249 Thread thread0(launchGpuStereoAlg, &launchDatas[0]); 250 Thread thread1(launchGpuStereoAlg, &launchDatas[1]); 251 252 thread0.wait(); 253 thread1.wait(); 254} 255 256void StereoMultiGpuThread::launchGpuStereoAlg(void* userData) 257{ 258 StereoLaunchData* data = static_cast<StereoLaunchData*>(userData); 259 260 cuda::setDevice(data->deviceId); 261 data->d_leftFrame->upload(data->leftFrame); 262 data->d_rightFrame->upload(data->rightFrame); 263 data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity); 264 265 if (data->deviceId == 0) 266 data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity); 267 else 268 data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity); 269} 270 271/////////////////////////////////////////////////////////// 272// StereoMultiGpuStream 273// Run Stereo algorithm on two GPUs from single host thread using async API 274 275class StereoMultiGpuStream 276{ 277public: 278 StereoMultiGpuStream(); 279 ~StereoMultiGpuStream(); 280 281 void compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity); 282 283private: 284 GpuMat d_leftFrames[2]; 285 GpuMat d_rightFrames[2]; 286 GpuMat d_disparities[2]; 287 Ptr<cuda::StereoBM> d_algs[2]; 288 Ptr<Stream> streams[2]; 289}; 290 291StereoMultiGpuStream::StereoMultiGpuStream() 292{ 293 cuda::setDevice(0); 294 d_algs[0] = cuda::createStereoBM(256); 295 streams[0] = makePtr<Stream>(); 296 297 cuda::setDevice(1); 298 d_algs[1] = cuda::createStereoBM(256); 299 streams[1] = makePtr<Stream>(); 300} 301 302StereoMultiGpuStream::~StereoMultiGpuStream() 303{ 304 cuda::setDevice(0); 305 d_leftFrames[0].release(); 306 d_rightFrames[0].release(); 307 d_disparities[0].release(); 308 d_algs[0].release(); 309 streams[0].release(); 310 311 cuda::setDevice(1); 312 d_leftFrames[1].release(); 313 d_rightFrames[1].release(); 314 d_disparities[1].release(); 315 d_algs[1].release(); 316 streams[1].release(); 317} 318 319void StereoMultiGpuStream::compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity) 320{ 321 disparity.create(leftFrame.size(), CV_8UC1); 322 323 // Split input data onto two parts for each GPUs. 324 // We add small border for each part, 325 // because original algorithm doesn't calculate disparity on image borders. 326 // With such padding we will get output in the middle of final result. 327 328 Mat leftFrameHdr = leftFrame.createMatHeader(); 329 Mat rightFrameHdr = rightFrame.createMatHeader(); 330 Mat disparityHdr = disparity.createMatHeader(); 331 Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2); 332 Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows); 333 334 cuda::setDevice(0); 335 d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]); 336 d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]); 337 d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]); 338 d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]); 339 340 cuda::setDevice(1); 341 d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]); 342 d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]); 343 d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]); 344 d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]); 345 346 cuda::setDevice(0); 347 streams[0]->waitForCompletion(); 348 349 cuda::setDevice(1); 350 streams[1]->waitForCompletion(); 351} 352 353/////////////////////////////////////////////////////////// 354// main 355 356int main(int argc, char** argv) 357{ 358 if (argc != 3) 359 { 360 cerr << "Usage: stereo_multi_gpu <left_video> <right_video>" << endl; 361 return -1; 362 } 363 364 const int numDevices = getCudaEnabledDeviceCount(); 365 if (numDevices != 2) 366 { 367 cerr << "Two GPUs are required" << endl; 368 return -1; 369 } 370 371 for (int i = 0; i < numDevices; ++i) 372 { 373 DeviceInfo devInfo(i); 374 if (!devInfo.isCompatible()) 375 { 376 cerr << "CUDA module was't built for GPU #" << i << " (" 377 << devInfo.name() << ", CC " << devInfo.majorVersion() 378 << devInfo.minorVersion() << endl; 379 return -1; 380 } 381 382 printShortCudaDeviceInfo(i); 383 } 384 385 VideoCapture leftVideo(argv[1]); 386 VideoCapture rightVideo(argv[2]); 387 388 if (!leftVideo.isOpened()) 389 { 390 cerr << "Can't open " << argv[1] << " video file" << endl; 391 return -1; 392 } 393 394 if (!rightVideo.isOpened()) 395 { 396 cerr << "Can't open " << argv[2] << " video file" << endl; 397 return -1; 398 } 399 400 cout << endl; 401 cout << "This sample demonstrates working on one piece of data using two GPUs." << endl; 402 cout << "It splits input into two parts and processes them separately on different GPUs." << endl; 403 cout << endl; 404 405 Mat leftFrame, rightFrame; 406 HostMem leftGrayFrame, rightGrayFrame; 407 408 StereoSingleGpu gpu0Alg(0); 409 StereoSingleGpu gpu1Alg(1); 410 StereoMultiGpuThread multiThreadAlg; 411 StereoMultiGpuStream multiStreamAlg; 412 413 Mat disparityGpu0; 414 Mat disparityGpu1; 415 Mat disparityMultiThread; 416 HostMem disparityMultiStream; 417 418 Mat disparityGpu0Show; 419 Mat disparityGpu1Show; 420 Mat disparityMultiThreadShow; 421 Mat disparityMultiStreamShow; 422 423 TickMeter tm; 424 425 cout << "-------------------------------------------------------------------" << endl; 426 cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl; 427 cout << "-------------------------------------------------------------------" << endl; 428 429 for (int i = 0;; ++i) 430 { 431 leftVideo >> leftFrame; 432 rightVideo >> rightFrame; 433 434 if (leftFrame.empty() || rightFrame.empty()) 435 break; 436 437 if (leftFrame.size() != rightFrame.size()) 438 { 439 cerr << "Frames have different sizes" << endl; 440 return -1; 441 } 442 443 leftGrayFrame.create(leftFrame.size(), CV_8UC1); 444 rightGrayFrame.create(leftFrame.size(), CV_8UC1); 445 446 cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY); 447 cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY); 448 449 tm.reset(); tm.start(); 450 gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(), 451 disparityGpu0); 452 tm.stop(); 453 454 const double gpu0Time = tm.getTimeMilli(); 455 456 tm.reset(); tm.start(); 457 gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(), 458 disparityGpu1); 459 tm.stop(); 460 461 const double gpu1Time = tm.getTimeMilli(); 462 463 tm.reset(); tm.start(); 464 multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(), 465 disparityMultiThread); 466 tm.stop(); 467 468 const double multiThreadTime = tm.getTimeMilli(); 469 470 tm.reset(); tm.start(); 471 multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream); 472 tm.stop(); 473 474 const double multiStreamTime = tm.getTimeMilli(); 475 476 cout << "| " << setw(5) << i << " | " 477 << setw(8) << setprecision(1) << fixed << gpu0Time << " | " 478 << setw(8) << setprecision(1) << fixed << gpu1Time << " | " 479 << setw(15) << setprecision(1) << fixed << multiThreadTime << " | " 480 << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl; 481 482 resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA); 483 resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA); 484 resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA); 485 resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA); 486 487 imshow("disparityGpu0", disparityGpu0Show); 488 imshow("disparityGpu1", disparityGpu1Show); 489 imshow("disparityMultiThread", disparityMultiThreadShow); 490 imshow("disparityMultiStream", disparityMultiStreamShow); 491 492 const int key = waitKey(30) & 0xff; 493 if (key == 27) 494 break; 495 } 496 497 cout << "-------------------------------------------------------------------" << endl; 498 499 return 0; 500} 501