1793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler/* This sample demonstrates the way you can perform independed tasks 2793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler on the different GPUs */ 3793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 4793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Disable some warnings which are caused with CUDA headers 5793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(_MSC_VER) 6793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#pragma warning(disable: 4201 4408 4100) 7793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif 8793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 9793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <iostream> 10793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "cvconfig.h" 11793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "opencv2/core/core.hpp" 12793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "opencv2/cudaarithm.hpp" 13793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 14793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#ifdef HAVE_TBB 15793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# include "tbb/tbb_stddef.h" 16793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202 17793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# include "tbb/tbb.h" 18793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# include "tbb/task.h" 19793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# undef min 20793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# undef max 21793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# else 22793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# undef HAVE_TBB 23793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler# endif 24793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif 25793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 26793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if !defined(HAVE_CUDA) || !defined(HAVE_TBB) || defined(__arm__) 27793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 28793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerint main() 29793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{ 30793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if !defined(HAVE_CUDA) 31793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n"; 32793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif 33793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 34793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if !defined(HAVE_TBB) 35793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n"; 36793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif 37793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 38793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(__arm__) 39793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler std::cout << "Unsupported for ARM CUDA library." << std::endl; 40793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif 41793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 42793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return 0; 43793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 44793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 45793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#else 46793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 47793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <cuda.h> 48793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <cuda_runtime.h> 49793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 50793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerusing namespace std; 51793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerusing namespace cv; 52793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerusing namespace cv::cuda; 53793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 54793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerstruct Worker { void operator()(int device_id) const; }; 55793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslervoid destroyContexts(); 56793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 57793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define safeCall(expr) safeCall_(expr, #expr, __FILE__, __LINE__) 58793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerinline void safeCall_(int code, const char* expr, const char* file, int line) 59793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{ 60793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (code != CUDA_SUCCESS) 61793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 62793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler std::cout << "CUDA driver API error: code " << code << ", expr " << expr 63793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler << ", file " << file << ", line " << line << endl; 64793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler destroyContexts(); 65793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler exit(-1); 66793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 67793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 68793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 69793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Each GPU is associated with its own context 70793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerCUcontext contexts[2]; 71793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 72793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerint main() 73793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{ 74793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int num_devices = getCudaEnabledDeviceCount(); 75793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (num_devices < 2) 76793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 77793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler std::cout << "Two or more GPUs are required\n"; 78793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return -1; 79793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 80793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 81793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler for (int i = 0; i < num_devices; ++i) 82793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 83793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cv::cuda::printShortCudaDeviceInfo(i); 84793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 85793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler DeviceInfo dev_info(i); 86793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler if (!dev_info.isCompatible()) 87793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler { 88793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler std::cout << "CUDA module isn't built for GPU #" << i << " (" 89793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler << dev_info.name() << ", CC " << dev_info.majorVersion() 90793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler << dev_info.minorVersion() << "\n"; 91793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return -1; 92793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 93793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler } 94793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 95793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Init CUDA Driver API 96793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuInit(0)); 97793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 98793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Create context for GPU #0 99793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CUdevice device; 100793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuDeviceGet(&device, 0)); 101793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuCtxCreate(&contexts[0], 0, device)); 102793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 103793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CUcontext prev_context; 104793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuCtxPopCurrent(&prev_context)); 105793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 106793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Create context for GPU #1 107793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuDeviceGet(&device, 1)); 108793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuCtxCreate(&contexts[1], 0, device)); 109793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 110793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuCtxPopCurrent(&prev_context)); 111793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 112793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Execute calculation in two threads using two GPUs 113793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler int devices[] = {0, 1}; 114793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler tbb::parallel_do(devices, devices + 2, Worker()); 115793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 116793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler destroyContexts(); 117793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler return 0; 118793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 119793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 120793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 121793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslervoid Worker::operator()(int device_id) const 122793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{ 123793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Set the proper context 124793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuCtxPushCurrent(contexts[device_id])); 125793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 126793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler Mat src(1000, 1000, CV_32F); 127793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler Mat dst; 128793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 129793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler RNG rng(0); 130793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler rng.fill(src, RNG::UNIFORM, 0, 1); 131793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 132793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // CPU works 133793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cv::transpose(src, dst); 134793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 135793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // GPU works 136793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler GpuMat d_src(src); 137793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler GpuMat d_dst; 138793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler cuda::transpose(d_src, d_dst); 139793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 140793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Check results 141793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler bool passed = cv::norm(dst - Mat(d_dst), NORM_INF) < 1e-3; 142793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): " 143793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler << (passed ? "passed" : "FAILED") << endl; 144793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 145793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // Deallocate data here, otherwise deallocation will be performed 146793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler // after context is extracted from the stack 147793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler d_src.release(); 148793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler d_dst.release(); 149793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 150793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler CUcontext prev_context; 151793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuCtxPopCurrent(&prev_context)); 152793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 153793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 154793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 155793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslervoid destroyContexts() 156793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{ 157793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuCtxDestroy(contexts[0])); 158793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler safeCall(cuCtxDestroy(contexts[1])); 159793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler} 160793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler 161793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif 162