/* This sample demonstrates how to perform independent tasks
   on different GPUs */
3793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
// Disable some warnings which are caused by CUDA headers
5793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if defined(_MSC_VER)
6793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#pragma warning(disable: 4201 4408 4100)
7793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif
8793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
9793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <iostream>
10793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "cvconfig.h"
11793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "opencv2/core/core.hpp"
12793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include "opencv2/cudaarithm.hpp"
13793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
14793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#ifdef HAVE_TBB
15793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#  include "tbb/tbb_stddef.h"
16793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#  if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202
17793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#    include "tbb/tbb.h"
18793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#    include "tbb/task.h"
19793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#    undef min
20793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#    undef max
21793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#  else
22793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#    undef HAVE_TBB
23793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#  endif
24793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif
25793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
26793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#if !defined(HAVE_CUDA) || !defined(HAVE_TBB) || defined(__arm__)
27793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
// Fallback entry point, compiled when the sample cannot be built for real
// (no CUDA, no usable TBB, or an ARM target). It prints one line for each
// unmet requirement and exits with status 0.
int main()
{
#ifndef HAVE_CUDA
    std::cout << "CUDA support is required (CMake key 'WITH_CUDA' must be true).\n";
#endif

#ifndef HAVE_TBB
    std::cout << "TBB support is required (CMake key 'WITH_TBB' must be true).\n";
#endif

#ifdef __arm__
    std::cout << "Unsupported for ARM CUDA library." << std::endl;
#endif

    return 0;
}
44793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
45793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#else
46793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
47793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <cuda.h>
48793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#include <cuda_runtime.h>
49793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
50793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerusing namespace std;
51793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerusing namespace cv;
52793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerusing namespace cv::cuda;
53793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
// Functor handed to tbb::parallel_do; it is invoked once per GPU index.
struct Worker
{
    void operator()(int device_id) const;
};

// Destroys the driver API contexts stored in the global `contexts` array.
void destroyContexts();
56793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
57793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#define safeCall(expr) safeCall_(expr, #expr, __FILE__, __LINE__)
58793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerinline void safeCall_(int code, const char* expr, const char* file, int line)
59793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{
60793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    if (code != CUDA_SUCCESS)
61793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    {
62793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        std::cout << "CUDA driver API error: code " << code << ", expr " << expr
63793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            << ", file " << file << ", line " << line << endl;
64793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        destroyContexts();
65793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        exit(-1);
66793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    }
67793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler}
68793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
69793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler// Each GPU is associated with its own context
70793ee12c6df9cad3806238d32528c49a3ff9331dNoah PreslerCUcontext contexts[2];
71793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
72793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslerint main()
73793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{
74793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    int num_devices = getCudaEnabledDeviceCount();
75793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    if (num_devices < 2)
76793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    {
77793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        std::cout << "Two or more GPUs are required\n";
78793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        return -1;
79793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    }
80793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
81793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    for (int i = 0; i < num_devices; ++i)
82793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    {
83793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        cv::cuda::printShortCudaDeviceInfo(i);
84793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
85793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        DeviceInfo dev_info(i);
86793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        if (!dev_info.isCompatible())
87793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        {
88793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            std::cout << "CUDA module isn't built for GPU #" << i << " ("
89793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                 << dev_info.name() << ", CC " << dev_info.majorVersion()
90793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler                 << dev_info.minorVersion() << "\n";
91793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler            return -1;
92793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        }
93793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    }
94793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
95793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // Init CUDA Driver API
96793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuInit(0));
97793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
98793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // Create context for GPU #0
99793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    CUdevice device;
100793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuDeviceGet(&device, 0));
101793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuCtxCreate(&contexts[0], 0, device));
102793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
103793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    CUcontext prev_context;
104793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuCtxPopCurrent(&prev_context));
105793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
106793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // Create context for GPU #1
107793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuDeviceGet(&device, 1));
108793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuCtxCreate(&contexts[1], 0, device));
109793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
110793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuCtxPopCurrent(&prev_context));
111793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
112793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // Execute calculation in two threads using two GPUs
113793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    int devices[] = {0, 1};
114793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    tbb::parallel_do(devices, devices + 2, Worker());
115793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
116793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    destroyContexts();
117793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    return 0;
118793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler}
119793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
120793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
121793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslervoid Worker::operator()(int device_id) const
122793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{
123793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // Set the proper context
124793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuCtxPushCurrent(contexts[device_id]));
125793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
126793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    Mat src(1000, 1000, CV_32F);
127793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    Mat dst;
128793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
129793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    RNG rng(0);
130793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    rng.fill(src, RNG::UNIFORM, 0, 1);
131793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
132793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // CPU works
133793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    cv::transpose(src, dst);
134793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
135793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // GPU works
136793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    GpuMat d_src(src);
137793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    GpuMat d_dst;
138793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    cuda::transpose(d_src, d_dst);
139793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
140793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // Check results
141793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    bool passed = cv::norm(dst - Mat(d_dst), NORM_INF) < 1e-3;
142793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    std::cout << "GPU #" << device_id << " (" << DeviceInfo().name() << "): "
143793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler        << (passed ? "passed" : "FAILED") << endl;
144793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
145793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // Deallocate data here, otherwise deallocation will be performed
146793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    // after context is extracted from the stack
147793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    d_src.release();
148793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    d_dst.release();
149793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
150793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    CUcontext prev_context;
151793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuCtxPopCurrent(&prev_context));
152793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler}
153793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
154793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
155793ee12c6df9cad3806238d32528c49a3ff9331dNoah Preslervoid destroyContexts()
156793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler{
157793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuCtxDestroy(contexts[0]));
158793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler    safeCall(cuCtxDestroy(contexts[1]));
159793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler}
160793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler
161793ee12c6df9cad3806238d32528c49a3ff9331dNoah Presler#endif
162