1// This sample demonstrates working on one piece of data using two GPUs.
2// It splits input into two parts and processes them separately on different GPUs.
3
4#ifdef WIN32
5    #define NOMINMAX
6    #include <windows.h>
7#else
8    #include <pthread.h>
9    #include <unistd.h>
10#endif
11
#include <iomanip>
#include <iostream>
#include <stdexcept>
14
15#include "opencv2/core.hpp"
16#include "opencv2/highgui.hpp"
17#include "opencv2/imgproc.hpp"
18#include "opencv2/cudastereo.hpp"
19
20#include "tick_meter.hpp"
21
22using namespace std;
23using namespace cv;
24using namespace cv::cuda;
25
26///////////////////////////////////////////////////////////
27// Thread
28// OS-specific wrappers for multi-threading
29
#ifdef WIN32
// Minimal joinable thread built on the Win32 API.
//
// The constructor starts a new thread that calls func(userData) once.
// wait() blocks until that call has returned and may be called repeatedly.
// The destructor also waits, because the running thread reads userData_,
// which lives inside this object; destroying the object earlier would leave
// the thread with a dangling pointer.
//
// Throws std::runtime_error if the thread cannot be created.
class Thread
{
    struct UserData
    {
        void (*func)(void* userData);
        void* param;
    };

    // Trampoline with the signature CreateThread expects.
    static DWORD WINAPI WinThreadFunction(LPVOID lpParam)
    {
        UserData* userData = static_cast<UserData*>(lpParam);

        userData->func(userData->param);

        return 0;
    }

    UserData userData_;
    HANDLE thread_;
    DWORD threadId_;

    // Non-copyable (declared, never defined): the thread keeps a pointer to
    // this object's userData_, and the handle must be closed exactly once.
    Thread(const Thread&);
    Thread& operator=(const Thread&);

public:
    Thread(void (*func)(void* userData), void* userData)
        : thread_(NULL), threadId_(0)
    {
        userData_.func = func;
        userData_.param = userData;

        thread_ = CreateThread(
            NULL,                   // default security attributes
            0,                      // use default stack size
            WinThreadFunction,      // thread function name
            &userData_,             // argument to thread function
            0,                      // use default creation flags
            &threadId_);            // returns the thread identifier

        if (thread_ == NULL)
            throw std::runtime_error("Thread: CreateThread failed");
    }

    ~Thread()
    {
        // Make sure the thread no longer touches userData_ before it goes away.
        wait();
        CloseHandle(thread_);
    }

    // Blocks until the thread function has returned.
    void wait()
    {
        WaitForSingleObject(thread_, INFINITE);
    }
};
#else
// Minimal joinable thread built on pthreads.
//
// The constructor starts a new thread that calls func(userData) once.
// wait() blocks until that call has returned; further calls are no-ops.
// The destructor joins the thread if wait() was never called, because the
// running thread reads userData_, which lives inside this object.
//
// Throws std::runtime_error if the thread cannot be created.
class Thread
{
    struct UserData
    {
        void (*func)(void* userData);
        void* param;
    };

    // Trampoline with the signature pthread_create expects.
    static void* PThreadFunction(void* lpParam)
    {
        UserData* userData = static_cast<UserData*>(lpParam);

        userData->func(userData->param);

        return 0;
    }

    pthread_t thread_;
    UserData userData_;
    bool joined_;   // true once pthread_join has been called on thread_

    // Non-copyable (declared, never defined): the thread keeps a pointer to
    // this object's userData_, and it must be joined exactly once.
    Thread(const Thread&);
    Thread& operator=(const Thread&);

public:
    Thread(void (*func)(void* userData), void* userData)
        : joined_(false)
    {
        userData_.func = func;
        userData_.param = userData;

        if (pthread_create(&thread_, NULL, PThreadFunction, &userData_) != 0)
            throw std::runtime_error("Thread: pthread_create failed");
    }

    ~Thread()
    {
        // Join instead of detaching: detaching an already joined thread is
        // undefined, and a detached thread could outlive userData_.
        wait();
    }

    // Blocks until the thread function has returned.
    void wait()
    {
        if (joined_)
            return;

        pthread_join(thread_, NULL);
        joined_ = true;
    }
};
#endif
118
119///////////////////////////////////////////////////////////
120// StereoSingleGpu
121// Run Stereo algorithm on single GPU
122
123class StereoSingleGpu
124{
125public:
126    explicit StereoSingleGpu(int deviceId = 0);
127    ~StereoSingleGpu();
128
129    void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
130
131private:
132    int deviceId_;
133    GpuMat d_leftFrame;
134    GpuMat d_rightFrame;
135    GpuMat d_disparity;
136    Ptr<cuda::StereoBM> d_alg;
137};
138
139StereoSingleGpu::StereoSingleGpu(int deviceId) : deviceId_(deviceId)
140{
141    cuda::setDevice(deviceId_);
142    d_alg = cuda::createStereoBM(256);
143}
144
145StereoSingleGpu::~StereoSingleGpu()
146{
147    cuda::setDevice(deviceId_);
148    d_leftFrame.release();
149    d_rightFrame.release();
150    d_disparity.release();
151    d_alg.release();
152}
153
154void StereoSingleGpu::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
155{
156    cuda::setDevice(deviceId_);
157    d_leftFrame.upload(leftFrame);
158    d_rightFrame.upload(rightFrame);
159    d_alg->compute(d_leftFrame, d_rightFrame, d_disparity);
160    d_disparity.download(disparity);
161}
162
163///////////////////////////////////////////////////////////
164// StereoMultiGpuThread
165// Run Stereo algorithm on two GPUs using different host threads
166
167class StereoMultiGpuThread
168{
169public:
170    StereoMultiGpuThread();
171    ~StereoMultiGpuThread();
172
173    void compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity);
174
175private:
176    GpuMat d_leftFrames[2];
177    GpuMat d_rightFrames[2];
178    GpuMat d_disparities[2];
179    Ptr<cuda::StereoBM> d_algs[2];
180
181    struct StereoLaunchData
182    {
183        int deviceId;
184        Mat leftFrame;
185        Mat rightFrame;
186        Mat disparity;
187        GpuMat* d_leftFrame;
188        GpuMat* d_rightFrame;
189        GpuMat* d_disparity;
190        Ptr<cuda::StereoBM> d_alg;
191    };
192
193    static void launchGpuStereoAlg(void* userData);
194};
195
196StereoMultiGpuThread::StereoMultiGpuThread()
197{
198    cuda::setDevice(0);
199    d_algs[0] = cuda::createStereoBM(256);
200
201    cuda::setDevice(1);
202    d_algs[1] = cuda::createStereoBM(256);
203}
204
205StereoMultiGpuThread::~StereoMultiGpuThread()
206{
207    cuda::setDevice(0);
208    d_leftFrames[0].release();
209    d_rightFrames[0].release();
210    d_disparities[0].release();
211    d_algs[0].release();
212
213    cuda::setDevice(1);
214    d_leftFrames[1].release();
215    d_rightFrames[1].release();
216    d_disparities[1].release();
217    d_algs[1].release();
218}
219
220void StereoMultiGpuThread::compute(const Mat& leftFrame, const Mat& rightFrame, Mat& disparity)
221{
222    disparity.create(leftFrame.size(), CV_8UC1);
223
224    // Split input data onto two parts for each GPUs.
225    // We add small border for each part,
226    // because original algorithm doesn't calculate disparity on image borders.
227    // With such padding we will get output in the middle of final result.
228
229    StereoLaunchData launchDatas[2];
230
231    launchDatas[0].deviceId = 0;
232    launchDatas[0].leftFrame = leftFrame.rowRange(0, leftFrame.rows / 2 + 32);
233    launchDatas[0].rightFrame = rightFrame.rowRange(0, rightFrame.rows / 2 + 32);
234    launchDatas[0].disparity = disparity.rowRange(0, leftFrame.rows / 2);
235    launchDatas[0].d_leftFrame = &d_leftFrames[0];
236    launchDatas[0].d_rightFrame = &d_rightFrames[0];
237    launchDatas[0].d_disparity = &d_disparities[0];
238    launchDatas[0].d_alg = d_algs[0];
239
240    launchDatas[1].deviceId = 1;
241    launchDatas[1].leftFrame = leftFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
242    launchDatas[1].rightFrame = rightFrame.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows);
243    launchDatas[1].disparity = disparity.rowRange(leftFrame.rows / 2, leftFrame.rows);
244    launchDatas[1].d_leftFrame = &d_leftFrames[1];
245    launchDatas[1].d_rightFrame = &d_rightFrames[1];
246    launchDatas[1].d_disparity = &d_disparities[1];
247    launchDatas[1].d_alg = d_algs[1];
248
249    Thread thread0(launchGpuStereoAlg, &launchDatas[0]);
250    Thread thread1(launchGpuStereoAlg, &launchDatas[1]);
251
252    thread0.wait();
253    thread1.wait();
254}
255
256void StereoMultiGpuThread::launchGpuStereoAlg(void* userData)
257{
258    StereoLaunchData* data = static_cast<StereoLaunchData*>(userData);
259
260    cuda::setDevice(data->deviceId);
261    data->d_leftFrame->upload(data->leftFrame);
262    data->d_rightFrame->upload(data->rightFrame);
263    data->d_alg->compute(*data->d_leftFrame, *data->d_rightFrame, *data->d_disparity);
264
265    if (data->deviceId == 0)
266        data->d_disparity->rowRange(0, data->d_disparity->rows - 32).download(data->disparity);
267    else
268        data->d_disparity->rowRange(32, data->d_disparity->rows).download(data->disparity);
269}
270
271///////////////////////////////////////////////////////////
272// StereoMultiGpuStream
273// Run Stereo algorithm on two GPUs from single host thread using async API
274
275class StereoMultiGpuStream
276{
277public:
278    StereoMultiGpuStream();
279    ~StereoMultiGpuStream();
280
281    void compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity);
282
283private:
284    GpuMat d_leftFrames[2];
285    GpuMat d_rightFrames[2];
286    GpuMat d_disparities[2];
287    Ptr<cuda::StereoBM> d_algs[2];
288    Ptr<Stream> streams[2];
289};
290
291StereoMultiGpuStream::StereoMultiGpuStream()
292{
293    cuda::setDevice(0);
294    d_algs[0] = cuda::createStereoBM(256);
295    streams[0] = makePtr<Stream>();
296
297    cuda::setDevice(1);
298    d_algs[1] = cuda::createStereoBM(256);
299    streams[1] = makePtr<Stream>();
300}
301
302StereoMultiGpuStream::~StereoMultiGpuStream()
303{
304    cuda::setDevice(0);
305    d_leftFrames[0].release();
306    d_rightFrames[0].release();
307    d_disparities[0].release();
308    d_algs[0].release();
309    streams[0].release();
310
311    cuda::setDevice(1);
312    d_leftFrames[1].release();
313    d_rightFrames[1].release();
314    d_disparities[1].release();
315    d_algs[1].release();
316    streams[1].release();
317}
318
319void StereoMultiGpuStream::compute(const HostMem& leftFrame, const HostMem& rightFrame, HostMem& disparity)
320{
321    disparity.create(leftFrame.size(), CV_8UC1);
322
323    // Split input data onto two parts for each GPUs.
324    // We add small border for each part,
325    // because original algorithm doesn't calculate disparity on image borders.
326    // With such padding we will get output in the middle of final result.
327
328    Mat leftFrameHdr = leftFrame.createMatHeader();
329    Mat rightFrameHdr = rightFrame.createMatHeader();
330    Mat disparityHdr = disparity.createMatHeader();
331    Mat disparityPart0 = disparityHdr.rowRange(0, leftFrame.rows / 2);
332    Mat disparityPart1 = disparityHdr.rowRange(leftFrame.rows / 2, leftFrame.rows);
333
334    cuda::setDevice(0);
335    d_leftFrames[0].upload(leftFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
336    d_rightFrames[0].upload(rightFrameHdr.rowRange(0, leftFrame.rows / 2 + 32), *streams[0]);
337    d_algs[0]->compute(d_leftFrames[0], d_rightFrames[0], d_disparities[0], *streams[0]);
338    d_disparities[0].rowRange(0, leftFrame.rows / 2).download(disparityPart0, *streams[0]);
339
340    cuda::setDevice(1);
341    d_leftFrames[1].upload(leftFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
342    d_rightFrames[1].upload(rightFrameHdr.rowRange(leftFrame.rows / 2 - 32, leftFrame.rows), *streams[1]);
343    d_algs[1]->compute(d_leftFrames[1], d_rightFrames[1], d_disparities[1], *streams[1]);
344    d_disparities[1].rowRange(32, d_disparities[1].rows).download(disparityPart1, *streams[1]);
345
346    cuda::setDevice(0);
347    streams[0]->waitForCompletion();
348
349    cuda::setDevice(1);
350    streams[1]->waitForCompletion();
351}
352
353///////////////////////////////////////////////////////////
354// main
355
356int main(int argc, char** argv)
357{
358    if (argc != 3)
359    {
360        cerr << "Usage: stereo_multi_gpu <left_video> <right_video>" << endl;
361        return -1;
362    }
363
364    const int numDevices = getCudaEnabledDeviceCount();
365    if (numDevices != 2)
366    {
367        cerr << "Two GPUs are required" << endl;
368        return -1;
369    }
370
371    for (int i = 0; i < numDevices; ++i)
372    {
373        DeviceInfo devInfo(i);
374        if (!devInfo.isCompatible())
375        {
376            cerr << "CUDA module was't built for GPU #" << i << " ("
377                 << devInfo.name() << ", CC " << devInfo.majorVersion()
378                 << devInfo.minorVersion() << endl;
379            return -1;
380        }
381
382        printShortCudaDeviceInfo(i);
383    }
384
385    VideoCapture leftVideo(argv[1]);
386    VideoCapture rightVideo(argv[2]);
387
388    if (!leftVideo.isOpened())
389    {
390         cerr << "Can't open " << argv[1] << " video file" << endl;
391         return -1;
392    }
393
394    if (!rightVideo.isOpened())
395    {
396         cerr << "Can't open " << argv[2] << " video file" << endl;
397         return -1;
398    }
399
400    cout << endl;
401    cout << "This sample demonstrates working on one piece of data using two GPUs." << endl;
402    cout << "It splits input into two parts and processes them separately on different GPUs." << endl;
403    cout << endl;
404
405    Mat leftFrame, rightFrame;
406    HostMem leftGrayFrame, rightGrayFrame;
407
408    StereoSingleGpu gpu0Alg(0);
409    StereoSingleGpu gpu1Alg(1);
410    StereoMultiGpuThread multiThreadAlg;
411    StereoMultiGpuStream multiStreamAlg;
412
413    Mat disparityGpu0;
414    Mat disparityGpu1;
415    Mat disparityMultiThread;
416    HostMem disparityMultiStream;
417
418    Mat disparityGpu0Show;
419    Mat disparityGpu1Show;
420    Mat disparityMultiThreadShow;
421    Mat disparityMultiStreamShow;
422
423    TickMeter tm;
424
425    cout << "-------------------------------------------------------------------" << endl;
426    cout << "| Frame | GPU 0 ms | GPU 1 ms | Multi Thread ms | Multi Stream ms |" << endl;
427    cout << "-------------------------------------------------------------------" << endl;
428
429    for (int i = 0;; ++i)
430    {
431        leftVideo >> leftFrame;
432        rightVideo >> rightFrame;
433
434        if (leftFrame.empty() || rightFrame.empty())
435            break;
436
437        if (leftFrame.size() != rightFrame.size())
438        {
439            cerr << "Frames have different sizes" << endl;
440            return -1;
441        }
442
443        leftGrayFrame.create(leftFrame.size(), CV_8UC1);
444        rightGrayFrame.create(leftFrame.size(), CV_8UC1);
445
446        cvtColor(leftFrame, leftGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
447        cvtColor(rightFrame, rightGrayFrame.createMatHeader(), COLOR_BGR2GRAY);
448
449        tm.reset(); tm.start();
450        gpu0Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
451                        disparityGpu0);
452        tm.stop();
453
454        const double gpu0Time = tm.getTimeMilli();
455
456        tm.reset(); tm.start();
457        gpu1Alg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
458                        disparityGpu1);
459        tm.stop();
460
461        const double gpu1Time = tm.getTimeMilli();
462
463        tm.reset(); tm.start();
464        multiThreadAlg.compute(leftGrayFrame.createMatHeader(), rightGrayFrame.createMatHeader(),
465                               disparityMultiThread);
466        tm.stop();
467
468        const double multiThreadTime = tm.getTimeMilli();
469
470        tm.reset(); tm.start();
471        multiStreamAlg.compute(leftGrayFrame, rightGrayFrame, disparityMultiStream);
472        tm.stop();
473
474        const double multiStreamTime = tm.getTimeMilli();
475
476        cout << "| " << setw(5) << i << " | "
477             << setw(8) << setprecision(1) << fixed << gpu0Time << " | "
478             << setw(8) << setprecision(1) << fixed << gpu1Time << " | "
479             << setw(15) << setprecision(1) << fixed << multiThreadTime << " | "
480             << setw(15) << setprecision(1) << fixed << multiStreamTime << " |" << endl;
481
482        resize(disparityGpu0, disparityGpu0Show, Size(1024, 768), 0, 0, INTER_AREA);
483        resize(disparityGpu1, disparityGpu1Show, Size(1024, 768), 0, 0, INTER_AREA);
484        resize(disparityMultiThread, disparityMultiThreadShow, Size(1024, 768), 0, 0, INTER_AREA);
485        resize(disparityMultiStream.createMatHeader(), disparityMultiStreamShow, Size(1024, 768), 0, 0, INTER_AREA);
486
487        imshow("disparityGpu0", disparityGpu0Show);
488        imshow("disparityGpu1", disparityGpu1Show);
489        imshow("disparityMultiThread", disparityMultiThreadShow);
490        imshow("disparityMultiStream", disparityMultiStreamShow);
491
492        const int key = waitKey(30) & 0xff;
493        if (key == 27)
494            break;
495    }
496
497    cout << "-------------------------------------------------------------------" << endl;
498
499    return 0;
500}
501