// cuda_stream.cpp revision 793ee12c6df9cad3806238d32528c49a3ff9331d
1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// Redistribution and use in source and binary forms, with or without modification,
18// are permitted provided that the following conditions are met:
19//
20//   * Redistribution's of source code must retain the above copyright notice,
21//     this list of conditions and the following disclaimer.
22//
23//   * Redistribution's in binary form must reproduce the above copyright notice,
24//     this list of conditions and the following disclaimer in the documentation
25//     and/or other materials provided with the distribution.
26//
27//   * The name of the copyright holders may not be used to endorse or promote products
28//     derived from this software without specific prior written permission.
29//
30// This software is provided by the copyright holders and contributors "as is" and
31// any express or implied warranties, including, but not limited to, the implied
32// warranties of merchantability and fitness for a particular purpose are disclaimed.
33// In no event shall the Intel Corporation or contributors be liable for any direct,
34// indirect, incidental, special, exemplary, or consequential damages
35// (including, but not limited to, procurement of substitute goods or services;
36// loss of use, data, or profits; or business interruption) however caused
37// and on any theory of liability, whether in contract, strict liability,
38// or tort (including negligence or otherwise) arising in any way out of
39// the use of this software, even if advised of the possibility of such damage.
40//
41//M*/
42
43#include "precomp.hpp"
44
45using namespace cv;
46using namespace cv::cuda;
47
48/////////////////////////////////////////////////////////////
49/// MemoryStack
50
51#ifdef HAVE_CUDA
52
53namespace
54{
55    class MemoryPool;
56
    // A fixed-size slice of pooled device memory managed as a bump-pointer
    // (strict LIFO) allocator. Not internally synchronized; each stack is
    // used by a single StackAllocator at a time.
    class MemoryStack
    {
    public:
        // Returns `size` bytes from the tip of the stack, or 0 when the
        // remaining capacity is insufficient. No alignment is applied here;
        // callers are expected to pass pre-aligned sizes.
        uchar* requestMemory(size_t size);
        // Returns memory in strict LIFO order: `ptr` must be the start of
        // the most recent outstanding allocation.
        void returnMemory(uchar* ptr);

        uchar* datastart; // first byte of this stack's slice
        uchar* dataend;   // one past the last byte of the slice
        uchar* tip;       // next free byte (datastart <= tip <= dataend)

        bool isFree;      // true while no allocator has claimed this stack
        MemoryPool* pool; // owning pool, set when the pool carves out the stacks

    #if !defined(NDEBUG)
        // Debug-only record of outstanding allocation sizes, used by
        // returnMemory to verify the LIFO discipline.
        std::vector<size_t> allocations;
    #endif
    };
74
75    uchar* MemoryStack::requestMemory(size_t size)
76    {
77        const size_t freeMem = dataend - tip;
78
79        if (size > freeMem)
80            return 0;
81
82        uchar* ptr = tip;
83
84        tip += size;
85
86    #if !defined(NDEBUG)
87        allocations.push_back(size);
88    #endif
89
90        return ptr;
91    }
92
    // Rewinds the stack tip to `ptr`, releasing the most recent allocation.
    // Debug builds verify that `ptr` lies inside this stack and that
    // deallocation happens in strict LIFO order.
    void MemoryStack::returnMemory(uchar* ptr)
    {
        CV_DbgAssert( ptr >= datastart && ptr < dataend );

    #if !defined(NDEBUG)
        // The distance from ptr to tip must equal the size of the most
        // recent outstanding allocation.
        const size_t allocSize = tip - ptr;
        CV_Assert( allocSize == allocations.back() );
        allocations.pop_back();
    #endif

        tip = ptr;
    }
105}
106
107#endif
108
109/////////////////////////////////////////////////////////////
110/// MemoryPool
111
112#ifdef HAVE_CUDA
113
114namespace
115{
    // Per-device pool: one cudaMalloc'd block carved into stackCount_
    // equally sized MemoryStacks that are lent out to StackAllocators.
    class MemoryPool
    {
    public:
        MemoryPool();

        // Reconfigures the pool: releases current device memory (if any),
        // then allocates stackCount stacks of stackSize bytes each.
        void initialize(size_t stackSize, int stackCount);
        // Frees the pooled device memory. NOTE(review): does not take mtx_;
        // callers must ensure exclusive access (initialize() holds the lock,
        // the static destructor runs single-threaded).
        void release();

        // Claims a free stack (lazily allocating device memory on first use).
        // Returns 0 when no memory or no free stack is available.
        MemoryStack* getFreeMemStack();
        // Marks a previously claimed stack as free again.
        void returnMemStack(MemoryStack* memStack);

    private:
        void initilizeImpl(); // (sic) one-shot device allocation, called under mtx_

        Mutex mtx_; // guards lazy init and the isFree flags

        bool initialized_;
        size_t stackSize_;  // bytes per stack
        int stackCount_;    // number of stacks carved out of mem_

        uchar* mem_; // single device block of stackSize_ * stackCount_ bytes

        std::vector<MemoryStack> stacks_;
    };
140
141    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
142    {
143        // default : 10 Mb, 5 stacks
144        stackSize_ = 10 * 1024 * 1024;
145        stackCount_ = 5;
146    }
147
    // Applies a new pool layout under the lock: any existing device memory
    // is released first, then reallocated with the new geometry.
    void MemoryPool::initialize(size_t stackSize, int stackCount)
    {
        AutoLock lock(mtx_);

        release();

        stackSize_ = stackSize;
        stackCount_ = stackCount;

        initilizeImpl();
    }
159
160    void MemoryPool::initilizeImpl()
161    {
162        const size_t totalSize = stackSize_ * stackCount_;
163
164        if (totalSize > 0)
165        {
166            cudaError_t err = cudaMalloc(&mem_, totalSize);
167            if (err != cudaSuccess)
168                return;
169
170            stacks_.resize(stackCount_);
171
172            uchar* ptr = mem_;
173
174            for (int i = 0; i < stackCount_; ++i)
175            {
176                stacks_[i].datastart = ptr;
177                stacks_[i].dataend = ptr + stackSize_;
178                stacks_[i].tip = ptr;
179                stacks_[i].isFree = true;
180                stacks_[i].pool = this;
181
182                ptr += stackSize_;
183            }
184
185            initialized_ = true;
186        }
187    }
188
189    void MemoryPool::release()
190    {
191        if (mem_)
192        {
193#if !defined(NDEBUG)
194            for (int i = 0; i < stackCount_; ++i)
195            {
196                CV_DbgAssert( stacks_[i].isFree );
197                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
198            }
199#endif
200
201            cudaFree(mem_);
202
203            mem_ = 0;
204            initialized_ = false;
205        }
206    }
207
208    MemoryStack* MemoryPool::getFreeMemStack()
209    {
210        AutoLock lock(mtx_);
211
212        if (!initialized_)
213            initilizeImpl();
214
215        if (!mem_)
216            return 0;
217
218        for (int i = 0; i < stackCount_; ++i)
219        {
220            if (stacks_[i].isFree)
221            {
222                stacks_[i].isFree = false;
223                return &stacks_[i];
224            }
225        }
226
227        return 0;
228    }
229
230    void MemoryPool::returnMemStack(MemoryStack* memStack)
231    {
232        AutoLock lock(mtx_);
233
234        CV_DbgAssert( !memStack->isFree );
235
236#if !defined(NDEBUG)
237        bool found = false;
238        for (int i = 0; i < stackCount_; ++i)
239        {
240            if (memStack == &stacks_[i])
241            {
242                found = true;
243                break;
244            }
245        }
246        CV_DbgAssert( found );
247#endif
248
249        CV_DbgAssert( memStack->tip == memStack->datastart );
250
251        memStack->isFree = true;
252    }
253}
254
255#endif
256
257////////////////////////////////////////////////////////////////
258/// Stream::Impl
259
260#ifndef HAVE_CUDA
261
// CUDA-less stub: any attempt to construct a Stream throws immediately.
class cv::cuda::Stream::Impl
{
public:
    Impl(void* ptr = 0)
    {
        (void) ptr;
        throw_no_cuda();
    }
};
271
272#else
273
274namespace
275{
276    class StackAllocator;
277}
278
// Backing state for cv::cuda::Stream: the raw CUDA stream handle plus the
// per-stream stack allocator used by BufferPool.
class cv::cuda::Stream::Impl
{
public:
    // NOTE(review): the destructor destroys any non-zero handle, including
    // one supplied to Impl(cudaStream_t). In this file that constructor is
    // only called with 0 (the default stream), which the destructor skips.
    cudaStream_t stream;
    Ptr<StackAllocator> stackAllocator_;

    Impl();                    // creates an owned CUDA stream
    Impl(cudaStream_t stream); // wraps an existing handle (currently only 0)

    ~Impl();
};
290
// Creates a new asynchronous CUDA stream and its stack allocator.
cv::cuda::Stream::Impl::Impl() : stream(0)
{
    cudaSafeCall( cudaStreamCreate(&stream) );

    stackAllocator_ = makePtr<StackAllocator>(stream);
}
297
// Wraps an externally supplied stream handle. In this file it is only used
// with 0 (the default stream) by DefaultDeviceInitializer::getNullStream.
cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
{
    stackAllocator_ = makePtr<StackAllocator>(stream);
}
302
cv::cuda::Stream::Impl::~Impl()
{
    // Release the allocator first: its destructor synchronizes on the
    // stream and returns its memory stack to the pool, so the stream must
    // still be alive at that point.
    stackAllocator_.release();

    // 0 denotes the default stream, which must not be destroyed.
    if (stream)
        cudaStreamDestroy(stream);
}
310
311#endif
312
313/////////////////////////////////////////////////////////////
314/// DefaultDeviceInitializer
315
316#ifdef HAVE_CUDA
317
318namespace cv { namespace cuda
319{
    // Process-wide registry of per-device resources: one wrapper of the
    // default (null) stream and one MemoryPool per CUDA device, both
    // created lazily under their own mutex.
    class DefaultDeviceInitializer
    {
    public:
        DefaultDeviceInitializer();
        ~DefaultDeviceInitializer();

        // Returns the lazily created wrapper of the default stream for
        // `deviceId` (asserts the id is in range in debug builds).
        Stream& getNullStream(int deviceId);
        // Returns the lazily created memory pool for `deviceId`.
        MemoryPool* getMemoryPool(int deviceId);

    private:
        void initStreams(); // NOTE(review): declared but not defined in this file
        void initPools();   // NOTE(review): declared but not defined in this file

        std::vector<Ptr<Stream> > streams_; // one slot per CUDA device
        Mutex streams_mtx_;                 // guards streams_

        std::vector<MemoryPool> pools_;     // one pool per CUDA device
        Mutex pools_mtx_;                   // guards pools_
    };
339
    // Nothing to do eagerly: streams and pools are created on first use.
    DefaultDeviceInitializer::DefaultDeviceInitializer()
    {
    }
343
344    DefaultDeviceInitializer::~DefaultDeviceInitializer()
345    {
346        streams_.clear();
347
348        for (size_t i = 0; i < pools_.size(); ++i)
349        {
350            cudaSetDevice(static_cast<int>(i));
351            pools_[i].release();
352        }
353
354        pools_.clear();
355    }
356
357    Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
358    {
359        AutoLock lock(streams_mtx_);
360
361        if (streams_.empty())
362        {
363            int deviceCount = getCudaEnabledDeviceCount();
364
365            if (deviceCount > 0)
366                streams_.resize(deviceCount);
367        }
368
369        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );
370
371        if (streams_[deviceId].empty())
372        {
373            cudaStream_t stream = NULL;
374            Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
375            streams_[deviceId] = Ptr<Stream>(new Stream(impl));
376        }
377
378        return *streams_[deviceId];
379    }
380
381    MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
382    {
383        AutoLock lock(pools_mtx_);
384
385        if (pools_.empty())
386        {
387            int deviceCount = getCudaEnabledDeviceCount();
388
389            if (deviceCount > 0)
390                pools_.resize(deviceCount);
391        }
392
393        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );
394
395        return &pools_[deviceId];
396    }
397
    // Process-wide singleton holding per-device null streams and pools.
    DefaultDeviceInitializer initializer;
399}}
400
401#endif
402
403/////////////////////////////////////////////////////////////
404/// Stream
405
// Creates a new asynchronous CUDA stream (throws when built without CUDA).
cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>();
#endif
}
414
// Non-blocking completion query: true when all queued work has finished.
bool cv::cuda::Stream::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaStreamQuery(impl_->stream);

    // cudaErrorNotReady only means work is still pending — not an error.
    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    // Any other status is a genuine failure; cudaSafeCall raises it.
    cudaSafeCall(err);
    return false;
#endif
}
430
// Blocks the host until all work queued on this stream has completed.
void cv::cuda::Stream::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
#endif
}
439
// Makes all future work on this stream wait until `event` has fired
// (device-side dependency; does not block the host).
void cv::cuda::Stream::waitEvent(const Event& event)
{
#ifndef HAVE_CUDA
    (void) event;
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
#endif
}
449
450#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)
451
namespace
{
    // Heap-allocated payload threaded through cudaStreamAddCallback.
    // Ownership transfers to cudaStreamCallback, which deletes it after
    // invoking the user callback.
    struct CallbackData
    {
        Stream::StreamCallback callback;
        void* userData;

        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
    };

    // Trampoline invoked by the CUDA runtime when the stream reaches the
    // callback; forwards the status to the user and frees the payload.
    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
    {
        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
        data->callback(static_cast<int>(status), data->userData);
        delete data;
    }
}
469
470#endif
471
472void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
473{
474#ifndef HAVE_CUDA
475    (void) callback;
476    (void) userData;
477    throw_no_cuda();
478#else
479    #if CUDART_VERSION < 5000
480        (void) callback;
481        (void) userData;
482        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
483    #else
484        CallbackData* data = new CallbackData(callback, userData);
485
486        cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
487    #endif
488#endif
489}
490
// Returns the per-device wrapper of the default (0) CUDA stream.
Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    static Stream stream; // unreachable; satisfies the return type
    return stream;
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}
502
// Safe-bool idiom: evaluates to "true" for a real (non-default) stream.
cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}
511
512#ifdef HAVE_CUDA
513
// Exposes the raw cudaStream_t handle to CUDA-aware modules.
cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}
518
519#endif
520
521/////////////////////////////////////////////////////////////
522/// StackAllocator
523
524#ifdef HAVE_CUDA
525
526namespace
527{
    // Global switch set by setBufferPoolUsage(); consulted when a
    // StackAllocator is constructed (i.e. when a Stream is created).
    bool enableMemoryPool = true;
529
    // GpuMat allocator backed by one MemoryStack claimed from the current
    // device's MemoryPool. When no stack is available, allocate() returns
    // false (the GpuMat framework presumably falls back to its default
    // allocator — behavior defined outside this file).
    class StackAllocator : public GpuMat::Allocator
    {
    public:
        explicit StackAllocator(cudaStream_t stream);
        ~StackAllocator();

        bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
        void free(GpuMat* mat);

    private:
        // non-copyable: the claimed stack must have exactly one owner
        StackAllocator(const StackAllocator&);
        StackAllocator& operator =(const StackAllocator&);

        cudaStream_t stream_;   // stream whose work must finish before the stack is returned
        MemoryStack* memStack_; // claimed stack, or 0 when pooling is unavailable
        size_t alignment_;      // device texture alignment used for row pitch
    };
547
548    StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
549    {
550        if (enableMemoryPool)
551        {
552            const int deviceId = getDevice();
553            memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
554            DeviceInfo devInfo(deviceId);
555            alignment_ = devInfo.textureAlignment();
556        }
557    }
558
    StackAllocator::~StackAllocator()
    {
        // Wait for all work queued on the stream: the stack's memory may be
        // handed to another stream as soon as it is returned to the pool.
        cudaStreamSynchronize(stream_);

        if (memStack_ != 0)
            memStack_->pool->returnMemStack(memStack_);
    }
566
567    size_t alignUp(size_t what, size_t alignment)
568    {
569        size_t alignMask = alignment-1;
570        size_t inverseAlignMask = ~alignMask;
571        size_t res = (what + alignMask) & inverseAlignMask;
572        return res;
573    }
574
    // Carves a (rows x cols x elemSize) buffer out of the claimed stack.
    // Returns false when pooling is unavailable or the stack is exhausted,
    // letting the caller fall back to another allocator.
    bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
    {
        if (memStack_ == 0)
            return false;

        size_t pitch, memSize;

        if (rows > 1 && cols > 1)
        {
            // 2D case: pad each row up to the texture alignment.
            pitch = alignUp(cols * elemSize, alignment_);
            memSize = pitch * rows;
        }
        else
        {
            // Single row or single column must be continuous
            pitch = elemSize * cols;
            memSize = alignUp(elemSize * cols * rows, 64);
        }

        uchar* ptr = memStack_->requestMemory(memSize);

        if (ptr == 0)
            return false;

        mat->data = ptr;
        mat->step = pitch;
        // Refcount storage only; the caller is responsible for setting its
        // value (behavior defined outside this file).
        mat->refcount = (int*) fastMalloc(sizeof(int));

        return true;
    }
605
    // Returns the matrix's memory to the stack (LIFO order is verified by
    // returnMemory in debug builds) and frees the refcount storage.
    void StackAllocator::free(GpuMat* mat)
    {
        if (memStack_ == 0)
            return;

        // datastart is the pointer originally handed out by requestMemory.
        memStack_->returnMemory(mat->datastart);
        fastFree(mat->refcount);
    }
614}
615
616#endif
617
618/////////////////////////////////////////////////////////////
619/// BufferPool
620
// Enables/disables the stream buffer pool. Affects only StackAllocators
// constructed after this call (i.e. Streams created afterwards).
void cv::cuda::setBufferPoolUsage(bool on)
{
#ifndef HAVE_CUDA
    (void)on;
    throw_no_cuda();
#else
    enableMemoryPool = on;
#endif
}
630
631void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
632{
633#ifndef HAVE_CUDA
634    (void)deviceId;
635    (void)stackSize;
636    (void)stackCount;
637    throw_no_cuda();
638#else
639    const int currentDevice = getDevice();
640
641    if (deviceId >= 0)
642    {
643        setDevice(deviceId);
644        initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
645    }
646    else
647    {
648        const int deviceCount = getCudaEnabledDeviceCount();
649
650        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
651        {
652            setDevice(deviceId);
653            initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
654        }
655    }
656
657    setDevice(currentDevice);
658#endif
659}
660
661#ifdef HAVE_CUDA
662
// A BufferPool borrows (does not own) the stream's stack allocator.
cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
{
}
666
// Allocates a rows x cols matrix of the given type through this pool's
// stack allocator.
GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
}
673
674#endif
675
676////////////////////////////////////////////////////////////////
677// Event
678
679#ifndef HAVE_CUDA
680
// CUDA-less stub: constructing an Event without CUDA support throws.
class cv::cuda::Event::Impl
{
public:
    Impl(unsigned int)
    {
        throw_no_cuda();
    }
};
689
690#else
691
// Backing state for cv::cuda::Event: the owned cudaEvent_t handle.
class cv::cuda::Event::Impl
{
public:
    cudaEvent_t event; // owned; destroyed in ~Impl when non-zero

    Impl(unsigned int flags);
    ~Impl();
};
700
// Creates a CUDA event with the given cudaEventCreateWithFlags flags.
cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0)
{
    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
}
705
cv::cuda::Event::Impl::~Impl()
{
    // Only destroy a real handle; creation may have thrown before assigning.
    if (event)
        cudaEventDestroy(event);
}
711
// Exposes the raw cudaEvent_t handle to CUDA-aware modules.
cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
{
    return event.impl_->event;
}
716
717#endif
718
// Creates a CUDA event with the given creation flags.
cv::cuda::Event::Event(CreateFlags flags)
{
#ifndef HAVE_CUDA
    (void) flags;
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}
728
// Records this event into the given stream.
void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    (void) stream;
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}
738
// Non-blocking query: true once all work captured by the event has finished.
bool cv::cuda::Event::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaEventQuery(impl_->event);

    // cudaErrorNotReady only means the event has not fired yet — not an error.
    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}
754
// Blocks the host until the event has fired.
void cv::cuda::Event::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventSynchronize(impl_->event) );
#endif
}
763
// Milliseconds elapsed between two recorded events (both must have fired).
float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
{
#ifndef HAVE_CUDA
    (void) start;
    (void) end;
    throw_no_cuda();
    return 0.0f;
#else
    float ms;
    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
    return ms;
#endif
}
777