// cuda_stream.cpp revision 793ee12c6df9cad3806238d32528c49a3ff9331d
1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14// Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// Redistribution and use in source and binary forms, with or without modification, 18// are permitted provided that the following conditions are met: 19// 20// * Redistribution's of source code must retain the above copyright notice, 21// this list of conditions and the following disclaimer. 22// 23// * Redistribution's in binary form must reproduce the above copyright notice, 24// this list of conditions and the following disclaimer in the documentation 25// and/or other materials provided with the distribution. 26// 27// * The name of the copyright holders may not be used to endorse or promote products 28// derived from this software without specific prior written permission. 29// 30// This software is provided by the copyright holders and contributors "as is" and 31// any express or implied warranties, including, but not limited to, the implied 32// warranties of merchantability and fitness for a particular purpose are disclaimed. 
33// In no event shall the Intel Corporation or contributors be liable for any direct, 34// indirect, incidental, special, exemplary, or consequential damages 35// (including, but not limited to, procurement of substitute goods or services; 36// loss of use, data, or profits; or business interruption) however caused 37// and on any theory of liability, whether in contract, strict liability, 38// or tort (including negligence or otherwise) arising in any way out of 39// the use of this software, even if advised of the possibility of such damage. 40// 41//M*/ 42 43#include "precomp.hpp" 44 45using namespace cv; 46using namespace cv::cuda; 47 48///////////////////////////////////////////////////////////// 49/// MemoryStack 50 51#ifdef HAVE_CUDA 52 53namespace 54{ 55 class MemoryPool; 56 57 class MemoryStack 58 { 59 public: 60 uchar* requestMemory(size_t size); 61 void returnMemory(uchar* ptr); 62 63 uchar* datastart; 64 uchar* dataend; 65 uchar* tip; 66 67 bool isFree; 68 MemoryPool* pool; 69 70 #if !defined(NDEBUG) 71 std::vector<size_t> allocations; 72 #endif 73 }; 74 75 uchar* MemoryStack::requestMemory(size_t size) 76 { 77 const size_t freeMem = dataend - tip; 78 79 if (size > freeMem) 80 return 0; 81 82 uchar* ptr = tip; 83 84 tip += size; 85 86 #if !defined(NDEBUG) 87 allocations.push_back(size); 88 #endif 89 90 return ptr; 91 } 92 93 void MemoryStack::returnMemory(uchar* ptr) 94 { 95 CV_DbgAssert( ptr >= datastart && ptr < dataend ); 96 97 #if !defined(NDEBUG) 98 const size_t allocSize = tip - ptr; 99 CV_Assert( allocSize == allocations.back() ); 100 allocations.pop_back(); 101 #endif 102 103 tip = ptr; 104 } 105} 106 107#endif 108 109///////////////////////////////////////////////////////////// 110/// MemoryPool 111 112#ifdef HAVE_CUDA 113 114namespace 115{ 116 class MemoryPool 117 { 118 public: 119 MemoryPool(); 120 121 void initialize(size_t stackSize, int stackCount); 122 void release(); 123 124 MemoryStack* getFreeMemStack(); 125 void 
returnMemStack(MemoryStack* memStack); 126 127 private: 128 void initilizeImpl(); 129 130 Mutex mtx_; 131 132 bool initialized_; 133 size_t stackSize_; 134 int stackCount_; 135 136 uchar* mem_; 137 138 std::vector<MemoryStack> stacks_; 139 }; 140 141 MemoryPool::MemoryPool() : initialized_(false), mem_(0) 142 { 143 // default : 10 Mb, 5 stacks 144 stackSize_ = 10 * 1024 * 1024; 145 stackCount_ = 5; 146 } 147 148 void MemoryPool::initialize(size_t stackSize, int stackCount) 149 { 150 AutoLock lock(mtx_); 151 152 release(); 153 154 stackSize_ = stackSize; 155 stackCount_ = stackCount; 156 157 initilizeImpl(); 158 } 159 160 void MemoryPool::initilizeImpl() 161 { 162 const size_t totalSize = stackSize_ * stackCount_; 163 164 if (totalSize > 0) 165 { 166 cudaError_t err = cudaMalloc(&mem_, totalSize); 167 if (err != cudaSuccess) 168 return; 169 170 stacks_.resize(stackCount_); 171 172 uchar* ptr = mem_; 173 174 for (int i = 0; i < stackCount_; ++i) 175 { 176 stacks_[i].datastart = ptr; 177 stacks_[i].dataend = ptr + stackSize_; 178 stacks_[i].tip = ptr; 179 stacks_[i].isFree = true; 180 stacks_[i].pool = this; 181 182 ptr += stackSize_; 183 } 184 185 initialized_ = true; 186 } 187 } 188 189 void MemoryPool::release() 190 { 191 if (mem_) 192 { 193#if !defined(NDEBUG) 194 for (int i = 0; i < stackCount_; ++i) 195 { 196 CV_DbgAssert( stacks_[i].isFree ); 197 CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart ); 198 } 199#endif 200 201 cudaFree(mem_); 202 203 mem_ = 0; 204 initialized_ = false; 205 } 206 } 207 208 MemoryStack* MemoryPool::getFreeMemStack() 209 { 210 AutoLock lock(mtx_); 211 212 if (!initialized_) 213 initilizeImpl(); 214 215 if (!mem_) 216 return 0; 217 218 for (int i = 0; i < stackCount_; ++i) 219 { 220 if (stacks_[i].isFree) 221 { 222 stacks_[i].isFree = false; 223 return &stacks_[i]; 224 } 225 } 226 227 return 0; 228 } 229 230 void MemoryPool::returnMemStack(MemoryStack* memStack) 231 { 232 AutoLock lock(mtx_); 233 234 CV_DbgAssert( !memStack->isFree 
); 235 236#if !defined(NDEBUG) 237 bool found = false; 238 for (int i = 0; i < stackCount_; ++i) 239 { 240 if (memStack == &stacks_[i]) 241 { 242 found = true; 243 break; 244 } 245 } 246 CV_DbgAssert( found ); 247#endif 248 249 CV_DbgAssert( memStack->tip == memStack->datastart ); 250 251 memStack->isFree = true; 252 } 253} 254 255#endif 256 257//////////////////////////////////////////////////////////////// 258/// Stream::Impl 259 260#ifndef HAVE_CUDA 261 262class cv::cuda::Stream::Impl 263{ 264public: 265 Impl(void* ptr = 0) 266 { 267 (void) ptr; 268 throw_no_cuda(); 269 } 270}; 271 272#else 273 274namespace 275{ 276 class StackAllocator; 277} 278 279class cv::cuda::Stream::Impl 280{ 281public: 282 cudaStream_t stream; 283 Ptr<StackAllocator> stackAllocator_; 284 285 Impl(); 286 Impl(cudaStream_t stream); 287 288 ~Impl(); 289}; 290 291cv::cuda::Stream::Impl::Impl() : stream(0) 292{ 293 cudaSafeCall( cudaStreamCreate(&stream) ); 294 295 stackAllocator_ = makePtr<StackAllocator>(stream); 296} 297 298cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_) 299{ 300 stackAllocator_ = makePtr<StackAllocator>(stream); 301} 302 303cv::cuda::Stream::Impl::~Impl() 304{ 305 stackAllocator_.release(); 306 307 if (stream) 308 cudaStreamDestroy(stream); 309} 310 311#endif 312 313///////////////////////////////////////////////////////////// 314/// DefaultDeviceInitializer 315 316#ifdef HAVE_CUDA 317 318namespace cv { namespace cuda 319{ 320 class DefaultDeviceInitializer 321 { 322 public: 323 DefaultDeviceInitializer(); 324 ~DefaultDeviceInitializer(); 325 326 Stream& getNullStream(int deviceId); 327 MemoryPool* getMemoryPool(int deviceId); 328 329 private: 330 void initStreams(); 331 void initPools(); 332 333 std::vector<Ptr<Stream> > streams_; 334 Mutex streams_mtx_; 335 336 std::vector<MemoryPool> pools_; 337 Mutex pools_mtx_; 338 }; 339 340 DefaultDeviceInitializer::DefaultDeviceInitializer() 341 { 342 } 343 344 
DefaultDeviceInitializer::~DefaultDeviceInitializer() 345 { 346 streams_.clear(); 347 348 for (size_t i = 0; i < pools_.size(); ++i) 349 { 350 cudaSetDevice(static_cast<int>(i)); 351 pools_[i].release(); 352 } 353 354 pools_.clear(); 355 } 356 357 Stream& DefaultDeviceInitializer::getNullStream(int deviceId) 358 { 359 AutoLock lock(streams_mtx_); 360 361 if (streams_.empty()) 362 { 363 int deviceCount = getCudaEnabledDeviceCount(); 364 365 if (deviceCount > 0) 366 streams_.resize(deviceCount); 367 } 368 369 CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) ); 370 371 if (streams_[deviceId].empty()) 372 { 373 cudaStream_t stream = NULL; 374 Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream); 375 streams_[deviceId] = Ptr<Stream>(new Stream(impl)); 376 } 377 378 return *streams_[deviceId]; 379 } 380 381 MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId) 382 { 383 AutoLock lock(pools_mtx_); 384 385 if (pools_.empty()) 386 { 387 int deviceCount = getCudaEnabledDeviceCount(); 388 389 if (deviceCount > 0) 390 pools_.resize(deviceCount); 391 } 392 393 CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) ); 394 395 return &pools_[deviceId]; 396 } 397 398 DefaultDeviceInitializer initializer; 399}} 400 401#endif 402 403///////////////////////////////////////////////////////////// 404/// Stream 405 406cv::cuda::Stream::Stream() 407{ 408#ifndef HAVE_CUDA 409 throw_no_cuda(); 410#else 411 impl_ = makePtr<Impl>(); 412#endif 413} 414 415bool cv::cuda::Stream::queryIfComplete() const 416{ 417#ifndef HAVE_CUDA 418 throw_no_cuda(); 419 return false; 420#else 421 cudaError_t err = cudaStreamQuery(impl_->stream); 422 423 if (err == cudaErrorNotReady || err == cudaSuccess) 424 return err == cudaSuccess; 425 426 cudaSafeCall(err); 427 return false; 428#endif 429} 430 431void cv::cuda::Stream::waitForCompletion() 432{ 433#ifndef HAVE_CUDA 434 throw_no_cuda(); 435#else 436 cudaSafeCall( cudaStreamSynchronize(impl_->stream) 
); 437#endif 438} 439 440void cv::cuda::Stream::waitEvent(const Event& event) 441{ 442#ifndef HAVE_CUDA 443 (void) event; 444 throw_no_cuda(); 445#else 446 cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) ); 447#endif 448} 449 450#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000) 451 452namespace 453{ 454 struct CallbackData 455 { 456 Stream::StreamCallback callback; 457 void* userData; 458 459 CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {} 460 }; 461 462 void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData) 463 { 464 CallbackData* data = reinterpret_cast<CallbackData*>(userData); 465 data->callback(static_cast<int>(status), data->userData); 466 delete data; 467 } 468} 469 470#endif 471 472void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData) 473{ 474#ifndef HAVE_CUDA 475 (void) callback; 476 (void) userData; 477 throw_no_cuda(); 478#else 479 #if CUDART_VERSION < 5000 480 (void) callback; 481 (void) userData; 482 CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0"); 483 #else 484 CallbackData* data = new CallbackData(callback, userData); 485 486 cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) ); 487 #endif 488#endif 489} 490 491Stream& cv::cuda::Stream::Null() 492{ 493#ifndef HAVE_CUDA 494 throw_no_cuda(); 495 static Stream stream; 496 return stream; 497#else 498 const int deviceId = getDevice(); 499 return initializer.getNullStream(deviceId); 500#endif 501} 502 503cv::cuda::Stream::operator bool_type() const 504{ 505#ifndef HAVE_CUDA 506 return 0; 507#else 508 return (impl_->stream != 0) ? 
&Stream::this_type_does_not_support_comparisons : 0; 509#endif 510} 511 512#ifdef HAVE_CUDA 513 514cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream) 515{ 516 return stream.impl_->stream; 517} 518 519#endif 520 521///////////////////////////////////////////////////////////// 522/// StackAllocator 523 524#ifdef HAVE_CUDA 525 526namespace 527{ 528 bool enableMemoryPool = true; 529 530 class StackAllocator : public GpuMat::Allocator 531 { 532 public: 533 explicit StackAllocator(cudaStream_t stream); 534 ~StackAllocator(); 535 536 bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize); 537 void free(GpuMat* mat); 538 539 private: 540 StackAllocator(const StackAllocator&); 541 StackAllocator& operator =(const StackAllocator&); 542 543 cudaStream_t stream_; 544 MemoryStack* memStack_; 545 size_t alignment_; 546 }; 547 548 StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0) 549 { 550 if (enableMemoryPool) 551 { 552 const int deviceId = getDevice(); 553 memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack(); 554 DeviceInfo devInfo(deviceId); 555 alignment_ = devInfo.textureAlignment(); 556 } 557 } 558 559 StackAllocator::~StackAllocator() 560 { 561 cudaStreamSynchronize(stream_); 562 563 if (memStack_ != 0) 564 memStack_->pool->returnMemStack(memStack_); 565 } 566 567 size_t alignUp(size_t what, size_t alignment) 568 { 569 size_t alignMask = alignment-1; 570 size_t inverseAlignMask = ~alignMask; 571 size_t res = (what + alignMask) & inverseAlignMask; 572 return res; 573 } 574 575 bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize) 576 { 577 if (memStack_ == 0) 578 return false; 579 580 size_t pitch, memSize; 581 582 if (rows > 1 && cols > 1) 583 { 584 pitch = alignUp(cols * elemSize, alignment_); 585 memSize = pitch * rows; 586 } 587 else 588 { 589 // Single row or single column must be continuous 590 pitch = elemSize * cols; 591 memSize = alignUp(elemSize * cols * rows, 
64); 592 } 593 594 uchar* ptr = memStack_->requestMemory(memSize); 595 596 if (ptr == 0) 597 return false; 598 599 mat->data = ptr; 600 mat->step = pitch; 601 mat->refcount = (int*) fastMalloc(sizeof(int)); 602 603 return true; 604 } 605 606 void StackAllocator::free(GpuMat* mat) 607 { 608 if (memStack_ == 0) 609 return; 610 611 memStack_->returnMemory(mat->datastart); 612 fastFree(mat->refcount); 613 } 614} 615 616#endif 617 618///////////////////////////////////////////////////////////// 619/// BufferPool 620 621void cv::cuda::setBufferPoolUsage(bool on) 622{ 623#ifndef HAVE_CUDA 624 (void)on; 625 throw_no_cuda(); 626#else 627 enableMemoryPool = on; 628#endif 629} 630 631void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount) 632{ 633#ifndef HAVE_CUDA 634 (void)deviceId; 635 (void)stackSize; 636 (void)stackCount; 637 throw_no_cuda(); 638#else 639 const int currentDevice = getDevice(); 640 641 if (deviceId >= 0) 642 { 643 setDevice(deviceId); 644 initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount); 645 } 646 else 647 { 648 const int deviceCount = getCudaEnabledDeviceCount(); 649 650 for (deviceId = 0; deviceId < deviceCount; ++deviceId) 651 { 652 setDevice(deviceId); 653 initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount); 654 } 655 } 656 657 setDevice(currentDevice); 658#endif 659} 660 661#ifdef HAVE_CUDA 662 663cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get()) 664{ 665} 666 667GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type) 668{ 669 GpuMat buf(allocator_); 670 buf.create(rows, cols, type); 671 return buf; 672} 673 674#endif 675 676//////////////////////////////////////////////////////////////// 677// Event 678 679#ifndef HAVE_CUDA 680 681class cv::cuda::Event::Impl 682{ 683public: 684 Impl(unsigned int) 685 { 686 throw_no_cuda(); 687 } 688}; 689 690#else 691 692class cv::cuda::Event::Impl 693{ 694public: 695 cudaEvent_t event; 
696 697 Impl(unsigned int flags); 698 ~Impl(); 699}; 700 701cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0) 702{ 703 cudaSafeCall( cudaEventCreateWithFlags(&event, flags) ); 704} 705 706cv::cuda::Event::Impl::~Impl() 707{ 708 if (event) 709 cudaEventDestroy(event); 710} 711 712cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event) 713{ 714 return event.impl_->event; 715} 716 717#endif 718 719cv::cuda::Event::Event(CreateFlags flags) 720{ 721#ifndef HAVE_CUDA 722 (void) flags; 723 throw_no_cuda(); 724#else 725 impl_ = makePtr<Impl>(flags); 726#endif 727} 728 729void cv::cuda::Event::record(Stream& stream) 730{ 731#ifndef HAVE_CUDA 732 (void) stream; 733 throw_no_cuda(); 734#else 735 cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) ); 736#endif 737} 738 739bool cv::cuda::Event::queryIfComplete() const 740{ 741#ifndef HAVE_CUDA 742 throw_no_cuda(); 743 return false; 744#else 745 cudaError_t err = cudaEventQuery(impl_->event); 746 747 if (err == cudaErrorNotReady || err == cudaSuccess) 748 return err == cudaSuccess; 749 750 cudaSafeCall(err); 751 return false; 752#endif 753} 754 755void cv::cuda::Event::waitForCompletion() 756{ 757#ifndef HAVE_CUDA 758 throw_no_cuda(); 759#else 760 cudaSafeCall( cudaEventSynchronize(impl_->event) ); 761#endif 762} 763 764float cv::cuda::Event::elapsedTime(const Event& start, const Event& end) 765{ 766#ifndef HAVE_CUDA 767 (void) start; 768 (void) end; 769 throw_no_cuda(); 770 return 0.0f; 771#else 772 float ms; 773 cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) ); 774 return ms; 775#endif 776} 777