/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

using namespace cv;
using namespace cv::cuda;

/////////////////////////////////////////////////////////////
/// MemoryStack

#ifdef HAVE_CUDA

namespace
{
    class MemoryPool;

    // A bump-pointer (stack) allocator over a fixed block of device memory.
    // Allocations advance `tip`; deallocations must arrive in reverse (LIFO) order.
    class MemoryStack
    {
    public:
        uchar* requestMemory(size_t size);
        void returnMemory(uchar* ptr);

        uchar* datastart;
        uchar* dataend;
        uchar* tip;

        bool isFree;
        MemoryPool* pool;

#if !defined(NDEBUG)
        std::vector<size_t> allocations;
#endif
    };

    uchar* MemoryStack::requestMemory(size_t size)
    {
        const size_t freeMem = dataend - tip;

        if (size > freeMem)
            return 0;

        uchar* ptr = tip;

        tip += size;

#if !defined(NDEBUG)
        allocations.push_back(size);
#endif

        return ptr;
    }

    void MemoryStack::returnMemory(uchar* ptr)
    {
        CV_DbgAssert( ptr >= datastart && ptr < dataend );

#if !defined(NDEBUG)
        const size_t allocSize = tip - ptr;
        CV_Assert( allocSize == allocations.back() );
        allocations.pop_back();
#endif

        tip = ptr;
    }
}

#endif
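
// Illustrative sketch (not part of this file): how a MemoryStack behaves once its
// datastart/dataend/tip fields point into a valid device block. Releases must be
// strictly LIFO, which is exactly what the NDEBUG bookkeeping above asserts.
//
//     MemoryStack stack;                   // assume fields were set up by MemoryPool
//     uchar* a = stack.requestMemory(256); // tip advances by 256
//     uchar* b = stack.requestMemory(512); // tip advances by another 512
//     stack.returnMemory(b);               // b must come back before a
//     stack.returnMemory(a);               // tip is back at datastart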

/////////////////////////////////////////////////////////////
/// MemoryPool

#ifdef HAVE_CUDA

namespace
{
    // Owns one large cudaMalloc'ed block, carved into `stackCount_` equally sized
    // MemoryStack instances that stream allocators check out and return.
    class MemoryPool
    {
    public:
        MemoryPool();

        void initialize(size_t stackSize, int stackCount);
        void release();

        MemoryStack* getFreeMemStack();
        void returnMemStack(MemoryStack* memStack);

    private:
        void initializeImpl();

        Mutex mtx_;

        bool initialized_;
        size_t stackSize_;
        int stackCount_;

        uchar* mem_;

        std::vector<MemoryStack> stacks_;
    };

    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
    {
        // default: 10 MB per stack, 5 stacks
        stackSize_ = 10 * 1024 * 1024;
        stackCount_ = 5;
    }

    void MemoryPool::initialize(size_t stackSize, int stackCount)
    {
        AutoLock lock(mtx_);

        release();

        stackSize_ = stackSize;
        stackCount_ = stackCount;

        initializeImpl();
    }

    void MemoryPool::initializeImpl()
    {
        const size_t totalSize = stackSize_ * stackCount_;

        if (totalSize > 0)
        {
            cudaError_t err = cudaMalloc(&mem_, totalSize);
            if (err != cudaSuccess)
                return;

            stacks_.resize(stackCount_);

            uchar* ptr = mem_;

            for (int i = 0; i < stackCount_; ++i)
            {
                stacks_[i].datastart = ptr;
                stacks_[i].dataend = ptr + stackSize_;
                stacks_[i].tip = ptr;
                stacks_[i].isFree = true;
                stacks_[i].pool = this;

                ptr += stackSize_;
            }

            initialized_ = true;
        }
    }

    void MemoryPool::release()
    {
        if (mem_)
        {
#if !defined(NDEBUG)
            for (int i = 0; i < stackCount_; ++i)
            {
                CV_DbgAssert( stacks_[i].isFree );
                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
            }
#endif

            cudaFree(mem_);

            mem_ = 0;
            initialized_ = false;
        }
    }

    MemoryStack* MemoryPool::getFreeMemStack()
    {
        AutoLock lock(mtx_);

        if (!initialized_)
            initializeImpl();

        if (!mem_)
            return 0;

        for (int i = 0; i < stackCount_; ++i)
        {
            if (stacks_[i].isFree)
            {
                stacks_[i].isFree = false;
                return &stacks_[i];
            }
        }

        return 0;
    }

    void MemoryPool::returnMemStack(MemoryStack* memStack)
    {
        AutoLock lock(mtx_);

        CV_DbgAssert( !memStack->isFree );

#if !defined(NDEBUG)
        bool found = false;
        for (int i = 0; i < stackCount_; ++i)
        {
            if (memStack == &stacks_[i])
            {
                found = true;
                break;
            }
        }
        CV_DbgAssert( found );
#endif

        CV_DbgAssert( memStack->tip == memStack->datastart );

        memStack->isFree = true;
    }
}

#endif
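
// Illustrative sketch (not part of this file): the checkout/return protocol the
// pool expects. A caller takes a whole stack, sub-allocates from it, and must
// fully unwind the stack before handing it back.
//
//     MemoryPool pool;                        // lazily allocates with the 10 MB x 5 defaults
//     MemoryStack* stack = pool.getFreeMemStack();
//     if (stack)
//     {
//         uchar* p = stack->requestMemory(1024);
//         // ... use p on the device ...
//         stack->returnMemory(p);             // tip back at datastart
//         pool.returnMemStack(stack);         // asserts the stack is unwound
//     }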

////////////////////////////////////////////////////////////////
/// Stream::Impl

#ifndef HAVE_CUDA

class cv::cuda::Stream::Impl
{
public:
    Impl(void* ptr = 0)
    {
        (void) ptr;
        throw_no_cuda();
    }
};

#else

namespace
{
    class StackAllocator;
}

class cv::cuda::Stream::Impl
{
public:
    cudaStream_t stream;
    Ptr<StackAllocator> stackAllocator_;

    Impl();
    Impl(cudaStream_t stream);

    ~Impl();
};

cv::cuda::Stream::Impl::Impl() : stream(0)
{
    cudaSafeCall( cudaStreamCreate(&stream) );

    stackAllocator_ = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
{
    stackAllocator_ = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::~Impl()
{
    stackAllocator_.release();

    if (stream)
        cudaStreamDestroy(stream);
}

#endif

/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer

#ifdef HAVE_CUDA

namespace cv { namespace cuda
{
    // Lazily creates one null stream and one memory pool per CUDA device.
    class DefaultDeviceInitializer
    {
    public:
        DefaultDeviceInitializer();
        ~DefaultDeviceInitializer();

        Stream& getNullStream(int deviceId);
        MemoryPool* getMemoryPool(int deviceId);

    private:
        void initStreams();
        void initPools();

        std::vector<Ptr<Stream> > streams_;
        Mutex streams_mtx_;

        std::vector<MemoryPool> pools_;
        Mutex pools_mtx_;
    };

    DefaultDeviceInitializer::DefaultDeviceInitializer()
    {
    }

    DefaultDeviceInitializer::~DefaultDeviceInitializer()
    {
        streams_.clear();

        for (size_t i = 0; i < pools_.size(); ++i)
        {
            cudaSetDevice(static_cast<int>(i));
            pools_[i].release();
        }

        pools_.clear();
    }

    Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
    {
        AutoLock lock(streams_mtx_);

        if (streams_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                streams_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );

        if (streams_[deviceId].empty())
        {
            cudaStream_t stream = NULL;
            Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
            streams_[deviceId] = Ptr<Stream>(new Stream(impl));
        }

        return *streams_[deviceId];
    }

    MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
    {
        AutoLock lock(pools_mtx_);

        if (pools_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                pools_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );

        return &pools_[deviceId];
    }

    DefaultDeviceInitializer initializer;
}}

#endif

/////////////////////////////////////////////////////////////
/// Stream

cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>();
#endif
}

bool cv::cuda::Stream::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaStreamQuery(impl_->stream);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Stream::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
#endif
}

void cv::cuda::Stream::waitEvent(const Event& event)
{
#ifndef HAVE_CUDA
    (void) event;
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
#endif
}

#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)

namespace
{
    struct CallbackData
    {
        Stream::StreamCallback callback;
        void* userData;

        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
    };

    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
    {
        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
        data->callback(static_cast<int>(status), data->userData);
        delete data;
    }
}

#endif

void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
{
#ifndef HAVE_CUDA
    (void) callback;
    (void) userData;
    throw_no_cuda();
#else
    #if CUDART_VERSION < 5000
        (void) callback;
        (void) userData;
        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
    #else
        CallbackData* data = new CallbackData(callback, userData);

        cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
    #endif
#endif
}
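
// Illustrative sketch (not part of this file): enqueueHostCallback runs a host
// function once all previously enqueued work on the stream has finished. Per the
// CUDA runtime rules, the callback runs on a driver thread and must not make
// CUDA API calls itself.
//
//     static void onDone(int status, void* userData)
//     {
//         // status is the stream's cudaError_t, cast to int
//     }
//
//     cv::cuda::Stream stream;
//     // ... enqueue async uploads / kernels on `stream` ...
//     stream.enqueueHostCallback(onDone, 0);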

Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    static Stream stream;
    return stream;
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}

cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}

#ifdef HAVE_CUDA

cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}

#endif
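
// Illustrative sketch (not part of this file): StreamAccessor is the escape hatch
// for launching user code on the same queue as OpenCV's async calls. The kernel
// name below is a hypothetical placeholder.
//
//     cv::cuda::Stream stream;
//     cudaStream_t raw = cv::cuda::StreamAccessor::getStream(stream);
//     // myKernel<<<grid, block, 0, raw>>>(...);   // hypothetical user kernel
//     stream.waitForCompletion();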

/////////////////////////////////////////////////////////////
/// StackAllocator

#ifdef HAVE_CUDA

namespace
{
    bool enableMemoryPool = true;

    // GpuMat allocator that carves allocations out of one MemoryStack checked out
    // from the per-device pool; it lives and dies with a single stream.
    class StackAllocator : public GpuMat::Allocator
    {
    public:
        explicit StackAllocator(cudaStream_t stream);
        ~StackAllocator();

        bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
        void free(GpuMat* mat);

    private:
        StackAllocator(const StackAllocator&);
        StackAllocator& operator =(const StackAllocator&);

        cudaStream_t stream_;
        MemoryStack* memStack_;
        size_t alignment_;
    };

    StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0), alignment_(0)
    {
        if (enableMemoryPool)
        {
            const int deviceId = getDevice();
            memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
            DeviceInfo devInfo(deviceId);
            alignment_ = devInfo.textureAlignment();
        }
    }

    StackAllocator::~StackAllocator()
    {
        cudaStreamSynchronize(stream_);

        if (memStack_ != 0)
            memStack_->pool->returnMemStack(memStack_);
    }

    // Round `what` up to the nearest multiple of `alignment` (a power of two).
    size_t alignUp(size_t what, size_t alignment)
    {
        size_t alignMask = alignment - 1;
        size_t inverseAlignMask = ~alignMask;
        size_t res = (what + alignMask) & inverseAlignMask;
        return res;
    }
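
    // Worked example: with a 512-byte texture alignment, a 1000-byte row rounds
    // up to a 1024-byte pitch:
    //   alignUp(1000, 512) == (1000 + 511) & ~511 == 1511 & ~511 == 1024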

    bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
    {
        if (memStack_ == 0)
            return false;

        size_t pitch, memSize;

        if (rows > 1 && cols > 1)
        {
            pitch = alignUp(cols * elemSize, alignment_);
            memSize = pitch * rows;
        }
        else
        {
            // Single row or single column must be continuous
            pitch = elemSize * cols;
            memSize = alignUp(elemSize * cols * rows, 64);
        }

        uchar* ptr = memStack_->requestMemory(memSize);

        if (ptr == 0)
            return false;

        mat->data = ptr;
        mat->step = pitch;
        mat->refcount = (int*) fastMalloc(sizeof(int));

        return true;
    }

    void StackAllocator::free(GpuMat* mat)
    {
        if (memStack_ == 0)
            return;

        memStack_->returnMemory(mat->datastart);
        fastFree(mat->refcount);
    }
}

#endif

/////////////////////////////////////////////////////////////
/// BufferPool

void cv::cuda::setBufferPoolUsage(bool on)
{
#ifndef HAVE_CUDA
    (void) on;
    throw_no_cuda();
#else
    enableMemoryPool = on;
#endif
}

void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
#ifndef HAVE_CUDA
    (void) deviceId;
    (void) stackSize;
    (void) stackCount;
    throw_no_cuda();
#else
    const int currentDevice = getDevice();

    if (deviceId >= 0)
    {
        setDevice(deviceId);
        initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
    }
    else
    {
        // deviceId < 0 means "configure every CUDA-capable device"
        const int deviceCount = getCudaEnabledDeviceCount();

        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
        {
            setDevice(deviceId);
            initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
        }
    }

    setDevice(currentDevice);
#endif
}

#ifdef HAVE_CUDA

cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
{
}

GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
}

#endif
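
// Illustrative sketch (not part of this file): typical BufferPool use. Configure
// the per-device pool before creating streams, take scratch buffers from a
// stream's pool, and release them in LIFO order when done.
//
//     cv::cuda::setBufferPoolUsage(true);
//     cv::cuda::setBufferPoolConfig(cv::cuda::getDevice(), 1024 * 1024 * 64, 2);
//
//     cv::cuda::Stream stream;
//     cv::cuda::BufferPool pool(stream);
//
//     cv::cuda::GpuMat d1 = pool.getBuffer(4096, 4096, CV_8UC1); // from the stack
//     cv::cuda::GpuMat d2 = pool.getBuffer(1024, 1024, CV_8UC3);
//     // ... async work on d1/d2 using `stream` ...
//     d2.release();  // StackAllocator requires LIFO release order
//     d1.release();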

////////////////////////////////////////////////////////////////
// Event

#ifndef HAVE_CUDA

class cv::cuda::Event::Impl
{
public:
    Impl(unsigned int)
    {
        throw_no_cuda();
    }
};

#else

class cv::cuda::Event::Impl
{
public:
    cudaEvent_t event;

    Impl(unsigned int flags);
    ~Impl();
};

cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0)
{
    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
}

cv::cuda::Event::Impl::~Impl()
{
    if (event)
        cudaEventDestroy(event);
}

cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
{
    return event.impl_->event;
}

#endif

cv::cuda::Event::Event(CreateFlags flags)
{
#ifndef HAVE_CUDA
    (void) flags;
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}

void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    (void) stream;
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}

bool cv::cuda::Event::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaEventQuery(impl_->event);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Event::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventSynchronize(impl_->event) );
#endif
}

float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
{
#ifndef HAVE_CUDA
    (void) start;
    (void) end;
    throw_no_cuda();
    return 0.0f;
#else
    float ms;
    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
    return ms;
#endif
}
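
// Illustrative sketch (not part of this file): timing GPU work with a pair of
// events. elapsedTime requires both events to have completed, hence the sync.
//
//     cv::cuda::Stream stream;
//     cv::cuda::Event begin, finish;
//
//     begin.record(stream);
//     // ... async uploads / kernels on `stream` ...
//     finish.record(stream);
//
//     finish.waitForCompletion();
//     float ms = cv::cuda::Event::elapsedTime(begin, finish);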