/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"

using namespace cv;
using namespace cv::cuda;

/////////////////////////////////////////////////////////////
/// MemoryStack

#ifdef HAVE_CUDA

namespace
{
    class MemoryPool;

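    // MemoryStack is a simple bump (stack/LIFO) allocator over a fixed device
    // memory range: requestMemory() advances the tip, returnMemory() rewinds it.
    // Allocations must therefore be released in reverse order of allocation;
    // the debug-only `allocations` vector asserts that discipline.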
    class MemoryStack
    {
    public:
        uchar* requestMemory(size_t size);
        void returnMemory(uchar* ptr);

        uchar* datastart;
        uchar* dataend;
        uchar* tip;

        bool isFree;
        MemoryPool* pool;

    #if !defined(NDEBUG)
        std::vector<size_t> allocations;
    #endif
    };

    uchar* MemoryStack::requestMemory(size_t size)
    {
        const size_t freeMem = dataend - tip;

        if (size > freeMem)
            return 0;

        uchar* ptr = tip;

        tip += size;

    #if !defined(NDEBUG)
        allocations.push_back(size);
    #endif

        return ptr;
    }

    void MemoryStack::returnMemory(uchar* ptr)
    {
        CV_DbgAssert( ptr >= datastart && ptr < dataend );

    #if !defined(NDEBUG)
        const size_t allocSize = tip - ptr;
        CV_Assert( allocSize == allocations.back() );
        allocations.pop_back();
    #endif

        tip = ptr;
    }
}

#endif

/////////////////////////////////////////////////////////////
/// MemoryPool

#ifdef HAVE_CUDA

namespace
{
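    // MemoryPool owns one large cudaMalloc'ed block per device, carved into
    // stackCount_ equal MemoryStack ranges. Stacks are handed out one per
    // Stream (via StackAllocator) and recycled when the stream is destroyed.
    // Allocation of the device block is deferred until the pool is first used.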
    class MemoryPool
    {
    public:
        MemoryPool();

        void initialize(size_t stackSize, int stackCount);
        void release();

        MemoryStack* getFreeMemStack();
        void returnMemStack(MemoryStack* memStack);

    private:
        void initializeImpl();

        Mutex mtx_;

        bool initialized_;
        size_t stackSize_;
        int stackCount_;

        uchar* mem_;

        std::vector<MemoryStack> stacks_;
    };

    MemoryPool::MemoryPool() : initialized_(false), mem_(0)
    {
        // default : 10 MB, 5 stacks
        stackSize_ = 10 * 1024 * 1024;
        stackCount_ = 5;
    }

    void MemoryPool::initialize(size_t stackSize, int stackCount)
    {
        AutoLock lock(mtx_);

        release();

        stackSize_ = stackSize;
        stackCount_ = stackCount;

        initializeImpl();
    }

    void MemoryPool::initializeImpl()
    {
        const size_t totalSize = stackSize_ * stackCount_;

        if (totalSize > 0)
        {
            cudaError_t err = cudaMalloc(&mem_, totalSize);
            if (err != cudaSuccess)
                return;

            stacks_.resize(stackCount_);

            uchar* ptr = mem_;

            for (int i = 0; i < stackCount_; ++i)
            {
                stacks_[i].datastart = ptr;
                stacks_[i].dataend = ptr + stackSize_;
                stacks_[i].tip = ptr;
                stacks_[i].isFree = true;
                stacks_[i].pool = this;

                ptr += stackSize_;
            }

            initialized_ = true;
        }
    }

    void MemoryPool::release()
    {
        if (mem_)
        {
#if !defined(NDEBUG)
            for (int i = 0; i < stackCount_; ++i)
            {
                CV_DbgAssert( stacks_[i].isFree );
                CV_DbgAssert( stacks_[i].tip == stacks_[i].datastart );
            }
#endif

            cudaFree(mem_);

            mem_ = 0;
            initialized_ = false;
        }
    }

    MemoryStack* MemoryPool::getFreeMemStack()
    {
        AutoLock lock(mtx_);

        if (!initialized_)
            initializeImpl();

        if (!mem_)
            return 0;

        for (int i = 0; i < stackCount_; ++i)
        {
            if (stacks_[i].isFree)
            {
                stacks_[i].isFree = false;
                return &stacks_[i];
            }
        }

        return 0;
    }

    void MemoryPool::returnMemStack(MemoryStack* memStack)
    {
        AutoLock lock(mtx_);

        CV_DbgAssert( !memStack->isFree );

#if !defined(NDEBUG)
        bool found = false;
        for (int i = 0; i < stackCount_; ++i)
        {
            if (memStack == &stacks_[i])
            {
                found = true;
                break;
            }
        }
        CV_DbgAssert( found );
#endif

        CV_DbgAssert( memStack->tip == memStack->datastart );

        memStack->isFree = true;
    }
}

#endif

////////////////////////////////////////////////////////////////
/// Stream::Impl

#ifndef HAVE_CUDA

class cv::cuda::Stream::Impl
{
public:
    Impl(void* ptr = 0)
    {
        (void) ptr;
        throw_no_cuda();
    }
};

#else

namespace
{
    class StackAllocator;
}

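// Pimpl for cv::cuda::Stream: owns the raw cudaStream_t plus a per-stream
// StackAllocator that backs BufferPool allocations on this stream.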
class cv::cuda::Stream::Impl
{
public:
    cudaStream_t stream;
    Ptr<StackAllocator> stackAllocator_;

    Impl();
    Impl(cudaStream_t stream);

    ~Impl();
};

cv::cuda::Stream::Impl::Impl() : stream(0)
{
    cudaSafeCall( cudaStreamCreate(&stream) );

    stackAllocator_ = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::Impl(cudaStream_t stream_) : stream(stream_)
{
    stackAllocator_ = makePtr<StackAllocator>(stream);
}

cv::cuda::Stream::Impl::~Impl()
{
    stackAllocator_.release();

    if (stream)
        cudaStreamDestroy(stream);
}

#endif

/////////////////////////////////////////////////////////////
/// DefaultDeviceInitializer

#ifdef HAVE_CUDA

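// DefaultDeviceInitializer lazily creates, per CUDA device, the null (default)
// Stream wrapper and the MemoryPool used by BufferPool. Both containers are
// sized on first use from getCudaEnabledDeviceCount() and guarded by mutexes.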
namespace cv { namespace cuda
{
    class DefaultDeviceInitializer
    {
    public:
        DefaultDeviceInitializer();
        ~DefaultDeviceInitializer();

        Stream& getNullStream(int deviceId);
        MemoryPool* getMemoryPool(int deviceId);

    private:
        void initStreams();
        void initPools();

        std::vector<Ptr<Stream> > streams_;
        Mutex streams_mtx_;

        std::vector<MemoryPool> pools_;
        Mutex pools_mtx_;
    };

    DefaultDeviceInitializer::DefaultDeviceInitializer()
    {
    }

    DefaultDeviceInitializer::~DefaultDeviceInitializer()
    {
        streams_.clear();

        for (size_t i = 0; i < pools_.size(); ++i)
        {
            cudaSetDevice(static_cast<int>(i));
            pools_[i].release();
        }

        pools_.clear();
    }

    Stream& DefaultDeviceInitializer::getNullStream(int deviceId)
    {
        AutoLock lock(streams_mtx_);

        if (streams_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                streams_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(streams_.size()) );

        if (streams_[deviceId].empty())
        {
            cudaStream_t stream = NULL;
            Ptr<Stream::Impl> impl = makePtr<Stream::Impl>(stream);
            streams_[deviceId] = Ptr<Stream>(new Stream(impl));
        }

        return *streams_[deviceId];
    }

    MemoryPool* DefaultDeviceInitializer::getMemoryPool(int deviceId)
    {
        AutoLock lock(pools_mtx_);

        if (pools_.empty())
        {
            int deviceCount = getCudaEnabledDeviceCount();

            if (deviceCount > 0)
                pools_.resize(deviceCount);
        }

        CV_DbgAssert( deviceId >= 0 && deviceId < static_cast<int>(pools_.size()) );

        return &pools_[deviceId];
    }

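    // Single translation-unit global: constructed during static initialization
    // and destroyed at shutdown, releasing every device's memory pool.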
    DefaultDeviceInitializer initializer;
}}

#endif

/////////////////////////////////////////////////////////////
/// Stream

cv::cuda::Stream::Stream()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>();
#endif
}

bool cv::cuda::Stream::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaStreamQuery(impl_->stream);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Stream::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamSynchronize(impl_->stream) );
#endif
}

void cv::cuda::Stream::waitEvent(const Event& event)
{
#ifndef HAVE_CUDA
    (void) event;
    throw_no_cuda();
#else
    cudaSafeCall( cudaStreamWaitEvent(impl_->stream, EventAccessor::getEvent(event), 0) );
#endif
}

#if defined(HAVE_CUDA) && (CUDART_VERSION >= 5000)

namespace
{
    struct CallbackData
    {
        Stream::StreamCallback callback;
        void* userData;

        CallbackData(Stream::StreamCallback callback_, void* userData_) : callback(callback_), userData(userData_) {}
    };

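    // Trampoline passed to cudaStreamAddCallback: forwards the status code to
    // the user callback, then frees the heap-allocated CallbackData exactly once.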
    void CUDART_CB cudaStreamCallback(cudaStream_t, cudaError_t status, void* userData)
    {
        CallbackData* data = reinterpret_cast<CallbackData*>(userData);
        data->callback(static_cast<int>(status), data->userData);
        delete data;
    }
}

#endif

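// A minimal usage sketch for enqueueHostCallback (the callback name below is
// illustrative; only the API itself is defined here). Note that, per the CUDA
// documentation, a stream callback must not make CUDA API calls:
//
//     void onStreamDone(int status, void* userData)
//     {
//         // runs on a driver thread once all prior work on the stream is done
//     }
//
//     cv::cuda::Stream stream;
//     stream.enqueueHostCallback(onStreamDone, 0);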
void cv::cuda::Stream::enqueueHostCallback(StreamCallback callback, void* userData)
{
#ifndef HAVE_CUDA
    (void) callback;
    (void) userData;
    throw_no_cuda();
#else
    #if CUDART_VERSION < 5000
        (void) callback;
        (void) userData;
        CV_Error(cv::Error::StsNotImplemented, "This function requires CUDA >= 5.0");
    #else
        CallbackData* data = new CallbackData(callback, userData);

        cudaSafeCall( cudaStreamAddCallback(impl_->stream, cudaStreamCallback, data, 0) );
    #endif
#endif
}

Stream& cv::cuda::Stream::Null()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    static Stream stream;
    return stream;
#else
    const int deviceId = getDevice();
    return initializer.getNullStream(deviceId);
#endif
}

cv::cuda::Stream::operator bool_type() const
{
#ifndef HAVE_CUDA
    return 0;
#else
    return (impl_->stream != 0) ? &Stream::this_type_does_not_support_comparisons : 0;
#endif
}

#ifdef HAVE_CUDA

cudaStream_t cv::cuda::StreamAccessor::getStream(const Stream& stream)
{
    return stream.impl_->stream;
}

#endif

/////////////////////////////////////////////////////////////
/// StackAllocator

#ifdef HAVE_CUDA

namespace
{
    bool enableMemoryPool = true;

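    // StackAllocator adapts one MemoryStack to the GpuMat::Allocator interface.
    // It grabs a free stack from the device's MemoryPool on construction and
    // returns it on destruction, after synchronizing its stream so no pending
    // work still references the memory being recycled.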
    class StackAllocator : public GpuMat::Allocator
    {
    public:
        explicit StackAllocator(cudaStream_t stream);
        ~StackAllocator();

        bool allocate(GpuMat* mat, int rows, int cols, size_t elemSize);
        void free(GpuMat* mat);

    private:
        StackAllocator(const StackAllocator&);
        StackAllocator& operator =(const StackAllocator&);

        cudaStream_t stream_;
        MemoryStack* memStack_;
        size_t alignment_;
    };

    StackAllocator::StackAllocator(cudaStream_t stream) : stream_(stream), memStack_(0)
    {
        if (enableMemoryPool)
        {
            const int deviceId = getDevice();
            memStack_ = initializer.getMemoryPool(deviceId)->getFreeMemStack();
            DeviceInfo devInfo(deviceId);
            alignment_ = devInfo.textureAlignment();
        }
    }

    StackAllocator::~StackAllocator()
    {
        cudaStreamSynchronize(stream_);

        if (memStack_ != 0)
            memStack_->pool->returnMemStack(memStack_);
    }

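    // Round `what` up to the next multiple of `alignment` (which must be a
    // power of two). For example, alignUp(1000, 512) == 1024 and
    // alignUp(1024, 512) == 1024.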
    size_t alignUp(size_t what, size_t alignment)
    {
        size_t alignMask = alignment-1;
        size_t inverseAlignMask = ~alignMask;
        size_t res = (what + alignMask) & inverseAlignMask;
        return res;
    }

    bool StackAllocator::allocate(GpuMat* mat, int rows, int cols, size_t elemSize)
    {
        if (memStack_ == 0)
            return false;

        size_t pitch, memSize;

        if (rows > 1 && cols > 1)
        {
            pitch = alignUp(cols * elemSize, alignment_);
            memSize = pitch * rows;
        }
        else
        {
            // Single row or single column must be continuous
            pitch = elemSize * cols;
            memSize = alignUp(elemSize * cols * rows, 64);
        }

        uchar* ptr = memStack_->requestMemory(memSize);

        if (ptr == 0)
            return false;

        mat->data = ptr;
        mat->step = pitch;
        mat->refcount = (int*) fastMalloc(sizeof(int));

        return true;
    }

    void StackAllocator::free(GpuMat* mat)
    {
        if (memStack_ == 0)
            return;

        memStack_->returnMemory(mat->datastart);
        fastFree(mat->refcount);
    }
}

#endif

/////////////////////////////////////////////////////////////
/// BufferPool

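// A minimal usage sketch of the buffer pool (assumes a CUDA-capable device;
// sizes are illustrative). setBufferPoolUsage must be called before any Stream
// is created, since each Stream picks up its StackAllocator at construction.
// Buffers drawn from one stream's pool must be released in LIFO order, which
// happens naturally with scoped GpuMats:
//
//     cv::cuda::setBufferPoolUsage(true);
//     cv::cuda::setBufferPoolConfig(-1, 32 * 1024 * 1024, 4); // all devices
//
//     cv::cuda::Stream stream;
//     cv::cuda::BufferPool pool(stream);
//     cv::cuda::GpuMat buf = pool.getBuffer(1024, 1024, CV_8UC1);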
void cv::cuda::setBufferPoolUsage(bool on)
{
#ifndef HAVE_CUDA
    (void)on;
    throw_no_cuda();
#else
    enableMemoryPool = on;
#endif
}

void cv::cuda::setBufferPoolConfig(int deviceId, size_t stackSize, int stackCount)
{
#ifndef HAVE_CUDA
    (void)deviceId;
    (void)stackSize;
    (void)stackCount;
    throw_no_cuda();
#else
    const int currentDevice = getDevice();

    if (deviceId >= 0)
    {
        setDevice(deviceId);
        initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
    }
    else
    {
        const int deviceCount = getCudaEnabledDeviceCount();

        for (deviceId = 0; deviceId < deviceCount; ++deviceId)
        {
            setDevice(deviceId);
            initializer.getMemoryPool(deviceId)->initialize(stackSize, stackCount);
        }
    }

    setDevice(currentDevice);
#endif
}

#ifdef HAVE_CUDA

cv::cuda::BufferPool::BufferPool(Stream& stream) : allocator_(stream.impl_->stackAllocator_.get())
{
}

GpuMat cv::cuda::BufferPool::getBuffer(int rows, int cols, int type)
{
    GpuMat buf(allocator_);
    buf.create(rows, cols, type);
    return buf;
}

#endif

////////////////////////////////////////////////////////////////
// Event

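// A minimal timing sketch using events (names are illustrative; assumes work
// is enqueued on `stream` between the two record() calls):
//
//     cv::cuda::Event start, stop;
//     start.record(stream);
//     // ... enqueue asynchronous work on `stream` ...
//     stop.record(stream);
//     stop.waitForCompletion();
//     float ms = cv::cuda::Event::elapsedTime(start, stop);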
#ifndef HAVE_CUDA

class cv::cuda::Event::Impl
{
public:
    Impl(unsigned int)
    {
        throw_no_cuda();
    }
};

#else

class cv::cuda::Event::Impl
{
public:
    cudaEvent_t event;

    Impl(unsigned int flags);
    ~Impl();
};

cv::cuda::Event::Impl::Impl(unsigned int flags) : event(0)
{
    cudaSafeCall( cudaEventCreateWithFlags(&event, flags) );
}

cv::cuda::Event::Impl::~Impl()
{
    if (event)
        cudaEventDestroy(event);
}

cudaEvent_t cv::cuda::EventAccessor::getEvent(const Event& event)
{
    return event.impl_->event;
}

#endif

cv::cuda::Event::Event(CreateFlags flags)
{
#ifndef HAVE_CUDA
    (void) flags;
    throw_no_cuda();
#else
    impl_ = makePtr<Impl>(flags);
#endif
}

void cv::cuda::Event::record(Stream& stream)
{
#ifndef HAVE_CUDA
    (void) stream;
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventRecord(impl_->event, StreamAccessor::getStream(stream)) );
#endif
}

bool cv::cuda::Event::queryIfComplete() const
{
#ifndef HAVE_CUDA
    throw_no_cuda();
    return false;
#else
    cudaError_t err = cudaEventQuery(impl_->event);

    if (err == cudaErrorNotReady || err == cudaSuccess)
        return err == cudaSuccess;

    cudaSafeCall(err);
    return false;
#endif
}

void cv::cuda::Event::waitForCompletion()
{
#ifndef HAVE_CUDA
    throw_no_cuda();
#else
    cudaSafeCall( cudaEventSynchronize(impl_->event) );
#endif
}

float cv::cuda::Event::elapsedTime(const Event& start, const Event& end)
{
#ifndef HAVE_CUDA
    (void) start;
    (void) end;
    throw_no_cuda();
    return 0.0f;
#else
    float ms;
    cudaSafeCall( cudaEventElapsedTime(&ms, start.impl_->event, end.impl_->event) );
    return ms;
#endif
}