1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4 
5 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
6 // Third party copyrights are property of their respective owners.
7 
8 #include "precomp.hpp"
9 #ifndef __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
10 #define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
11 
12 #include "opencl_kernels_photo.hpp"
13 
14 #ifdef HAVE_OPENCL
15 
16 namespace cv {
17 
18 enum
19 {
20     BLOCK_ROWS = 32,
21     BLOCK_COLS = 32,
22     CTA_SIZE_INTEL = 64,
23     CTA_SIZE_DEFAULT = 256
24 };
25 
divUp(int a,int b)26 static int divUp(int a, int b)
27 {
28     return (a + b - 1) / b;
29 }
30 
31 template <typename FT, typename ST, typename WT>
ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,int searchWindowSize,int templateWindowSize,const FT * h,int hn,int cn,int normType,int & almostTemplateWindowSizeSqBinShift)32 static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,
33                                       int searchWindowSize, int templateWindowSize,
34                                       const FT *h, int hn, int cn, int normType,
35                                       int & almostTemplateWindowSizeSqBinShift)
36 {
37     const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
38         std::numeric_limits<ST>::max();
39     int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
40                                            std::numeric_limits<int>::max());
41     int depth = DataType<FT>::depth;
42     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
43 
44     if (depth == CV_64F && !doubleSupport)
45         return false;
46 
47     // precalc weight for every possible l2 dist between blocks
48     // additional optimization of precalced weights to replace division(averaging) by binary shift
49     CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX)
50     int templateWindowSizeSq = templateWindowSize * templateWindowSize;
51     almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq);
52     FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
53 
54     const FT WEIGHT_THRESHOLD = 1e-3f;
55     int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn :
56         std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
57     int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
58     FT den[4];
59     CV_Assert(hn > 0 && hn <= 4);
60     for (int i=0; i<hn; i++)
61         den[i] = 1.0f / (h[i] * h[i] * cn);
62 
63     almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
64 
65     char buf[40];
66     ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
67                   format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
68                          " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
69                          ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
70                          ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
71                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
72                          normType == NORM_L1 ? " -D ABS" : ""));
73     if (k.empty())
74         return false;
75 
76     k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
77            almostDist2ActualDistMultiplier, fixedPointMult,
78            ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
79 
80     size_t globalsize[1] = { almostMaxDist };
81     return k.run(1, globalsize, NULL, false);
82 }
83 
ocl_fastNlMeansDenoising(InputArray _src,OutputArray _dst,const float * h,int hn,int templateWindowSize,int searchWindowSize,int normType)84 static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
85                                      int templateWindowSize, int searchWindowSize, int normType)
86 {
87     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
88     int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
89     Size size = _src.size();
90 
91     if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) &&
92                              (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U))))
93         return false;
94 
95     int templateWindowHalfWize = templateWindowSize / 2;
96     int searchWindowHalfSize = searchWindowSize / 2;
97     templateWindowSize  = templateWindowHalfWize * 2 + 1;
98     searchWindowSize = searchWindowHalfSize * 2 + 1;
99     int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
100     int almostTemplateWindowSizeSqBinShift = -1;
101 
102     char buf[4][40];
103     String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
104                          " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
105                          " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
106                          " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
107                          " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
108                          " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
109                          templateWindowSize, searchWindowSize,
110                          ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
111                          ocl::typeToStr(CV_32SC(hn)),
112                          depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
113                          format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
114                          depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
115                          format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
116                          depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
117                          format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
118                          depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
119                          format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
120                          BLOCK_COLS, BLOCK_ROWS,
121                          ctaSize, templateWindowHalfWize, searchWindowHalfSize,
122                          ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
123                          (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
124                          ocl::convertTypeStr(CV_32S, depth, cn, buf[3]),
125                          normType == NORM_L1 ? " -D ABS" : "");
126 
127     ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
128     if (k.empty())
129         return false;
130 
131     UMat almostDist2Weight;
132     if ((depth == CV_8U &&
133          !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
134                                                        searchWindowSize, templateWindowSize,
135                                                        h, hn, cn, normType,
136                                                        almostTemplateWindowSizeSqBinShift)) ||
137         (depth == CV_16U &&
138          !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
139                                                           searchWindowSize, templateWindowSize,
140                                                           h, hn, cn, normType,
141                                                           almostTemplateWindowSizeSqBinShift)))
142         return false;
143     CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
144 
145     UMat srcex;
146     int borderSize = searchWindowHalfSize + templateWindowHalfWize;
147     if (cn == 3) {
148         srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4));
149         UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height));
150         int from_to[] = { 0,0, 1,1, 2,2 };
151         mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3);
152         copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize,
153                        BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place
154     }
155     else
156         copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
157 
158     _dst.create(size, type);
159     UMat dst;
160     if (cn == 3)
161         dst.create(size, CV_MAKE_TYPE(depth, 4));
162     else
163         dst = _dst.getUMat();
164 
165     int searchWindowSizeSq = searchWindowSize * searchWindowSize;
166     Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
167     Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
168     UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));
169 
170     srcex = srcex(Rect(Point(borderSize, borderSize), size));
171     k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
172            ocl::KernelArg::PtrReadOnly(almostDist2Weight),
173            ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
174 
175     size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
176     if (!k.run(2, globalsize, localsize, false)) return false;
177 
178     if (cn == 3) {
179         int from_to[] = { 0,0, 1,1, 2,2 };
180         mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
181     }
182 
183     return true;
184 }
185 
ocl_fastNlMeansDenoisingColored(InputArray _src,OutputArray _dst,float h,float hForColorComponents,int templateWindowSize,int searchWindowSize)186 static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
187                                       float h, float hForColorComponents,
188                                       int templateWindowSize, int searchWindowSize)
189 {
190     UMat src = _src.getUMat();
191     _dst.create(src.size(), src.type());
192     UMat dst = _dst.getUMat();
193 
194     UMat src_lab;
195     cvtColor(src, src_lab, COLOR_LBGR2Lab);
196 
197     UMat l(src.size(), CV_8U);
198     UMat ab(src.size(), CV_8UC2);
199     std::vector<UMat> l_ab(2), l_ab_denoised(2);
200     l_ab[0] = l;
201     l_ab[1] = ab;
202     l_ab_denoised[0].create(src.size(), CV_8U);
203     l_ab_denoised[1].create(src.size(), CV_8UC2);
204 
205     int from_to[] = { 0,0, 1,1, 2,2 };
206     mixChannels(std::vector<UMat>(1, src_lab), l_ab, from_to, 3);
207 
208     fastNlMeansDenoising(l_ab[0], l_ab_denoised[0], h, templateWindowSize, searchWindowSize);
209     fastNlMeansDenoising(l_ab[1], l_ab_denoised[1], hForColorComponents, templateWindowSize, searchWindowSize);
210 
211     UMat dst_lab(src.size(), CV_8UC3);
212     mixChannels(l_ab_denoised, std::vector<UMat>(1, dst_lab), from_to, 3);
213 
214     cvtColor(dst_lab, dst, COLOR_Lab2LBGR, src.channels());
215     return true;
216 }
217 
218 }
219 
220 #endif
221 #endif
222