1 // This file is part of OpenCV project.
2 // It is subject to the license terms in the LICENSE file found in the top-level directory
3 // of this distribution and at http://opencv.org/license.html.
4
5 // Copyright (C) 2014, Advanced Micro Devices, Inc., all rights reserved.
6 // Third party copyrights are property of their respective owners.
7
8 #include "precomp.hpp"
9 #ifndef __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
10 #define __OPENCV_FAST_NLMEANS_DENOISING_OPENCL_HPP__
11
12 #include "opencl_kernels_photo.hpp"
13
14 #ifdef HAVE_OPENCL
15
16 namespace cv {
17
18 enum
19 {
20 BLOCK_ROWS = 32,
21 BLOCK_COLS = 32,
22 CTA_SIZE_INTEL = 64,
23 CTA_SIZE_DEFAULT = 256
24 };
25
divUp(int a,int b)26 static int divUp(int a, int b)
27 {
28 return (a + b - 1) / b;
29 }
30
31 template <typename FT, typename ST, typename WT>
ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,int searchWindowSize,int templateWindowSize,const FT * h,int hn,int cn,int normType,int & almostTemplateWindowSizeSqBinShift)32 static bool ocl_calcAlmostDist2Weight(UMat & almostDist2Weight,
33 int searchWindowSize, int templateWindowSize,
34 const FT *h, int hn, int cn, int normType,
35 int & almostTemplateWindowSizeSqBinShift)
36 {
37 const WT maxEstimateSumValue = searchWindowSize * searchWindowSize *
38 std::numeric_limits<ST>::max();
39 int fixedPointMult = (int)std::min<WT>(std::numeric_limits<WT>::max() / maxEstimateSumValue,
40 std::numeric_limits<int>::max());
41 int depth = DataType<FT>::depth;
42 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
43
44 if (depth == CV_64F && !doubleSupport)
45 return false;
46
47 // precalc weight for every possible l2 dist between blocks
48 // additional optimization of precalced weights to replace division(averaging) by binary shift
49 CV_Assert(templateWindowSize <= 46340); // sqrt(INT_MAX)
50 int templateWindowSizeSq = templateWindowSize * templateWindowSize;
51 almostTemplateWindowSizeSqBinShift = getNearestPowerOf2(templateWindowSizeSq);
52 FT almostDist2ActualDistMultiplier = (FT)(1 << almostTemplateWindowSizeSqBinShift) / templateWindowSizeSq;
53
54 const FT WEIGHT_THRESHOLD = 1e-3f;
55 int maxDist = normType == NORM_L1 ? std::numeric_limits<ST>::max() * cn :
56 std::numeric_limits<ST>::max() * std::numeric_limits<ST>::max() * cn;
57 int almostMaxDist = (int)(maxDist / almostDist2ActualDistMultiplier + 1);
58 FT den[4];
59 CV_Assert(hn > 0 && hn <= 4);
60 for (int i=0; i<hn; i++)
61 den[i] = 1.0f / (h[i] * h[i] * cn);
62
63 almostDist2Weight.create(1, almostMaxDist, CV_32SC(hn == 3 ? 4 : hn));
64
65 char buf[40];
66 ocl::Kernel k("calcAlmostDist2Weight", ocl::photo::nlmeans_oclsrc,
67 format("-D OP_CALC_WEIGHTS -D FT=%s -D w_t=%s"
68 " -D wlut_t=%s -D convert_wlut_t=%s%s%s",
69 ocl::typeToStr(depth), ocl::typeToStr(CV_MAKE_TYPE(depth, hn)),
70 ocl::typeToStr(CV_32SC(hn)), ocl::convertTypeStr(depth, CV_32S, hn, buf),
71 doubleSupport ? " -D DOUBLE_SUPPORT" : "",
72 normType == NORM_L1 ? " -D ABS" : ""));
73 if (k.empty())
74 return false;
75
76 k.args(ocl::KernelArg::PtrWriteOnly(almostDist2Weight), almostMaxDist,
77 almostDist2ActualDistMultiplier, fixedPointMult,
78 ocl::KernelArg::Constant(den, (hn == 3 ? 4 : hn)*sizeof(FT)), WEIGHT_THRESHOLD);
79
80 size_t globalsize[1] = { almostMaxDist };
81 return k.run(1, globalsize, NULL, false);
82 }
83
ocl_fastNlMeansDenoising(InputArray _src,OutputArray _dst,const float * h,int hn,int templateWindowSize,int searchWindowSize,int normType)84 static bool ocl_fastNlMeansDenoising(InputArray _src, OutputArray _dst, const float *h, int hn,
85 int templateWindowSize, int searchWindowSize, int normType)
86 {
87 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
88 int ctaSize = ocl::Device::getDefault().isIntel() ? CTA_SIZE_INTEL : CTA_SIZE_DEFAULT;
89 Size size = _src.size();
90
91 if (cn < 1 || cn > 4 || ((normType != NORM_L2 || depth != CV_8U) &&
92 (normType != NORM_L1 || (depth != CV_8U && depth != CV_16U))))
93 return false;
94
95 int templateWindowHalfWize = templateWindowSize / 2;
96 int searchWindowHalfSize = searchWindowSize / 2;
97 templateWindowSize = templateWindowHalfWize * 2 + 1;
98 searchWindowSize = searchWindowHalfSize * 2 + 1;
99 int nblocksx = divUp(size.width, BLOCK_COLS), nblocksy = divUp(size.height, BLOCK_ROWS);
100 int almostTemplateWindowSizeSqBinShift = -1;
101
102 char buf[4][40];
103 String opts = format("-D OP_CALC_FASTNLMEANS -D TEMPLATE_SIZE=%d -D SEARCH_SIZE=%d"
104 " -D pixel_t=%s -D int_t=%s -D wlut_t=%s"
105 " -D weight_t=%s -D convert_weight_t=%s -D sum_t=%s -D convert_sum_t=%s"
106 " -D BLOCK_COLS=%d -D BLOCK_ROWS=%d"
107 " -D CTA_SIZE=%d -D TEMPLATE_SIZE2=%d -D SEARCH_SIZE2=%d"
108 " -D convert_int_t=%s -D cn=%d -D psz=%d -D convert_pixel_t=%s%s",
109 templateWindowSize, searchWindowSize,
110 ocl::typeToStr(type), ocl::typeToStr(CV_32SC(cn)),
111 ocl::typeToStr(CV_32SC(hn)),
112 depth == CV_8U ? ocl::typeToStr(CV_32SC(hn)) :
113 format("long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
114 depth == CV_8U ? ocl::convertTypeStr(CV_32S, CV_32S, hn, buf[0]) :
115 format("convert_long%s", hn > 1 ? format("%d", hn).c_str() : "").c_str(),
116 depth == CV_8U ? ocl::typeToStr(CV_32SC(cn)) :
117 format("long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
118 depth == CV_8U ? ocl::convertTypeStr(depth, CV_32S, cn, buf[1]) :
119 format("convert_long%s", cn > 1 ? format("%d", cn).c_str() : "").c_str(),
120 BLOCK_COLS, BLOCK_ROWS,
121 ctaSize, templateWindowHalfWize, searchWindowHalfSize,
122 ocl::convertTypeStr(depth, CV_32S, cn, buf[2]), cn,
123 (depth == CV_8U ? sizeof(uchar) : sizeof(ushort)) * (cn == 3 ? 4 : cn),
124 ocl::convertTypeStr(CV_32S, depth, cn, buf[3]),
125 normType == NORM_L1 ? " -D ABS" : "");
126
127 ocl::Kernel k("fastNlMeansDenoising", ocl::photo::nlmeans_oclsrc, opts);
128 if (k.empty())
129 return false;
130
131 UMat almostDist2Weight;
132 if ((depth == CV_8U &&
133 !ocl_calcAlmostDist2Weight<float, uchar, int>(almostDist2Weight,
134 searchWindowSize, templateWindowSize,
135 h, hn, cn, normType,
136 almostTemplateWindowSizeSqBinShift)) ||
137 (depth == CV_16U &&
138 !ocl_calcAlmostDist2Weight<float, ushort, int64>(almostDist2Weight,
139 searchWindowSize, templateWindowSize,
140 h, hn, cn, normType,
141 almostTemplateWindowSizeSqBinShift)))
142 return false;
143 CV_Assert(almostTemplateWindowSizeSqBinShift >= 0);
144
145 UMat srcex;
146 int borderSize = searchWindowHalfSize + templateWindowHalfWize;
147 if (cn == 3) {
148 srcex.create(size.height + 2*borderSize, size.width + 2*borderSize, CV_MAKE_TYPE(depth, 4));
149 UMat src(srcex, Rect(borderSize, borderSize, size.width, size.height));
150 int from_to[] = { 0,0, 1,1, 2,2 };
151 mixChannels(std::vector<UMat>(1, _src.getUMat()), std::vector<UMat>(1, src), from_to, 3);
152 copyMakeBorder(src, srcex, borderSize, borderSize, borderSize, borderSize,
153 BORDER_DEFAULT|BORDER_ISOLATED); // create borders in place
154 }
155 else
156 copyMakeBorder(_src, srcex, borderSize, borderSize, borderSize, borderSize, BORDER_DEFAULT);
157
158 _dst.create(size, type);
159 UMat dst;
160 if (cn == 3)
161 dst.create(size, CV_MAKE_TYPE(depth, 4));
162 else
163 dst = _dst.getUMat();
164
165 int searchWindowSizeSq = searchWindowSize * searchWindowSize;
166 Size upColSumSize(size.width, searchWindowSizeSq * nblocksy);
167 Size colSumSize(nblocksx * templateWindowSize, searchWindowSizeSq * nblocksy);
168 UMat buffer(upColSumSize + colSumSize, CV_32SC(cn));
169
170 srcex = srcex(Rect(Point(borderSize, borderSize), size));
171 k.args(ocl::KernelArg::ReadOnlyNoSize(srcex), ocl::KernelArg::WriteOnly(dst),
172 ocl::KernelArg::PtrReadOnly(almostDist2Weight),
173 ocl::KernelArg::PtrReadOnly(buffer), almostTemplateWindowSizeSqBinShift);
174
175 size_t globalsize[2] = { nblocksx * ctaSize, nblocksy }, localsize[2] = { ctaSize, 1 };
176 if (!k.run(2, globalsize, localsize, false)) return false;
177
178 if (cn == 3) {
179 int from_to[] = { 0,0, 1,1, 2,2 };
180 mixChannels(std::vector<UMat>(1, dst), std::vector<UMat>(1, _dst.getUMat()), from_to, 3);
181 }
182
183 return true;
184 }
185
ocl_fastNlMeansDenoisingColored(InputArray _src,OutputArray _dst,float h,float hForColorComponents,int templateWindowSize,int searchWindowSize)186 static bool ocl_fastNlMeansDenoisingColored( InputArray _src, OutputArray _dst,
187 float h, float hForColorComponents,
188 int templateWindowSize, int searchWindowSize)
189 {
190 UMat src = _src.getUMat();
191 _dst.create(src.size(), src.type());
192 UMat dst = _dst.getUMat();
193
194 UMat src_lab;
195 cvtColor(src, src_lab, COLOR_LBGR2Lab);
196
197 UMat l(src.size(), CV_8U);
198 UMat ab(src.size(), CV_8UC2);
199 std::vector<UMat> l_ab(2), l_ab_denoised(2);
200 l_ab[0] = l;
201 l_ab[1] = ab;
202 l_ab_denoised[0].create(src.size(), CV_8U);
203 l_ab_denoised[1].create(src.size(), CV_8UC2);
204
205 int from_to[] = { 0,0, 1,1, 2,2 };
206 mixChannels(std::vector<UMat>(1, src_lab), l_ab, from_to, 3);
207
208 fastNlMeansDenoising(l_ab[0], l_ab_denoised[0], h, templateWindowSize, searchWindowSize);
209 fastNlMeansDenoising(l_ab[1], l_ab_denoised[1], hForColorComponents, templateWindowSize, searchWindowSize);
210
211 UMat dst_lab(src.size(), CV_8UC3);
212 mixChannels(l_ab_denoised, std::vector<UMat>(1, dst_lab), from_to, 3);
213
214 cvtColor(dst_lab, dst, COLOR_Lab2LBGR, src.channels());
215 return true;
216 }
217
218 }
219
220 #endif
221 #endif
222