1 #include <iostream>                   // Console I/O
2 #include <sstream>                    // String to number conversion
3 
4 #include <opencv2/core.hpp>      // Basic OpenCV structures
5 #include <opencv2/core/utility.hpp>
6 #include <opencv2/imgproc.hpp>// Image processing methods for the CPU
7 #include <opencv2/imgcodecs.hpp>// Read images
8 
9 // CUDA structures and methods
10 #include <opencv2/cudaarithm.hpp>
11 #include <opencv2/cudafilters.hpp>
12 
13 using namespace std;
14 using namespace cv;
15 
16 double getPSNR(const Mat& I1, const Mat& I2);      // CPU versions
17 Scalar getMSSIM( const Mat& I1, const Mat& I2);
18 
19 double getPSNR_CUDA(const Mat& I1, const Mat& I2);  // Basic CUDA versions
20 Scalar getMSSIM_CUDA( const Mat& I1, const Mat& I2);
21 
22 //! [psnr]
23 struct BufferPSNR                                     // Optimized CUDA versions
24 {   // Data allocations are very expensive on CUDA. Use a buffer to solve: allocate once reuse later.
25     cuda::GpuMat gI1, gI2, gs, t1,t2;
26 
27     cuda::GpuMat buf;
28 };
29 //! [psnr]
30 double getPSNR_CUDA_optimized(const Mat& I1, const Mat& I2, BufferPSNR& b);
31 
32 //! [ssim]
33 struct BufferMSSIM                                     // Optimized CUDA versions
34 {   // Data allocations are very expensive on CUDA. Use a buffer to solve: allocate once reuse later.
35     cuda::GpuMat gI1, gI2, gs, t1,t2;
36 
37     cuda::GpuMat I1_2, I2_2, I1_I2;
38     vector<cuda::GpuMat> vI1, vI2;
39 
40     cuda::GpuMat mu1, mu2;
41     cuda::GpuMat mu1_2, mu2_2, mu1_mu2;
42 
43     cuda::GpuMat sigma1_2, sigma2_2, sigma12;
44     cuda::GpuMat t3;
45 
46     cuda::GpuMat ssim_map;
47 
48     cuda::GpuMat buf;
49 };
50 //! [ssim]
51 Scalar getMSSIM_CUDA_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b);
52 
help()53 static void help()
54 {
55     cout
56         << "\n--------------------------------------------------------------------------" << endl
57         << "This program shows how to port your CPU code to CUDA or write that from scratch." << endl
58         << "You can see the performance improvement for the similarity check methods (PSNR and SSIM)."  << endl
59         << "Usage:"                                                               << endl
60         << "./gpu-basics-similarity referenceImage comparedImage numberOfTimesToRunTest(like 10)." << endl
61         << "--------------------------------------------------------------------------"   << endl
62         << endl;
63 }
64 
main(int,char * argv[])65 int main(int, char *argv[])
66 {
67     help();
68     Mat I1 = imread(argv[1]);           // Read the two images
69     Mat I2 = imread(argv[2]);
70 
71     if (!I1.data || !I2.data)           // Check for success
72     {
73         cout << "Couldn't read the image";
74         return 0;
75     }
76 
77     BufferPSNR bufferPSNR;
78     BufferMSSIM bufferMSSIM;
79 
80     int TIMES = 10;
81     stringstream sstr(argv[3]);
82     sstr >> TIMES;
83     double time, result = 0;
84 
85     //------------------------------- PSNR CPU ----------------------------------------------------
86     time = (double)getTickCount();
87 
88     for (int i = 0; i < TIMES; ++i)
89         result = getPSNR(I1,I2);
90 
91     time = 1000*((double)getTickCount() - time)/getTickFrequency();
92     time /= TIMES;
93 
94     cout << "Time of PSNR CPU (averaged for " << TIMES << " runs): " << time << " milliseconds."
95         << " With result of: " << result << endl;
96 
97     //------------------------------- PSNR CUDA ----------------------------------------------------
98     time = (double)getTickCount();
99 
100     for (int i = 0; i < TIMES; ++i)
101         result = getPSNR_CUDA(I1,I2);
102 
103     time = 1000*((double)getTickCount() - time)/getTickFrequency();
104     time /= TIMES;
105 
106     cout << "Time of PSNR CUDA (averaged for " << TIMES << " runs): " << time << " milliseconds."
107         << " With result of: " <<  result << endl;
108 
109     //------------------------------- PSNR CUDA Optimized--------------------------------------------
110     time = (double)getTickCount();                                  // Initial call
111     result = getPSNR_CUDA_optimized(I1, I2, bufferPSNR);
112     time = 1000*((double)getTickCount() - time)/getTickFrequency();
113     cout << "Initial call CUDA optimized:              " << time  <<" milliseconds."
114         << " With result of: " << result << endl;
115 
116     time = (double)getTickCount();
117     for (int i = 0; i < TIMES; ++i)
118         result = getPSNR_CUDA_optimized(I1, I2, bufferPSNR);
119 
120     time = 1000*((double)getTickCount() - time)/getTickFrequency();
121     time /= TIMES;
122 
123     cout << "Time of PSNR CUDA OPTIMIZED ( / " << TIMES << " runs): " << time
124         << " milliseconds." << " With result of: " <<  result << endl << endl;
125 
126 
127     //------------------------------- SSIM CPU -----------------------------------------------------
128     Scalar x;
129     time = (double)getTickCount();
130 
131     for (int i = 0; i < TIMES; ++i)
132         x = getMSSIM(I1,I2);
133 
134     time = 1000*((double)getTickCount() - time)/getTickFrequency();
135     time /= TIMES;
136 
137     cout << "Time of MSSIM CPU (averaged for " << TIMES << " runs): " << time << " milliseconds."
138         << " With result of B" << x.val[0] << " G" << x.val[1] << " R" << x.val[2] << endl;
139 
140     //------------------------------- SSIM CUDA -----------------------------------------------------
141     time = (double)getTickCount();
142 
143     for (int i = 0; i < TIMES; ++i)
144         x = getMSSIM_CUDA(I1,I2);
145 
146     time = 1000*((double)getTickCount() - time)/getTickFrequency();
147     time /= TIMES;
148 
149     cout << "Time of MSSIM CUDA (averaged for " << TIMES << " runs): " << time << " milliseconds."
150         << " With result of B" << x.val[0] << " G" << x.val[1] << " R" << x.val[2] << endl;
151 
152     //------------------------------- SSIM CUDA Optimized--------------------------------------------
153     time = (double)getTickCount();
154     x = getMSSIM_CUDA_optimized(I1,I2, bufferMSSIM);
155     time = 1000*((double)getTickCount() - time)/getTickFrequency();
156     cout << "Time of MSSIM CUDA Initial Call            " << time << " milliseconds."
157         << " With result of B" << x.val[0] << " G" << x.val[1] << " R" << x.val[2] << endl;
158 
159     time = (double)getTickCount();
160 
161     for (int i = 0; i < TIMES; ++i)
162         x = getMSSIM_CUDA_optimized(I1,I2, bufferMSSIM);
163 
164     time = 1000*((double)getTickCount() - time)/getTickFrequency();
165     time /= TIMES;
166 
167     cout << "Time of MSSIM CUDA OPTIMIZED ( / " << TIMES << " runs): " << time << " milliseconds."
168         << " With result of B" << x.val[0] << " G" << x.val[1] << " R" << x.val[2] << endl << endl;
169     return 0;
170 }
171 
172 //! [getpsnr]
getPSNR(const Mat & I1,const Mat & I2)173 double getPSNR(const Mat& I1, const Mat& I2)
174 {
175     Mat s1;
176     absdiff(I1, I2, s1);       // |I1 - I2|
177     s1.convertTo(s1, CV_32F);  // cannot make a square on 8 bits
178     s1 = s1.mul(s1);           // |I1 - I2|^2
179 
180     Scalar s = sum(s1);         // sum elements per channel
181 
182     double sse = s.val[0] + s.val[1] + s.val[2]; // sum channels
183 
184     if( sse <= 1e-10) // for small values return zero
185         return 0;
186     else
187     {
188         double  mse =sse /(double)(I1.channels() * I1.total());
189         double psnr = 10.0*log10((255*255)/mse);
190         return psnr;
191     }
192 }
193 //! [getpsnr]
194 
195 //! [getpsnropt]
getPSNR_CUDA_optimized(const Mat & I1,const Mat & I2,BufferPSNR & b)196 double getPSNR_CUDA_optimized(const Mat& I1, const Mat& I2, BufferPSNR& b)
197 {
198     b.gI1.upload(I1);
199     b.gI2.upload(I2);
200 
201     b.gI1.convertTo(b.t1, CV_32F);
202     b.gI2.convertTo(b.t2, CV_32F);
203 
204     cuda::absdiff(b.t1.reshape(1), b.t2.reshape(1), b.gs);
205     cuda::multiply(b.gs, b.gs, b.gs);
206 
207     double sse = cuda::sum(b.gs, b.buf)[0];
208 
209     if( sse <= 1e-10) // for small values return zero
210         return 0;
211     else
212     {
213         double mse = sse /(double)(I1.channels() * I1.total());
214         double psnr = 10.0*log10((255*255)/mse);
215         return psnr;
216     }
217 }
218 //! [getpsnropt]
219 
220 //! [getpsnrcuda]
getPSNR_CUDA(const Mat & I1,const Mat & I2)221 double getPSNR_CUDA(const Mat& I1, const Mat& I2)
222 {
223     cuda::GpuMat gI1, gI2, gs, t1,t2;
224 
225     gI1.upload(I1);
226     gI2.upload(I2);
227 
228     gI1.convertTo(t1, CV_32F);
229     gI2.convertTo(t2, CV_32F);
230 
231     cuda::absdiff(t1.reshape(1), t2.reshape(1), gs);
232     cuda::multiply(gs, gs, gs);
233 
234     Scalar s = cuda::sum(gs);
235     double sse = s.val[0] + s.val[1] + s.val[2];
236 
237     if( sse <= 1e-10) // for small values return zero
238         return 0;
239     else
240     {
241         double  mse =sse /(double)(gI1.channels() * I1.total());
242         double psnr = 10.0*log10((255*255)/mse);
243         return psnr;
244     }
245 }
246 //! [getpsnrcuda]
247 
248 //! [getssim]
getMSSIM(const Mat & i1,const Mat & i2)249 Scalar getMSSIM( const Mat& i1, const Mat& i2)
250 {
251     const double C1 = 6.5025, C2 = 58.5225;
252     /***************************** INITS **********************************/
253     int d     = CV_32F;
254 
255     Mat I1, I2;
256     i1.convertTo(I1, d);           // cannot calculate on one byte large values
257     i2.convertTo(I2, d);
258 
259     Mat I2_2   = I2.mul(I2);        // I2^2
260     Mat I1_2   = I1.mul(I1);        // I1^2
261     Mat I1_I2  = I1.mul(I2);        // I1 * I2
262 
263     /*************************** END INITS **********************************/
264 
265     Mat mu1, mu2;   // PRELIMINARY COMPUTING
266     GaussianBlur(I1, mu1, Size(11, 11), 1.5);
267     GaussianBlur(I2, mu2, Size(11, 11), 1.5);
268 
269     Mat mu1_2   =   mu1.mul(mu1);
270     Mat mu2_2   =   mu2.mul(mu2);
271     Mat mu1_mu2 =   mu1.mul(mu2);
272 
273     Mat sigma1_2, sigma2_2, sigma12;
274 
275     GaussianBlur(I1_2, sigma1_2, Size(11, 11), 1.5);
276     sigma1_2 -= mu1_2;
277 
278     GaussianBlur(I2_2, sigma2_2, Size(11, 11), 1.5);
279     sigma2_2 -= mu2_2;
280 
281     GaussianBlur(I1_I2, sigma12, Size(11, 11), 1.5);
282     sigma12 -= mu1_mu2;
283 
284     ///////////////////////////////// FORMULA ////////////////////////////////
285     Mat t1, t2, t3;
286 
287     t1 = 2 * mu1_mu2 + C1;
288     t2 = 2 * sigma12 + C2;
289     t3 = t1.mul(t2);              // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
290 
291     t1 = mu1_2 + mu2_2 + C1;
292     t2 = sigma1_2 + sigma2_2 + C2;
293     t1 = t1.mul(t2);               // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
294 
295     Mat ssim_map;
296     divide(t3, t1, ssim_map);      // ssim_map =  t3./t1;
297 
298     Scalar mssim = mean( ssim_map ); // mssim = average of ssim map
299     return mssim;
300 }
301 //! [getssim]
302 
303 //! [getssimcuda]
getMSSIM_CUDA(const Mat & i1,const Mat & i2)304 Scalar getMSSIM_CUDA( const Mat& i1, const Mat& i2)
305 {
306     const float C1 = 6.5025f, C2 = 58.5225f;
307     /***************************** INITS **********************************/
308     cuda::GpuMat gI1, gI2, gs1, tmp1,tmp2;
309 
310     gI1.upload(i1);
311     gI2.upload(i2);
312 
313     gI1.convertTo(tmp1, CV_MAKE_TYPE(CV_32F, gI1.channels()));
314     gI2.convertTo(tmp2, CV_MAKE_TYPE(CV_32F, gI2.channels()));
315 
316     vector<cuda::GpuMat> vI1, vI2;
317     cuda::split(tmp1, vI1);
318     cuda::split(tmp2, vI2);
319     Scalar mssim;
320 
321     Ptr<cuda::Filter> gauss = cuda::createGaussianFilter(vI2[0].type(), -1, Size(11, 11), 1.5);
322 
323     for( int i = 0; i < gI1.channels(); ++i )
324     {
325         cuda::GpuMat I2_2, I1_2, I1_I2;
326 
327         cuda::multiply(vI2[i], vI2[i], I2_2);        // I2^2
328         cuda::multiply(vI1[i], vI1[i], I1_2);        // I1^2
329         cuda::multiply(vI1[i], vI2[i], I1_I2);       // I1 * I2
330 
331         /*************************** END INITS **********************************/
332         cuda::GpuMat mu1, mu2;   // PRELIMINARY COMPUTING
333         gauss->apply(vI1[i], mu1);
334         gauss->apply(vI2[i], mu2);
335 
336         cuda::GpuMat mu1_2, mu2_2, mu1_mu2;
337         cuda::multiply(mu1, mu1, mu1_2);
338         cuda::multiply(mu2, mu2, mu2_2);
339         cuda::multiply(mu1, mu2, mu1_mu2);
340 
341         cuda::GpuMat sigma1_2, sigma2_2, sigma12;
342 
343         gauss->apply(I1_2, sigma1_2);
344         cuda::subtract(sigma1_2, mu1_2, sigma1_2); // sigma1_2 -= mu1_2;
345 
346         gauss->apply(I2_2, sigma2_2);
347         cuda::subtract(sigma2_2, mu2_2, sigma2_2); // sigma2_2 -= mu2_2;
348 
349         gauss->apply(I1_I2, sigma12);
350         cuda::subtract(sigma12, mu1_mu2, sigma12); // sigma12 -= mu1_mu2;
351 
352         ///////////////////////////////// FORMULA ////////////////////////////////
353         cuda::GpuMat t1, t2, t3;
354 
355         mu1_mu2.convertTo(t1, -1, 2, C1); // t1 = 2 * mu1_mu2 + C1;
356         sigma12.convertTo(t2, -1, 2, C2); // t2 = 2 * sigma12 + C2;
357         cuda::multiply(t1, t2, t3);        // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
358 
359         cuda::addWeighted(mu1_2, 1.0, mu2_2, 1.0, C1, t1);       // t1 = mu1_2 + mu2_2 + C1;
360         cuda::addWeighted(sigma1_2, 1.0, sigma2_2, 1.0, C2, t2); // t2 = sigma1_2 + sigma2_2 + C2;
361         cuda::multiply(t1, t2, t1);                              // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
362 
363         cuda::GpuMat ssim_map;
364         cuda::divide(t3, t1, ssim_map);      // ssim_map =  t3./t1;
365 
366         Scalar s = cuda::sum(ssim_map);
367         mssim.val[i] = s.val[0] / (ssim_map.rows * ssim_map.cols);
368 
369     }
370     return mssim;
371 }
372 //! [getssimcuda]
373 
374 //! [getssimopt]
getMSSIM_CUDA_optimized(const Mat & i1,const Mat & i2,BufferMSSIM & b)375 Scalar getMSSIM_CUDA_optimized( const Mat& i1, const Mat& i2, BufferMSSIM& b)
376 {
377     const float C1 = 6.5025f, C2 = 58.5225f;
378     /***************************** INITS **********************************/
379 
380     b.gI1.upload(i1);
381     b.gI2.upload(i2);
382 
383     cuda::Stream stream;
384 
385     b.gI1.convertTo(b.t1, CV_32F, stream);
386     b.gI2.convertTo(b.t2, CV_32F, stream);
387 
388     cuda::split(b.t1, b.vI1, stream);
389     cuda::split(b.t2, b.vI2, stream);
390     Scalar mssim;
391 
392     Ptr<cuda::Filter> gauss = cuda::createGaussianFilter(b.vI1[0].type(), -1, Size(11, 11), 1.5);
393 
394     for( int i = 0; i < b.gI1.channels(); ++i )
395     {
396         cuda::multiply(b.vI2[i], b.vI2[i], b.I2_2, 1, -1, stream);        // I2^2
397         cuda::multiply(b.vI1[i], b.vI1[i], b.I1_2, 1, -1, stream);        // I1^2
398         cuda::multiply(b.vI1[i], b.vI2[i], b.I1_I2, 1, -1, stream);       // I1 * I2
399 
400         gauss->apply(b.vI1[i], b.mu1, stream);
401         gauss->apply(b.vI2[i], b.mu2, stream);
402 
403         cuda::multiply(b.mu1, b.mu1, b.mu1_2, 1, -1, stream);
404         cuda::multiply(b.mu2, b.mu2, b.mu2_2, 1, -1, stream);
405         cuda::multiply(b.mu1, b.mu2, b.mu1_mu2, 1, -1, stream);
406 
407         gauss->apply(b.I1_2, b.sigma1_2, stream);
408         cuda::subtract(b.sigma1_2, b.mu1_2, b.sigma1_2, cuda::GpuMat(), -1, stream);
409         //b.sigma1_2 -= b.mu1_2;  - This would result in an extra data transfer operation
410 
411         gauss->apply(b.I2_2, b.sigma2_2, stream);
412         cuda::subtract(b.sigma2_2, b.mu2_2, b.sigma2_2, cuda::GpuMat(), -1, stream);
413         //b.sigma2_2 -= b.mu2_2;
414 
415         gauss->apply(b.I1_I2, b.sigma12, stream);
416         cuda::subtract(b.sigma12, b.mu1_mu2, b.sigma12, cuda::GpuMat(), -1, stream);
417         //b.sigma12 -= b.mu1_mu2;
418 
419         //here too it would be an extra data transfer due to call of operator*(Scalar, Mat)
420         cuda::multiply(b.mu1_mu2, 2, b.t1, 1, -1, stream); //b.t1 = 2 * b.mu1_mu2 + C1;
421         cuda::add(b.t1, C1, b.t1, cuda::GpuMat(), -1, stream);
422         cuda::multiply(b.sigma12, 2, b.t2, 1, -1, stream); //b.t2 = 2 * b.sigma12 + C2;
423         cuda::add(b.t2, C2, b.t2, cuda::GpuMat(), -12, stream);
424 
425         cuda::multiply(b.t1, b.t2, b.t3, 1, -1, stream);     // t3 = ((2*mu1_mu2 + C1).*(2*sigma12 + C2))
426 
427         cuda::add(b.mu1_2, b.mu2_2, b.t1, cuda::GpuMat(), -1, stream);
428         cuda::add(b.t1, C1, b.t1, cuda::GpuMat(), -1, stream);
429 
430         cuda::add(b.sigma1_2, b.sigma2_2, b.t2, cuda::GpuMat(), -1, stream);
431         cuda::add(b.t2, C2, b.t2, cuda::GpuMat(), -1, stream);
432 
433 
434         cuda::multiply(b.t1, b.t2, b.t1, 1, -1, stream);     // t1 =((mu1_2 + mu2_2 + C1).*(sigma1_2 + sigma2_2 + C2))
435         cuda::divide(b.t3, b.t1, b.ssim_map, 1, -1, stream);      // ssim_map =  t3./t1;
436 
437         stream.waitForCompletion();
438 
439         Scalar s = cuda::sum(b.ssim_map, b.buf);
440         mssim.val[i] = s.val[0] / (b.ssim_map.rows * b.ssim_map.cols);
441 
442     }
443     return mssim;
444 }
445 //! [getssimopt]
446