1 /*
2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "../util/ssim.h"  // NOLINT
12 
13 #include <string.h>
14 
15 #ifdef __cplusplus
16 extern "C" {
17 #endif
18 
19 typedef unsigned int uint32;    // NOLINT
20 typedef unsigned short uint16;  // NOLINT
21 
22 #if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
23     (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
24 #define __SSE2__
25 #endif
26 #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
27 #include <emmintrin.h>
28 #endif
29 
30 #ifdef _OPENMP
31 #include <omp.h>
32 #endif
33 
34 // SSIM
35 enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
36 
37 // Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
38 // The maximum value (11 x 11) must be less than 128 to avoid sign
39 // problems during the calls to _mm_mullo_epi16().
40 static const int K[KERNEL_SIZE] = {
41     1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
42 };
43 static const double kiW[KERNEL + 1 + 1] = {
44     1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
45     1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
46     1. / 1056.,  // 1 / sum(i:0..5, j..6) K[i]*K[j]
47     1. / 957.,   // 1 / sum(i:0..4, j..6) K[i]*K[j]
48     1. / 726.,   // 1 / sum(i:0..3, j..6) K[i]*K[j]
49 };
50 
51 #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
52 
53 #define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
54 #define MAKE_WEIGHT(L)                                                \
55   {                                                                   \
56     {                                                                 \
57       {                                                               \
58         PWEIGHT(L, 0)                                                 \
59         , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \
60             PWEIGHT(L, 5), PWEIGHT(L, 6), 0                           \
61       }                                                               \
62     }                                                                 \
63   }
64 
65 // We need this union trick to be able to initialize constant static __m128i
66 // values. We can't call _mm_set_epi16() for static compile-time initialization.
67 static const struct {
68   union {
69     uint16 i16_[8];
70     __m128i m_;
71   } values_;
72 } W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
73   W3 = MAKE_WEIGHT(3);
74 // ... the rest is symmetric.
75 #undef MAKE_WEIGHT
76 #undef PWEIGHT
77 #endif
78 
79 // Common final expression for SSIM, once the weighted sums are known.
FinalizeSSIM(double iw,double xm,double ym,double xxm,double xym,double yym)80 static double FinalizeSSIM(double iw,
81                            double xm,
82                            double ym,
83                            double xxm,
84                            double xym,
85                            double yym) {
86   const double iwx = xm * iw;
87   const double iwy = ym * iw;
88   double sxx = xxm * iw - iwx * iwx;
89   double syy = yym * iw - iwy * iwy;
90   // small errors are possible, due to rounding. Clamp to zero.
91   if (sxx < 0.)
92     sxx = 0.;
93   if (syy < 0.)
94     syy = 0.;
95   const double sxsy = sqrt(sxx * syy);
96   const double sxy = xym * iw - iwx * iwy;
97   static const double C11 = (0.01 * 0.01) * (255 * 255);
98   static const double C22 = (0.03 * 0.03) * (255 * 255);
99   static const double C33 = (0.015 * 0.015) * (255 * 255);
100   const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
101   const double c = (2. * sxsy + C22) / (sxx + syy + C22);
102   const double s = (sxy + C33) / (sxsy + C33);
103   return l * c * s;
104 }
105 
106 // GetSSIM() does clipping.  GetSSIMFullKernel() does not
107 
108 // TODO(skal): use summed tables?
109 // Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
110 // with a diff of 255, squared. The maximum error is thus 0x4388241,
111 // which fits into 32 bits integers.
GetSSIM(const uint8 * org,const uint8 * rec,int xo,int yo,int W,int H,int stride)112 double GetSSIM(const uint8* org,
113                const uint8* rec,
114                int xo,
115                int yo,
116                int W,
117                int H,
118                int stride) {
119   uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
120   org += (yo - KERNEL) * stride;
121   org += (xo - KERNEL);
122   rec += (yo - KERNEL) * stride;
123   rec += (xo - KERNEL);
124   for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
125     if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H))
126       continue;
127     const int Wy = K[y_];
128     for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
129       const int Wxy = Wy * K[x_];
130       if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
131         const int org_x = org[x_];
132         const int rec_x = rec[x_];
133         ws += Wxy;
134         xm += Wxy * org_x;
135         ym += Wxy * rec_x;
136         xxm += Wxy * org_x * org_x;
137         xym += Wxy * org_x * rec_x;
138         yym += Wxy * rec_x * rec_x;
139       }
140     }
141   }
142   return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
143 }
144 
GetSSIMFullKernel(const uint8 * org,const uint8 * rec,int xo,int yo,int stride,double area_weight)145 double GetSSIMFullKernel(const uint8* org,
146                          const uint8* rec,
147                          int xo,
148                          int yo,
149                          int stride,
150                          double area_weight) {
151   uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
152 
153 #if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
154 
155   org += yo * stride + xo;
156   rec += yo * stride + xo;
157   for (int y = 1; y <= KERNEL; y++) {
158     const int dy1 = y * stride;
159     const int dy2 = y * stride;
160     const int Wy = K[KERNEL + y];
161 
162     for (int x = 1; x <= KERNEL; x++) {
163       // Compute the contributions of upper-left (ul), upper-right (ur)
164       // lower-left (ll) and lower-right (lr) points (see the diagram below).
165       // Symmetric Kernel will have same weight on those points.
166       //       -  -  -  -  -  -  -
167       //       -  ul -  -  -  ur -
168       //       -  -  -  -  -  -  -
169       //       -  -  -  0  -  -  -
170       //       -  -  -  -  -  -  -
171       //       -  ll -  -  -  lr -
172       //       -  -  -  -  -  -  -
173       const int Wxy = Wy * K[KERNEL + x];
174       const int ul1 = org[-dy1 - x];
175       const int ur1 = org[-dy1 + x];
176       const int ll1 = org[dy1 - x];
177       const int lr1 = org[dy1 + x];
178 
179       const int ul2 = rec[-dy2 - x];
180       const int ur2 = rec[-dy2 + x];
181       const int ll2 = rec[dy2 - x];
182       const int lr2 = rec[dy2 + x];
183 
184       xm += Wxy * (ul1 + ur1 + ll1 + lr1);
185       ym += Wxy * (ul2 + ur2 + ll2 + lr2);
186       xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
187       xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
188       yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
189     }
190 
191     // Compute the contributions of up (u), down (d), left (l) and right (r)
192     // points across the main axes (see the diagram below).
193     // Symmetric Kernel will have same weight on those points.
194     //       -  -  -  -  -  -  -
195     //       -  -  -  u  -  -  -
196     //       -  -  -  -  -  -  -
197     //       -  l  -  0  -  r  -
198     //       -  -  -  -  -  -  -
199     //       -  -  -  d  -  -  -
200     //       -  -  -  -  -  -  -
201     const int Wxy = Wy * K[KERNEL];
202     const int u1 = org[-dy1];
203     const int d1 = org[dy1];
204     const int l1 = org[-y];
205     const int r1 = org[y];
206 
207     const int u2 = rec[-dy2];
208     const int d2 = rec[dy2];
209     const int l2 = rec[-y];
210     const int r2 = rec[y];
211 
212     xm += Wxy * (u1 + d1 + l1 + r1);
213     ym += Wxy * (u2 + d2 + l2 + r2);
214     xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
215     xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
216     yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
217   }
218 
219   // Lastly the contribution of (x0, y0) point.
220   const int Wxy = K[KERNEL] * K[KERNEL];
221   const int s1 = org[0];
222   const int s2 = rec[0];
223 
224   xm += Wxy * s1;
225   ym += Wxy * s2;
226   xxm += Wxy * s1 * s1;
227   xym += Wxy * s1 * s2;
228   yym += Wxy * s2 * s2;
229 
230 #else  // __SSE2__
231 
232   org += (yo - KERNEL) * stride + (xo - KERNEL);
233   rec += (yo - KERNEL) * stride + (xo - KERNEL);
234 
235   const __m128i zero = _mm_setzero_si128();
236   __m128i x = zero;
237   __m128i y = zero;
238   __m128i xx = zero;
239   __m128i xy = zero;
240   __m128i yy = zero;
241 
242 // Read 8 pixels at line #L, and convert to 16bit, perform weighting
243 // and acccumulate.
244 #define LOAD_LINE_PAIR(L, WEIGHT)                                            \
245   do {                                                                       \
246     const __m128i v0 =                                                       \
247         _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
248     const __m128i v1 =                                                       \
249         _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
250     const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
251     const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
252     const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
253     const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
254     x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
255     y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
256     x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
257     y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
258     xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
259     xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
260     yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
261   } while (0)
262 
263 #define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
264   do {                                                      \
265     uint32 tmp[4];                                          \
266     _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
267     (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
268   } while (0)
269 
270   LOAD_LINE_PAIR(0, W0);
271   LOAD_LINE_PAIR(1, W1);
272   LOAD_LINE_PAIR(2, W2);
273   LOAD_LINE_PAIR(3, W3);
274   LOAD_LINE_PAIR(4, W2);
275   LOAD_LINE_PAIR(5, W1);
276   LOAD_LINE_PAIR(6, W0);
277 
278   ADD_AND_STORE_FOUR_EPI32(x, xm);
279   ADD_AND_STORE_FOUR_EPI32(y, ym);
280   ADD_AND_STORE_FOUR_EPI32(xx, xxm);
281   ADD_AND_STORE_FOUR_EPI32(xy, xym);
282   ADD_AND_STORE_FOUR_EPI32(yy, yym);
283 
284 #undef LOAD_LINE_PAIR
285 #undef ADD_AND_STORE_FOUR_EPI32
286 #endif
287 
288   return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
289 }
290 
start_max(int x,int y)291 static int start_max(int x, int y) {
292   return (x > y) ? x : y;
293 }
294 
CalcSSIM(const uint8 * org,const uint8 * rec,const int image_width,const int image_height)295 double CalcSSIM(const uint8* org,
296                 const uint8* rec,
297                 const int image_width,
298                 const int image_height) {
299   double SSIM = 0.;
300   const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
301   const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
302   const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
303   const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
304   const int stride = image_width;
305 
306   for (int j = 0; j < KERNEL_Y; ++j) {
307     for (int i = 0; i < image_width; ++i) {
308       SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
309     }
310   }
311 
312 #ifdef _OPENMP
313 #pragma omp parallel for reduction(+ : SSIM)
314 #endif
315   for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
316     for (int i = 0; i < KERNEL_X; ++i) {
317       SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
318     }
319     for (int i = KERNEL_X; i < start_x; ++i) {
320       SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
321     }
322     if (start_x < image_width) {
323       // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
324       // copy the 8 rightmost pixels on a cache area, and pad this area with
325       // zeros which won't contribute to the overall SSIM value (but we need
326       // to pass the correct normalizing constant!). By using this cache, we can
327       // still call GetSSIMFullKernel() instead of the slower GetSSIM().
328       // NOTE: we could use similar method for the left-most pixels too.
329       const int kScratchWidth = 8;
330       const int kScratchStride = kScratchWidth + KERNEL + 1;
331       uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0};
332       uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0};
333 
334       for (int k = 0; k < KERNEL_SIZE; ++k) {
335         const int offset =
336             (j - KERNEL + k) * stride + image_width - kScratchWidth;
337         memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
338         memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
339       }
340       for (int k = 0; k <= KERNEL_X + 1; ++k) {
341         SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
342                                   kScratchStride, kiW[k]);
343       }
344     }
345   }
346 
347   for (int j = start_y; j < image_height; ++j) {
348     for (int i = 0; i < image_width; ++i) {
349       SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
350     }
351   }
352   return SSIM;
353 }
354 
CalcLSSIM(double ssim)355 double CalcLSSIM(double ssim) {
356   return -10.0 * log10(1.0 - ssim);
357 }
358 
359 #ifdef __cplusplus
360 }  // extern "C"
361 #endif
362