/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "../util/ssim.h"  // NOLINT

#include <math.h>  // For sqrt() and log10().
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32_t;    // NOLINT
typedef unsigned short uint16_t;  // NOLINT

#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
    (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#include <emmintrin.h>
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

// SSIM
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
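// The SSIM window is KERNEL_SIZE x KERNEL_SIZE = 7x7 pixels, centered on the
// pixel being evaluated.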

// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
    1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
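// The 1-D kernel sums to 1 + 3 + 7 + 11 + 7 + 3 + 1 = 33, so the full 2-D
// kernel weight is 33 * 33 = 1089. The kiW table below holds the inverse
// weights: 1/1089 for the full window, then the values for windows clipped
// by 1..3 columns at the right border (used by the scratch-buffer path in
// CalcSSIM()).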
static const double kiW[KERNEL + 1 + 1] = {
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1056.,  // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
    1. / 957.,   // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
    1. / 726.,   // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};

#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B) static_cast<uint16_t>(K[(A)] * K[(B)])  // weight product
#define MAKE_WEIGHT(L)                                                \
  {                                                                   \
    {                                                                 \
      {                                                               \
        PWEIGHT(L, 0)                                                 \
        , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \
            PWEIGHT(L, 5), PWEIGHT(L, 6), 0                           \
      }                                                               \
    }                                                                 \
  }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
  union {
    uint16_t i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif

// Common final expression for SSIM, once the weighted sums are known.
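// iw is the inverse of the total kernel weight for this window, so iwx/iwy
// are the weighted means and sxx/syy/sxy the weighted (co)variances. The
// result is the product of the luminance (l), contrast (c) and structure (s)
// terms.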
static double FinalizeSSIM(double iw,
                           double xm,
                           double ym,
                           double xxm,
                           double xym,
                           double yym) {
  const double iwx = xm * iw;
  const double iwy = ym * iw;
  double sxx = xxm * iw - iwx * iwx;
  double syy = yym * iw - iwy * iwy;
  // Small negative values are possible due to rounding; clamp to zero.
  if (sxx < 0.) {
    sxx = 0.;
  }
  if (syy < 0.) {
    syy = 0.;
  }
  const double sxsy = sqrt(sxx * syy);
  const double sxy = xym * iw - iwx * iwy;
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
  const double c = (2. * sxsy + C22) / (sxx + syy + C22);
  const double s = (sxy + C33) / (sxsy + C33);
  return l * c * s;
}

// GetSSIM() does clipping.  GetSSIMFullKernel() does not.

// TODO(skal): use summed tables?
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// with a diff of 255, squared. The maximum accumulated value is thus
// 0x4388241, which fits into a 32-bit integer.
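// (33 is the 1-D kernel sum; the 2-D weight sum is 33 * 33 = 1089, and
// 1089 * 255 * 255 = 0x4388241.)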
double GetSSIM(const uint8_t* org,
               const uint8_t* rec,
               int xo,
               int yo,
               int W,
               int H,
               int stride) {
  uint32_t ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
  org += (yo - KERNEL) * stride;
  org += (xo - KERNEL);
  rec += (yo - KERNEL) * stride;
  rec += (xo - KERNEL);
  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) {
      continue;
    }
    const int Wy = K[y_];
    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
      const int Wxy = Wy * K[x_];
      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
        const int org_x = org[x_];
        const int rec_x = rec[x_];
        ws += Wxy;
        xm += Wxy * org_x;
        ym += Wxy * rec_x;
        xxm += Wxy * org_x * org_x;
        xym += Wxy * org_x * rec_x;
        yym += Wxy * rec_x * rec_x;
      }
    }
  }
  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}

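// Fast path with no border clipping: the caller must ensure the full 7x7
// neighborhood around (xo, yo) is readable (the SSE2 version loads 8 pixels
// per row). The scalar version exploits the kernel symmetry by accumulating
// four mirrored samples per weight; area_weight is the inverse of the total
// kernel weight (usually kiW[0]).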
double GetSSIMFullKernel(const uint8_t* org,
                         const uint8_t* rec,
                         int xo,
                         int yo,
                         int stride,
                         double area_weight) {
  uint32_t xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of the upper-left (ul), upper-right (ur),
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // A symmetric kernel has the same weight at those four points.
      //       -  -  -  -  -  -  -
      //       -  ul -  -  -  ur -
      //       -  -  -  -  -  -  -
      //       -  -  -  0  -  -  -
      //       -  -  -  -  -  -  -
      //       -  ll -  -  -  lr -
      //       -  -  -  -  -  -  -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of the up (u), down (d), left (l) and
    // right (r) points along the main axes (see the diagram below).
    // A symmetric kernel has the same weight at those four points.
    //       -  -  -  -  -  -  -
    //       -  -  -  u  -  -  -
    //       -  -  -  -  -  -  -
    //       -  l  -  0  -  r  -
    //       -  -  -  -  -  -  -
    //       -  -  -  d  -  -  -
    //       -  -  -  -  -  -  -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly, the contribution of the center point (xo, yo) itself.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, convert them to 16 bit, apply the weights and
// accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
  do {                                                                       \
    const __m128i v0 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
    const __m128i v1 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
  } while (0)

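// Horizontal sum: add the four 32-bit lanes of M and store the result in OUT.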
#define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
  do {                                                      \
    uint32_t tmp[4];                                        \
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
  } while (0)

  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}

static int start_max(int x, int y) {
  return (x > y) ? x : y;
}

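// Returns the sum of per-pixel SSIM over the whole image (divide by
// image_width * image_height to get the mean SSIM). The top/bottom rows and
// the leftmost columns use the clipped GetSSIM() path; everything else uses
// the faster GetSSIMFullKernel(), with a zero-padded scratch buffer for the
// rightmost columns.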
double CalcSSIM(const uint8_t* org,
                const uint8_t* rec,
                const int image_width,
                const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
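  // The SSE2 full-kernel path reads 8 pixels per row, so columns at and past
  // start_x are handled via the scratch buffer below; rows at and past
  // start_y form the bottom border.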
  const int stride = image_width;

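  // Top border rows: the kernel is clipped against the image edge, so the
  // slower GetSSIM() path is used for every column.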
  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+ : SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels per row (in
      // SSE2). So we copy the 8 rightmost pixels into a scratch area and pad
      // that area with zeros, which do not contribute to the overall SSIM
      // value (but the correct normalizing constant must be passed!). Using
      // this scratch buffer we can still call GetSSIMFullKernel() instead of
      // the slower GetSSIM().
      // NOTE: a similar method could be used for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8_t scratch_org[KERNEL_SIZE * kScratchStride] = {0};
      uint8_t scratch_rec[KERNEL_SIZE * kScratchStride] = {0};

      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
                                  kScratchStride, kiW[k]);
      }
    }
  }

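  // Bottom border rows: clipped kernel again.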
  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}

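// Maps an (averaged) SSIM value onto a logarithmic, dB-like scale:
// -10 * log10(1 - ssim). Higher is better; the value grows without bound as
// ssim approaches 1.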
double CalcLSSIM(double ssim) {
  return -10.0 * log10(1.0 - ssim);
}
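//
// Usage sketch (not part of this file; assumes two same-sized 8-bit Y planes
// and that CalcSSIM() returns the per-pixel sum, so dividing by the pixel
// count yields the mean SSIM):
//
//   const double sum = CalcSSIM(org_y, rec_y, width, height);
//   const double mean_ssim = sum / (width * height);
//   const double lssim = CalcLSSIM(mean_ssim);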

#ifdef __cplusplus
}  // extern "C"
#endif