/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "../util/ssim.h"  // NOLINT

#include <math.h>  // For sqrt() and log10().
#include <string.h>

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32_t;    // NOLINT
typedef unsigned short uint16_t;  // NOLINT

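// MSVC does not predefine __SSE2__, so infer SSE2 support from its
// architecture macros: _M_X64 implies SSE2, and _M_IX86_FP >= 2 means the
// code is built with /arch:SSE2 or higher.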
#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
    (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define __SSE2__
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
#include <emmintrin.h>
#endif

#ifdef _OPENMP
#include <omp.h>
#endif

// SSIM
enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };

// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
// The maximum value (11 x 11) must be less than 128 to avoid sign
// problems during the calls to _mm_mullo_epi16().
static const int K[KERNEL_SIZE] = {
    1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
};
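// The kernel sums to 1 + 3 + 7 + 11 + 7 + 3 + 1 = 33, so a full (unclipped)
// 7x7 window carries a total weight of 33 * 33 = 1089. The entries below are
// the reciprocals used to normalize full windows and windows with one or more
// columns falling outside the image.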
static const double kiW[KERNEL + 1 + 1] = {
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1089.,  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
    1. / 1056.,  // 1 / sum(i:0..5, j:0..6) K[i]*K[j]
    1. / 957.,   // 1 / sum(i:0..4, j:0..6) K[i]*K[j]
    1. / 726.,   // 1 / sum(i:0..3, j:0..6) K[i]*K[j]
};

#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)

#define PWEIGHT(A, B) static_cast<uint16_t>(K[(A)] * K[(B)])  // weight product
#define MAKE_WEIGHT(L)                                              \
  {                                                                 \
    {                                                               \
      {                                                             \
        PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0              \
      }                                                             \
    }                                                               \
  }

// We need this union trick to be able to initialize constant static __m128i
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
  union {
    uint16_t i16_[8];
    __m128i m_;
  } values_;
} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
  W3 = MAKE_WEIGHT(3);
// ... the rest is symmetric.
#undef MAKE_WEIGHT
#undef PWEIGHT
#endif

// Common final expression for SSIM, once the weighted sums are known.
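// Given the means iwx, iwy and the (co)variances sxx, syy, sxy computed below,
// this is the usual luminance * contrast * structure product:
//   l = (2*iwx*iwy + C11) / (iwx^2 + iwy^2 + C11)
//   c = (2*sqrt(sxx*syy) + C22) / (sxx + syy + C22)
//   s = (sxy + C33) / (sqrt(sxx*syy) + C33)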
static double FinalizeSSIM(double iw,
                           double xm,
                           double ym,
                           double xxm,
                           double xym,
                           double yym) {
  const double iwx = xm * iw;
  const double iwy = ym * iw;
  double sxx = xxm * iw - iwx * iwx;
  double syy = yym * iw - iwy * iwy;
  // Small errors are possible due to rounding. Clamp to zero.
  if (sxx < 0.) {
    sxx = 0.;
  }
  if (syy < 0.) {
    syy = 0.;
  }
  const double sxsy = sqrt(sxx * syy);
  const double sxy = xym * iw - iwx * iwy;
  static const double C11 = (0.01 * 0.01) * (255 * 255);
  static const double C22 = (0.03 * 0.03) * (255 * 255);
  static const double C33 = (0.015 * 0.015) * (255 * 255);
  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
  const double c = (2. * sxsy + C22) / (sxx + syy + C22);
  const double s = (sxy + C33) / (sxsy + C33);
  return l * c * s;
}

// GetSSIM() clips the kernel at the image borders; GetSSIMFullKernel() does
// not, and assumes the full 7x7 neighborhood is readable.

// TODO(skal): use summed tables?
// Note: the worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// with a diff of 255, squared. The maximum accumulated value is thus 0x4388241,
// which fits in a 32-bit integer.
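// Check: (33 * 255)^2 = 1089 * 65025 = 70812225 = 0x4388241, well below
// 2^32 = 4294967296.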
double GetSSIM(const uint8_t* org,
               const uint8_t* rec,
               int xo,
               int yo,
               int W,
               int H,
               int stride) {
  uint32_t ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
  org += (yo - KERNEL) * stride;
  org += (xo - KERNEL);
  rec += (yo - KERNEL) * stride;
  rec += (xo - KERNEL);
  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) {
      continue;
    }
    const int Wy = K[y_];
    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
      const int Wxy = Wy * K[x_];
      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
        const int org_x = org[x_];
        const int rec_x = rec[x_];
        ws += Wxy;
        xm += Wxy * org_x;
        ym += Wxy * rec_x;
        xxm += Wxy * org_x * org_x;
        xym += Wxy * org_x * rec_x;
        yym += Wxy * rec_x * rec_x;
      }
    }
  }
  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}

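// Weighted sums over the full, unclipped 7x7 window centered at (xo, yo).
// area_weight is the matching normalization constant: kiW[0] for an interior
// pixel, or one of the clipped-window reciprocals from kiW[] when part of the
// window is zero-padded.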
double GetSSIMFullKernel(const uint8_t* org,
                         const uint8_t* rec,
                         int xo,
                         int yo,
                         int stride,
                         double area_weight) {
  uint32_t xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;

#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)

  org += yo * stride + xo;
  rec += yo * stride + xo;
  for (int y = 1; y <= KERNEL; y++) {
    const int dy1 = y * stride;
    const int dy2 = y * stride;
    const int Wy = K[KERNEL + y];

    for (int x = 1; x <= KERNEL; x++) {
      // Compute the contributions of the upper-left (ul), upper-right (ur),
      // lower-left (ll) and lower-right (lr) points (see the diagram below).
      // A symmetric kernel has the same weight at those points.
      // -  -  -  -  -  -  -
      // -  ul -  -  -  ur -
      // -  -  -  -  -  -  -
      // -  -  -  0  -  -  -
      // -  -  -  -  -  -  -
      // -  ll -  -  -  lr -
      // -  -  -  -  -  -  -
      const int Wxy = Wy * K[KERNEL + x];
      const int ul1 = org[-dy1 - x];
      const int ur1 = org[-dy1 + x];
      const int ll1 = org[dy1 - x];
      const int lr1 = org[dy1 + x];

      const int ul2 = rec[-dy2 - x];
      const int ur2 = rec[-dy2 + x];
      const int ll2 = rec[dy2 - x];
      const int lr2 = rec[dy2 + x];

      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
    }

    // Compute the contributions of the up (u), down (d), left (l) and
    // right (r) points across the main axes (see the diagram below).
    // A symmetric kernel has the same weight at those points.
    // -  -  -  -  -  -  -
    // -  -  -  u  -  -  -
    // -  -  -  -  -  -  -
    // -  l  -  0  -  r  -
    // -  -  -  -  -  -  -
    // -  -  -  d  -  -  -
    // -  -  -  -  -  -  -
    const int Wxy = Wy * K[KERNEL];
    const int u1 = org[-dy1];
    const int d1 = org[dy1];
    const int l1 = org[-y];
    const int r1 = org[y];

    const int u2 = rec[-dy2];
    const int d2 = rec[dy2];
    const int l2 = rec[-y];
    const int r2 = rec[y];

    xm += Wxy * (u1 + d1 + l1 + r1);
    ym += Wxy * (u2 + d2 + l2 + r2);
    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
  }

  // Lastly, the contribution of the center point (xo, yo) itself.
  const int Wxy = K[KERNEL] * K[KERNEL];
  const int s1 = org[0];
  const int s2 = rec[0];

  xm += Wxy * s1;
  ym += Wxy * s2;
  xxm += Wxy * s1 * s1;
  xym += Wxy * s1 * s2;
  yym += Wxy * s2 * s2;

#else  // __SSE2__

  org += (yo - KERNEL) * stride + (xo - KERNEL);
  rec += (yo - KERNEL) * stride + (xo - KERNEL);

  const __m128i zero = _mm_setzero_si128();
  __m128i x = zero;
  __m128i y = zero;
  __m128i xx = zero;
  __m128i xy = zero;
  __m128i yy = zero;

// Read 8 pixels at line #L, convert them to 16 bit, apply the line's weights
// and accumulate.
#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
  do {                                                                       \
    const __m128i v0 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
    const __m128i v1 =                                                       \
        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
  } while (0)

#define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
  do {                                                      \
    uint32_t tmp[4];                                        \
    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
  } while (0)

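  // Apply the seven kernel rows; the Gaussian weights are symmetric about the
  // center row, so W0..W3 are reused in mirrored order (W3 is the center).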
  LOAD_LINE_PAIR(0, W0);
  LOAD_LINE_PAIR(1, W1);
  LOAD_LINE_PAIR(2, W2);
  LOAD_LINE_PAIR(3, W3);
  LOAD_LINE_PAIR(4, W2);
  LOAD_LINE_PAIR(5, W1);
  LOAD_LINE_PAIR(6, W0);

  ADD_AND_STORE_FOUR_EPI32(x, xm);
  ADD_AND_STORE_FOUR_EPI32(y, ym);
  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
  ADD_AND_STORE_FOUR_EPI32(xy, xym);
  ADD_AND_STORE_FOUR_EPI32(yy, yym);

#undef LOAD_LINE_PAIR
#undef ADD_AND_STORE_FOUR_EPI32
#endif

  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
}

static int start_max(int x, int y) {
  return (x > y) ? x : y;
}

double CalcSSIM(const uint8_t* org,
                const uint8_t* rec,
                const int image_width,
                const int image_height) {
  double SSIM = 0.;
  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
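  // start_x: first column from which the right border needs special handling,
  // since the SSE2 full kernel unconditionally loads 8 pixels per row.
  // start_y: first row of the clipped bottom border.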
  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
  const int stride = image_width;

  for (int j = 0; j < KERNEL_Y; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }

#ifdef _OPENMP
#pragma omp parallel for reduction(+ : SSIM)
#endif
  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
    for (int i = 0; i < KERNEL_X; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
    for (int i = KERNEL_X; i < start_x; ++i) {
      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
    }
    if (start_x < image_width) {
      // GetSSIMFullKernel() needs to be able to read 8 pixels per row (in
      // SSE2). So we copy the 8 rightmost pixels into a scratch area and pad
      // it with zeros, which won't contribute to the overall SSIM value (but
      // we need to pass the correct normalizing constant!). By using this
      // scratch buffer, we can still call GetSSIMFullKernel() instead of the
      // slower GetSSIM().
      // NOTE: we could use a similar method for the left-most pixels too.
      const int kScratchWidth = 8;
      const int kScratchStride = kScratchWidth + KERNEL + 1;
      uint8_t scratch_org[KERNEL_SIZE * kScratchStride] = {0};
      uint8_t scratch_rec[KERNEL_SIZE * kScratchStride] = {0};

      for (int k = 0; k < KERNEL_SIZE; ++k) {
        const int offset =
            (j - KERNEL + k) * stride + image_width - kScratchWidth;
        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
      }
      for (int k = 0; k <= KERNEL_X + 1; ++k) {
        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
                                  kScratchStride, kiW[k]);
      }
    }
  }

  for (int j = start_y; j < image_height; ++j) {
    for (int i = 0; i < image_width; ++i) {
      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
    }
  }
  return SSIM;
}

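// CalcSSIM() accumulates one score per pixel, so a typical caller divides the
// result by image_width * image_height to get a mean SSIM in [0, 1] before
// mapping it through CalcLSSIM(). E.g. a mean SSIM of 0.99 yields
// -10.0 * log10(1.0 - 0.99) = 20.0.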
double CalcLSSIM(double ssim) {
  return -10.0 * log10(1.0 - ssim);
}

#ifdef __cplusplus
}  // extern "C"
#endif