1 /*
2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./psnr.h"  // NOLINT
12 
13 #ifdef _OPENMP
14 #include <omp.h>
15 #endif
16 #ifdef _MSC_VER
17 #include <intrin.h>  // For __cpuid()
18 #endif
19 
20 #ifdef __cplusplus
21 extern "C" {
22 #endif
23 
// Fixed-width unsigned integer aliases used by the routines in this file.
typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
// MSVC: use the compiler's built-in 64-bit integer type.
typedef unsigned __int64 uint64;
#else  // !_MSC_VER
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
// LP64 targets other than OpenBSD and Apple: use unsigned long.
typedef unsigned long uint64;  // NOLINT
#else   // !(defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__))
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER
34 
// libyuv provides ComputeSumSquareError when it is linked in for JPEG support,
// so the local implementation below is only compiled without HAVE_JPEG.
36 #if !defined(HAVE_JPEG)
37 
38 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
39     !defined(__aarch64__)
40 #define HAS_SUMSQUAREERROR_NEON
SumSquareError_NEON(const uint8 * src_a,const uint8 * src_b,int count)41 static uint32 SumSquareError_NEON(const uint8* src_a,
42                                   const uint8* src_b,
43                                   int count) {
44   volatile uint32 sse;
45   asm volatile(
46       "vmov.u8    q7, #0                         \n"
47       "vmov.u8    q9, #0                         \n"
48       "vmov.u8    q8, #0                         \n"
49       "vmov.u8    q10, #0                        \n"
50 
51       "1:                                        \n"
52       "vld1.u8    {q0}, [%0]!                    \n"
53       "vld1.u8    {q1}, [%1]!                    \n"
54       "vsubl.u8   q2, d0, d2                     \n"
55       "vsubl.u8   q3, d1, d3                     \n"
56       "vmlal.s16  q7, d4, d4                     \n"
57       "vmlal.s16  q8, d6, d6                     \n"
58       "vmlal.s16  q8, d5, d5                     \n"
59       "vmlal.s16  q10, d7, d7                    \n"
60       "subs       %2, %2, #16                    \n"
61       "bhi        1b                             \n"
62 
63       "vadd.u32   q7, q7, q8                     \n"
64       "vadd.u32   q9, q9, q10                    \n"
65       "vadd.u32   q10, q7, q9                    \n"
66       "vpaddl.u32 q1, q10                        \n"
67       "vadd.u64   d0, d2, d3                     \n"
68       "vmov.32    %3, d0[0]                      \n"
69       : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
70       :
71       : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
72   return sse;
73 }
74 #elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
75 #define HAS_SUMSQUAREERROR_NEON
SumSquareError_NEON(const uint8 * src_a,const uint8 * src_b,int count)76 static uint32 SumSquareError_NEON(const uint8* src_a,
77                                   const uint8* src_b,
78                                   int count) {
79   volatile uint32 sse;
80   asm volatile(
81       "eor        v16.16b, v16.16b, v16.16b      \n"
82       "eor        v18.16b, v18.16b, v18.16b      \n"
83       "eor        v17.16b, v17.16b, v17.16b      \n"
84       "eor        v19.16b, v19.16b, v19.16b      \n"
85 
86       "1:                                        \n"
87       "ld1        {v0.16b}, [%0], #16            \n"
88       "ld1        {v1.16b}, [%1], #16            \n"
89       "subs       %w2, %w2, #16                  \n"
90       "usubl      v2.8h, v0.8b, v1.8b            \n"
91       "usubl2     v3.8h, v0.16b, v1.16b          \n"
92       "smlal      v16.4s, v2.4h, v2.4h           \n"
93       "smlal      v17.4s, v3.4h, v3.4h           \n"
94       "smlal2     v18.4s, v2.8h, v2.8h           \n"
95       "smlal2     v19.4s, v3.8h, v3.8h           \n"
96       "b.gt       1b                             \n"
97 
98       "add        v16.4s, v16.4s, v17.4s         \n"
99       "add        v18.4s, v18.4s, v19.4s         \n"
100       "add        v19.4s, v16.4s, v18.4s         \n"
101       "addv       s0, v19.4s                     \n"
102       "fmov       %w3, s0                        \n"
103       : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
104       :
105       : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
106   return sse;
107 }
108 #elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
109 #define HAS_SUMSQUAREERROR_SSE2
SumSquareError_SSE2(const uint8 *,const uint8 *,int)110 __declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
111                                                     const uint8* /*src_b*/,
112                                                     int /*count*/) {
113   __asm {
114     mov        eax, [esp + 4]  // src_a
115     mov        edx, [esp + 8]  // src_b
116     mov        ecx, [esp + 12]  // count
117     pxor       xmm0, xmm0
118     pxor       xmm5, xmm5
119     sub        edx, eax
120 
121   wloop:
122     movdqu     xmm1, [eax]
123     movdqu     xmm2, [eax + edx]
124     lea        eax,  [eax + 16]
125     movdqu     xmm3, xmm1
126     psubusb    xmm1, xmm2
127     psubusb    xmm2, xmm3
128     por        xmm1, xmm2
129     movdqu     xmm2, xmm1
130     punpcklbw  xmm1, xmm5
131     punpckhbw  xmm2, xmm5
132     pmaddwd    xmm1, xmm1
133     pmaddwd    xmm2, xmm2
134     paddd      xmm0, xmm1
135     paddd      xmm0, xmm2
136     sub        ecx, 16
137     ja         wloop
138 
139     pshufd     xmm1, xmm0, 0EEh
140     paddd      xmm0, xmm1
141     pshufd     xmm1, xmm0, 01h
142     paddd      xmm0, xmm1
143     movd       eax, xmm0
144     ret
145   }
146 }
147 #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
148 #define HAS_SUMSQUAREERROR_SSE2
SumSquareError_SSE2(const uint8 * src_a,const uint8 * src_b,int count)149 static uint32 SumSquareError_SSE2(const uint8* src_a,
150                                   const uint8* src_b,
151                                   int count) {
152   uint32 sse;
153   asm volatile(  // NOLINT
154       "pxor      %%xmm0,%%xmm0                   \n"
155       "pxor      %%xmm5,%%xmm5                   \n"
156       "sub       %0,%1                           \n"
157 
158       "1:                                        \n"
159       "movdqu    (%0),%%xmm1                     \n"
160       "movdqu    (%0,%1,1),%%xmm2                \n"
161       "lea       0x10(%0),%0                     \n"
162       "movdqu    %%xmm1,%%xmm3                   \n"
163       "psubusb   %%xmm2,%%xmm1                   \n"
164       "psubusb   %%xmm3,%%xmm2                   \n"
165       "por       %%xmm2,%%xmm1                   \n"
166       "movdqu    %%xmm1,%%xmm2                   \n"
167       "punpcklbw %%xmm5,%%xmm1                   \n"
168       "punpckhbw %%xmm5,%%xmm2                   \n"
169       "pmaddwd   %%xmm1,%%xmm1                   \n"
170       "pmaddwd   %%xmm2,%%xmm2                   \n"
171       "paddd     %%xmm1,%%xmm0                   \n"
172       "paddd     %%xmm2,%%xmm0                   \n"
173       "sub       $0x10,%2                        \n"
174       "ja        1b                              \n"
175 
176       "pshufd    $0xee,%%xmm0,%%xmm1             \n"
177       "paddd     %%xmm1,%%xmm0                   \n"
178       "pshufd    $0x1,%%xmm0,%%xmm1              \n"
179       "paddd     %%xmm1,%%xmm0                   \n"
180       "movd      %%xmm0,%3                       \n"
181 
182       : "+r"(src_a),  // %0
183         "+r"(src_b),  // %1
184         "+r"(count),  // %2
185         "=g"(sse)     // %3
186       :
187       : "memory", "cc"
188 #if defined(__SSE2__)
189         ,
190         "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
191 #endif
192       );  // NOLINT
193   return sse;
194 }
195 #endif  // LIBYUV_DISABLE_X86 etc
196 
197 #if defined(HAS_SUMSQUAREERROR_SSE2)
198 #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
// Executes CPUID for leaf |info_type| and stores EAX, EBX, ECX and EDX in
// cpu_info[0..3].
//
// This variant is used for 32-bit PIC / Apple builds. EBX is saved in EDI
// before CPUID and swapped back afterwards, so EBX is left unchanged and the
// EBX result is delivered through EDI (presumably because EBX is reserved as
// the PIC base register on these targets).
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "mov %%ebx, %%edi                          \n"
      "cpuid                                     \n"
      "xchg %%edi, %%ebx                         \n"
      : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
208 // For gcc/clang but not clangcl.
209 #elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
// Executes CPUID for leaf |info_type| and stores EAX, EBX, ECX and EDX in
// cpu_info[0..3]. Used with gcc/clang where no EBX workaround is needed;
// MSVC-style compilers get __cpuid from <intrin.h> instead.
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "cpuid                                     \n"
      : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
217 #endif
218 
CpuHasSSE2()219 static int CpuHasSSE2() {
220 #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
221   int cpu_info[4];
222   __cpuid(cpu_info, 1);
223   if (cpu_info[3] & 0x04000000) {
224     return 1;
225   }
226 #endif
227   return 0;
228 }
229 #endif  // HAS_SUMSQUAREERROR_SSE2
230 
// Portable reference kernel: returns the sum, over the first |count| bytes, of
// (src_a[i] - src_b[i])^2. Returns 0 when |count| is zero or negative.
//
// The total is accumulated in a uint32 and wraps on overflow; one term is at
// most 255 * 255, so callers must keep |count| small enough for the sum to fit
// (ComputeSumSquareError passes at most 1 << 15 bytes per call).
static uint32 SumSquareError_C(const uint8* src_a,
                               const uint8* src_b,
                               int count) {
  uint32 sse = 0u;
  for (int x = 0; x < count; ++x) {
    // Bytes promote to int, so the difference is signed and cannot overflow.
    const int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}
241 
ComputeSumSquareError(const uint8 * src_a,const uint8 * src_b,int count)242 double ComputeSumSquareError(const uint8* src_a,
243                              const uint8* src_b,
244                              int count) {
245   uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
246       SumSquareError_C;
247 #if defined(HAS_SUMSQUAREERROR_NEON)
248   SumSquareError = SumSquareError_NEON;
249 #endif
250 #if defined(HAS_SUMSQUAREERROR_SSE2)
251   if (CpuHasSSE2()) {
252     SumSquareError = SumSquareError_SSE2;
253   }
254 #endif
255   const int kBlockSize = 1 << 15;
256   uint64 sse = 0;
257 #ifdef _OPENMP
258 #pragma omp parallel for reduction(+ : sse)
259 #endif
260   for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
261     sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
262   }
263   src_a += count & ~(kBlockSize - 1);
264   src_b += count & ~(kBlockSize - 1);
265   int remainder = count & (kBlockSize - 1) & ~15;
266   if (remainder) {
267     sse += SumSquareError(src_a, src_b, remainder);
268     src_a += remainder;
269     src_b += remainder;
270   }
271   remainder = count & 15;
272   if (remainder) {
273     sse += SumSquareError_C(src_a, src_b, remainder);
274   }
275   return static_cast<double>(sse);
276 }
277 #endif
278 
279 // PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
280 // Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
ComputePSNR(double sse,double size)281 double ComputePSNR(double sse, double size) {
282   const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
283   if (sse <= kMINSSE)
284     sse = kMINSSE;  // Produces max PSNR of 128
285   return 10.0 * log10(255.0 * 255.0 * size / sse);
286 }
287 
288 #ifdef __cplusplus
289 }  // extern "C"
290 #endif
291