/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./psnr.h"  // NOLINT

#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>  // For __cpuid()
#endif

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else  // !_MSC_VER
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64;  // NOLINT
#else   // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER

// libyuv provides this function when the library is linked with jpeg support.
#if !defined(HAVE_JPEG)

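// Each SumSquareError_* variant returns the sum of squared byte differences
// over count bytes. The SIMD paths below consume 16 bytes per iteration, so
// they are only called with counts that are multiples of 16; leftover bytes
// are handled by the C fallback in ComputeSumSquareError.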
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  volatile uint32 sse;
  asm volatile(
      "vmov.u8    q7, #0                         \n"
      "vmov.u8    q9, #0                         \n"
      "vmov.u8    q8, #0                         \n"
      "vmov.u8    q10, #0                        \n"

      "1:                                        \n"
      "vld1.u8    {q0}, [%0]!                    \n"
      "vld1.u8    {q1}, [%1]!                    \n"
      "vsubl.u8   q2, d0, d2                     \n"
      "vsubl.u8   q3, d1, d3                     \n"
      "vmlal.s16  q7, d4, d4                     \n"
57        "vmlal.s16  q8, d6, d6                     \n"
58        "vmlal.s16  q8, d5, d5                     \n"
59        "vmlal.s16  q10, d7, d7                    \n"
60        "subs       %2, %2, #16                    \n"
61        "bhi        1b                             \n"
62  
63        "vadd.u32   q7, q7, q8                     \n"
64        "vadd.u32   q9, q9, q10                    \n"
65        "vadd.u32   q10, q7, q9                    \n"
66        "vpaddl.u32 q1, q10                        \n"
67        "vadd.u64   d0, d2, d3                     \n"
68        "vmov.32    %3, d0[0]                      \n"
69        : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
70        :
71        : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
72    return sse;
73  }
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  volatile uint32 sse;
  asm volatile(
      "eor        v16.16b, v16.16b, v16.16b      \n"
      "eor        v18.16b, v18.16b, v18.16b      \n"
      "eor        v17.16b, v17.16b, v17.16b      \n"
      "eor        v19.16b, v19.16b, v19.16b      \n"

      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"
      "ld1        {v1.16b}, [%1], #16            \n"
      "subs       %w2, %w2, #16                  \n"
      "usubl      v2.8h, v0.8b, v1.8b            \n"
      "usubl2     v3.8h, v0.16b, v1.16b          \n"
      "smlal      v16.4s, v2.4h, v2.4h           \n"
      "smlal      v17.4s, v3.4h, v3.4h           \n"
      "smlal2     v18.4s, v2.8h, v2.8h           \n"
      "smlal2     v19.4s, v3.8h, v3.8h           \n"
      "b.gt       1b                             \n"

      "add        v16.4s, v16.4s, v17.4s         \n"
      "add        v18.4s, v18.4s, v19.4s         \n"
      "add        v19.4s, v16.4s, v18.4s         \n"
      "addv       s0, v19.4s                     \n"
      "fmov       %w3, s0                        \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
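// Visual C x86: a naked function that reads its arguments directly off the
// stack. |a - b| is computed for unsigned bytes with two saturating
// subtracts OR'd together, then squared and pairwise-summed with pmaddwd.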
__declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
                                                    const uint8* /*src_b*/,
                                                    int /*count*/) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0
    pxor       xmm5, xmm5
    sub        edx, eax

  wloop:
    movdqu     xmm1, [eax]
    movdqu     xmm2, [eax + edx]
    lea        eax,  [eax + 16]
    movdqu     xmm3, xmm1
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    movdqu     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    ja         wloop

    pshufd     xmm1, xmm0, 0EEh
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 01h
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
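// GCC/clang inline-assembly version of the same abs-diff + pmaddwd loop.
// src_b is pre-subtracted from src_a so the loop can address both buffers
// through a single incremented pointer.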
static uint32 SumSquareError_SSE2(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  uint32 sse;
  asm volatile(  // NOLINT
      "pxor      %%xmm0,%%xmm0                   \n"
      "pxor      %%xmm5,%%xmm5                   \n"
      "sub       %0,%1                           \n"

      "1:                                        \n"
      "movdqu    (%0),%%xmm1                     \n"
      "movdqu    (%0,%1,1),%%xmm2                \n"
      "lea       0x10(%0),%0                     \n"
      "movdqu    %%xmm1,%%xmm3                   \n"
      "psubusb   %%xmm2,%%xmm1                   \n"
      "psubusb   %%xmm3,%%xmm2                   \n"
      "por       %%xmm2,%%xmm1                   \n"
      "movdqu    %%xmm1,%%xmm2                   \n"
      "punpcklbw %%xmm5,%%xmm1                   \n"
      "punpckhbw %%xmm5,%%xmm2                   \n"
      "pmaddwd   %%xmm1,%%xmm1                   \n"
      "pmaddwd   %%xmm2,%%xmm2                   \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "paddd     %%xmm2,%%xmm0                   \n"
      "sub       $0x10,%2                        \n"
      "ja        1b                              \n"

      "pshufd    $0xee,%%xmm0,%%xmm1             \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "pshufd    $0x1,%%xmm0,%%xmm1              \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "movd      %%xmm0,%3                       \n"

      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=g"(sse)     // %3
      :
      : "memory", "cc"
#if defined(__SSE2__)
        ,
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
      );  // NOLINT
  return sse;
}
#endif  // LIBYUV_DISABLE_X86 etc

#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
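// Under PIC on 32-bit x86, EBX holds the GOT pointer, so this wrapper
// preserves it across CPUID by parking it in EDI.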
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "mov %%ebx, %%edi                          \n"
      "cpuid                                     \n"
      "xchg %%edi, %%ebx                         \n"
      : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "cpuid                                     \n"
      : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
#endif

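// Returns 1 if CPUID leaf 1 reports SSE2 support (bit 26 of EDX).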
static int CpuHasSSE2() {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  if (cpu_info[3] & 0x04000000) {
    return 1;
  }
#endif
  return 0;
}
#endif  // HAS_SUMSQUAREERROR_SSE2

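// Plain C fallback; valid for any count, including the tail bytes that the
// 16-byte SIMD loops cannot consume.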
static uint32 SumSquareError_C(const uint8* src_a,
                               const uint8* src_b,
                               int count) {
  uint32 sse = 0u;
  for (int x = 0; x < count; ++x) {
    int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}

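// Sums squared errors over count bytes in 32 KB (1 << 15) blocks so the
// block loop can be parallelized with OpenMP, then handles the sub-block
// and sub-16-byte remainders separately.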
double ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
      SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
  const int kBlockSize = 1 << 15;
  uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  int remainder = count & (kBlockSize - 1) & ~15;
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  remainder = count & 15;
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return static_cast<double>(sse);
}
#endif  // !defined(HAVE_JPEG)

// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size) {
  const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
  if (sse <= kMINSSE)
    sse = kMINSSE;  // Produces max PSNR of 128
  return 10.0 * log10(255.0 * 255.0 * size / sse);
}
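// Example: two 640x480 frames (size = 307200) that differ by exactly 1 at
// every byte give sse = 307200, so psnr = 10 * log10(255^2) ~= 48.13 dB.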

#ifdef __cplusplus
}  // extern "C"
#endif