/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./psnr.h"  // NOLINT

#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>  // For __cpuid()
#endif

#ifdef __cplusplus
extern "C" {
#endif

typedef unsigned int uint32;  // NOLINT
#ifdef _MSC_VER
typedef unsigned __int64 uint64;
#else  // _MSC_VER
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long uint64;  // NOLINT
#else   // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
typedef unsigned long long uint64;  // NOLINT
#endif  // __LP64__
#endif  // _MSC_VER

// libyuv provides this function when the library is linked with JPEG support.
#if !defined(HAVE_JPEG)

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
// Sums squared byte differences, 16 bytes per iteration. Callers pass a
// count that is a non-zero multiple of 16.
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  volatile uint32 sse;
  asm volatile(
      "vmov.u8    q7, #0                         \n"
      "vmov.u8    q9, #0                         \n"
      "vmov.u8    q8, #0                         \n"
      "vmov.u8    q10, #0                        \n"

      "1:                                        \n"
      "vld1.u8    {q0}, [%0]!                    \n"
      "vld1.u8    {q1}, [%1]!                    \n"
      "vsubl.u8   q2, d0, d2                     \n"
      "vsubl.u8   q3, d1, d3                     \n"
      "vmlal.s16  q7, d4, d4                     \n"
      "vmlal.s16  q8, d6, d6                     \n"
      "vmlal.s16  q8, d5, d5                     \n"
      "vmlal.s16  q10, d7, d7                    \n"
      "subs       %2, %2, #16                    \n"
      "bhi        1b                             \n"

      "vadd.u32   q7, q7, q8                     \n"
      "vadd.u32   q9, q9, q10                    \n"
      "vadd.u32   q10, q7, q9                    \n"
      "vpaddl.u32 q1, q10                        \n"
      "vadd.u64   d0, d2, d3                     \n"
      "vmov.32    %3, d0[0]                      \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
  return sse;
}
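
// For reference, an equivalent computation expressed with NEON intrinsics
// (squaring the absolute difference gives the same result as squaring the
// signed difference). Illustrative sketch only, not used or compiled here;
// the local variable names are placeholders and the scalar reduction at the
// end stands in for the vpaddl/vadd sequence above:
//   uint32x4_t acc = vdupq_n_u32(0);
//   for (int i = 0; i < count; i += 16) {
//     uint8x16_t a = vld1q_u8(src_a + i);
//     uint8x16_t b = vld1q_u8(src_b + i);
//     uint8x16_t d = vabdq_u8(a, b);  // |a - b| per byte.
//     uint16x8_t lo = vmull_u8(vget_low_u8(d), vget_low_u8(d));
//     uint16x8_t hi = vmull_u8(vget_high_u8(d), vget_high_u8(d));
//     acc = vpadalq_u16(acc, lo);  // Pairwise widen and accumulate.
//     acc = vpadalq_u16(acc, hi);
//   }
//   uint32 sse = vgetq_lane_u32(acc, 0) + vgetq_lane_u32(acc, 1) +
//                vgetq_lane_u32(acc, 2) + vgetq_lane_u32(acc, 3);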
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
// AArch64 variant of the same 16-bytes-per-iteration kernel.
static uint32 SumSquareError_NEON(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  volatile uint32 sse;
  asm volatile(
      "eor        v16.16b, v16.16b, v16.16b      \n"
      "eor        v18.16b, v18.16b, v18.16b      \n"
      "eor        v17.16b, v17.16b, v17.16b      \n"
      "eor        v19.16b, v19.16b, v19.16b      \n"

      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"
      "ld1        {v1.16b}, [%1], #16            \n"
      "subs       %w2, %w2, #16                  \n"
      "usubl      v2.8h, v0.8b, v1.8b            \n"
      "usubl2     v3.8h, v0.16b, v1.16b          \n"
      "smlal      v16.4s, v2.4h, v2.4h           \n"
      "smlal      v17.4s, v3.4h, v3.4h           \n"
      "smlal2     v18.4s, v2.8h, v2.8h           \n"
      "smlal2     v19.4s, v3.8h, v3.8h           \n"
      "b.gt       1b                             \n"

      "add        v16.4s, v16.4s, v17.4s         \n"
      "add        v18.4s, v18.4s, v19.4s         \n"
      "add        v19.4s, v16.4s, v18.4s         \n"
      "addv       s0, v19.4s                     \n"
      "fmov       %w3, s0                        \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
__declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
                                                    const uint8* /*src_b*/,
                                                    int /*count*/) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0
    pxor       xmm5, xmm5
    sub        edx, eax

  wloop:
    movdqu     xmm1, [eax]
    movdqu     xmm2, [eax + edx]
    lea        eax,  [eax + 16]
    movdqu     xmm3, xmm1
    psubusb    xmm1, xmm2  // |a - b| via two saturating subtracts
    psubusb    xmm2, xmm3  // (one direction saturates to zero)
    por        xmm1, xmm2  // or'd together.
    movdqu     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    ja         wloop

    pshufd     xmm1, xmm0, 0EEh
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 01h
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
// Same kernel as above, written as GCC/Clang inline assembly.
static uint32 SumSquareError_SSE2(const uint8* src_a,
                                  const uint8* src_b,
                                  int count) {
  uint32 sse;
  asm volatile(  // NOLINT
      "pxor      %%xmm0,%%xmm0                   \n"
      "pxor      %%xmm5,%%xmm5                   \n"
      "sub       %0,%1                           \n"

      "1:                                        \n"
      "movdqu    (%0),%%xmm1                     \n"
      "movdqu    (%0,%1,1),%%xmm2                \n"
      "lea       0x10(%0),%0                     \n"
      "movdqu    %%xmm1,%%xmm3                   \n"
      "psubusb   %%xmm2,%%xmm1                   \n"
      "psubusb   %%xmm3,%%xmm2                   \n"
      "por       %%xmm2,%%xmm1                   \n"
      "movdqu    %%xmm1,%%xmm2                   \n"
      "punpcklbw %%xmm5,%%xmm1                   \n"
      "punpckhbw %%xmm5,%%xmm2                   \n"
      "pmaddwd   %%xmm1,%%xmm1                   \n"
      "pmaddwd   %%xmm2,%%xmm2                   \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "paddd     %%xmm2,%%xmm0                   \n"
      "sub       $0x10,%2                        \n"
      "ja        1b                              \n"

      "pshufd    $0xee,%%xmm0,%%xmm1             \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "pshufd    $0x1,%%xmm0,%%xmm1              \n"
      "paddd     %%xmm1,%%xmm0                   \n"
      "movd      %%xmm0,%3                       \n"

      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=g"(sse)     // %3
      :
      : "memory", "cc"
#if defined(__SSE2__)
        ,
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
      );  // NOLINT
  return sse;
}
#endif  // LIBYUV_DISABLE_X86 etc

#if defined(HAS_SUMSQUAREERROR_SSE2)
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
// On 32-bit PIC builds ebx is reserved, so preserve it across cpuid.
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "mov %%ebx, %%edi                          \n"
      "cpuid                                     \n"
      "xchg %%edi, %%ebx                         \n"
      : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
// For gcc/clang but not clangcl.
#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__))
static __inline void __cpuid(int cpu_info[4], int info_type) {
  asm volatile(  // NOLINT
      "cpuid                                     \n"
      : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
        "=d"(cpu_info[3])
      : "a"(info_type));
}
#endif

static int CpuHasSSE2() {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
  int cpu_info[4];
  __cpuid(cpu_info, 1);
  if (cpu_info[3] & 0x04000000) {  // SSE2 is bit 26 of EDX from CPUID leaf 1.
    return 1;
  }
#endif
  return 0;
}
#endif  // HAS_SUMSQUAREERROR_SSE2

// Portable reference implementation; also used for the final 1-15 byte tail.
static uint32 SumSquareError_C(const uint8* src_a,
                               const uint8* src_b,
                               int count) {
  uint32 sse = 0u;
  for (int x = 0; x < count; ++x) {
    int diff = src_a[x] - src_b[x];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}

double ComputeSumSquareError(const uint8* src_a,
                             const uint8* src_b,
                             int count) {
  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
      SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  SumSquareError = SumSquareError_NEON;
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (CpuHasSSE2()) {
    SumSquareError = SumSquareError_SSE2;
  }
#endif
  // 32768-byte blocks keep each per-block uint32 sum below 2^32
  // (32768 * 255 * 255 < 2^32) and provide work units for OpenMP.
  const int kBlockSize = 1 << 15;
  uint64 sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  // Whole 16-byte groups remaining after the last full block.
  int remainder = count & (kBlockSize - 1) & ~15;
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  // Final 1-15 bytes are handled by the portable C version.
  remainder = count & 15;
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return static_cast<double>(sse);
}
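
// Worked example for ComputeSumSquareError() above (illustrative numbers):
// with count = 100000 and kBlockSize = 32768, the block loop covers 3 full
// blocks (98304 bytes), the first remainder pass covers the next 1696 bytes
// (already a multiple of 16), and the final count & 15 tail is 0 bytes, so
// SumSquareError_C is not called for the tail.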
#endif

// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size) {
  const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
  if (sse <= kMINSSE)
    sse = kMINSSE;  // Produces max PSNR of 128
  return 10.0 * log10(255.0 * 255.0 * size / sse);
}
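
// Usage sketch (illustrative; frame_a, frame_b, width and height are
// placeholder names, not part of this file):
//   double size = static_cast<double>(width) * height;
//   double sse = ComputeSumSquareError(frame_a, frame_b, width * height);
//   double psnr = ComputePSNR(sse, size);
// An average squared error of 1.0 per byte (sse == size) yields
// 10 * log10(255 * 255) ~= 48.13 dB; identical buffers yield kMaxPSNR (128.0).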

#ifdef __cplusplus
}  // extern "C"
#endif