1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/compare.h"
12
13#include <float.h>
14#include <math.h>
15#ifdef _OPENMP
16#include <omp.h>
17#endif
18
19#include "libyuv/basic_types.h"
20#include "libyuv/cpu_id.h"
21#include "libyuv/row.h"
22
23#ifdef __cplusplus
24namespace libyuv {
25extern "C" {
26#endif
27
28// hash seed of 5381 recommended.
29// Internal C version of HashDjb2 with int sized count for efficiency.
30uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
31
32// This module is for Visual C x86
33#if !defined(LIBYUV_DISABLE_X86) && \
34    (defined(_M_IX86) || \
35    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
36#define HAS_HASHDJB2_SSE41
37uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
38
39#if _MSC_VER >= 1700
40#define HAS_HASHDJB2_AVX2
41uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
42#endif
43
44#endif  // HAS_HASHDJB2_SSE41
45
46// hash seed of 5381 recommended.
47LIBYUV_API
48uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
49  const int kBlockSize = 1 << 15;  // 32768;
50  int remainder;
51  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
52#if defined(HAS_HASHDJB2_SSE41)
53  if (TestCpuFlag(kCpuHasSSE41)) {
54    HashDjb2_SSE = HashDjb2_SSE41;
55  }
56#endif
57#if defined(HAS_HASHDJB2_AVX2)
58  if (TestCpuFlag(kCpuHasAVX2)) {
59    HashDjb2_SSE = HashDjb2_AVX2;
60  }
61#endif
62
63  while (count >= (uint64)(kBlockSize)) {
64    seed = HashDjb2_SSE(src, kBlockSize, seed);
65    src += kBlockSize;
66    count -= kBlockSize;
67  }
68  remainder = (int)(count) & ~15;
69  if (remainder) {
70    seed = HashDjb2_SSE(src, remainder, seed);
71    src += remainder;
72    count -= remainder;
73  }
74  remainder = (int)(count) & 15;
75  if (remainder) {
76    seed = HashDjb2_C(src, remainder, seed);
77  }
78  return seed;
79}
80
81uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
82#if !defined(LIBYUV_DISABLE_NEON) && \
83    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
84#define HAS_SUMSQUAREERROR_NEON
85uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
86#endif
87#if !defined(LIBYUV_DISABLE_X86) && \
88    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
89#define HAS_SUMSQUAREERROR_SSE2
90uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
91#endif
92// Visual C 2012 required for AVX2.
93#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
94#define HAS_SUMSQUAREERROR_AVX2
95uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
96#endif
97
98// TODO(fbarchard): Refactor into row function.
99LIBYUV_API
100uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
101                             int count) {
102  // SumSquareError returns values 0 to 65535 for each squared difference.
103  // Up to 65536 of those can be summed and remain within a uint32.
104  // After each block of 65536 pixels, accumulate into a uint64.
105  const int kBlockSize = 65536;
106  int remainder = count & (kBlockSize - 1) & ~31;
107  uint64 sse = 0;
108  int i;
109  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
110      SumSquareError_C;
111#if defined(HAS_SUMSQUAREERROR_NEON)
112  if (TestCpuFlag(kCpuHasNEON)) {
113    SumSquareError = SumSquareError_NEON;
114  }
115#endif
116#if defined(HAS_SUMSQUAREERROR_SSE2)
117  if (TestCpuFlag(kCpuHasSSE2) &&
118      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
119    // Note only used for multiples of 16 so count is not checked.
120    SumSquareError = SumSquareError_SSE2;
121  }
122#endif
123#if defined(HAS_SUMSQUAREERROR_AVX2)
124  if (TestCpuFlag(kCpuHasAVX2)) {
125    // Note only used for multiples of 32 so count is not checked.
126    SumSquareError = SumSquareError_AVX2;
127  }
128#endif
129#ifdef _OPENMP
130#pragma omp parallel for reduction(+: sse)
131#endif
132  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
133    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
134  }
135  src_a += count & ~(kBlockSize - 1);
136  src_b += count & ~(kBlockSize - 1);
137  if (remainder) {
138    sse += SumSquareError(src_a, src_b, remainder);
139    src_a += remainder;
140    src_b += remainder;
141  }
142  remainder = count & 31;
143  if (remainder) {
144    sse += SumSquareError_C(src_a, src_b, remainder);
145  }
146  return sse;
147}
148
149LIBYUV_API
150uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
151                                  const uint8* src_b, int stride_b,
152                                  int width, int height) {
153  uint64 sse = 0;
154  int h;
155  // Coalesce rows.
156  if (stride_a == width &&
157      stride_b == width) {
158    width *= height;
159    height = 1;
160    stride_a = stride_b = 0;
161  }
162  for (h = 0; h < height; ++h) {
163    sse += ComputeSumSquareError(src_a, src_b, width);
164    src_a += stride_a;
165    src_b += stride_b;
166  }
167  return sse;
168}
169
170LIBYUV_API
171double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
172  double psnr;
173  if (sse > 0) {
174    double mse = (double)(count) / (double)(sse);
175    psnr = 10.0 * log10(255.0 * 255.0 * mse);
176  } else {
177    psnr = kMaxPsnr;      // Limit to prevent divide by 0
178  }
179
180  if (psnr > kMaxPsnr)
181    psnr = kMaxPsnr;
182
183  return psnr;
184}
185
186LIBYUV_API
187double CalcFramePsnr(const uint8* src_a, int stride_a,
188                     const uint8* src_b, int stride_b,
189                     int width, int height) {
190  const uint64 samples = width * height;
191  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
192                                                src_b, stride_b,
193                                                width, height);
194  return SumSquareErrorToPsnr(sse, samples);
195}
196
197LIBYUV_API
198double I420Psnr(const uint8* src_y_a, int stride_y_a,
199                const uint8* src_u_a, int stride_u_a,
200                const uint8* src_v_a, int stride_v_a,
201                const uint8* src_y_b, int stride_y_b,
202                const uint8* src_u_b, int stride_u_b,
203                const uint8* src_v_b, int stride_v_b,
204                int width, int height) {
205  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
206                                                  src_y_b, stride_y_b,
207                                                  width, height);
208  const int width_uv = (width + 1) >> 1;
209  const int height_uv = (height + 1) >> 1;
210  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
211                                                  src_u_b, stride_u_b,
212                                                  width_uv, height_uv);
213  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
214                                                  src_v_b, stride_v_b,
215                                                  width_uv, height_uv);
216  const uint64 samples = width * height + 2 * (width_uv * height_uv);
217  const uint64 sse = sse_y + sse_u + sse_v;
218  return SumSquareErrorToPsnr(sse, samples);
219}
220
221static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
222static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
223
224static double Ssim8x8_C(const uint8* src_a, int stride_a,
225                        const uint8* src_b, int stride_b) {
226  int64 sum_a = 0;
227  int64 sum_b = 0;
228  int64 sum_sq_a = 0;
229  int64 sum_sq_b = 0;
230  int64 sum_axb = 0;
231
232  int i;
233  for (i = 0; i < 8; ++i) {
234    int j;
235    for (j = 0; j < 8; ++j) {
236      sum_a += src_a[j];
237      sum_b += src_b[j];
238      sum_sq_a += src_a[j] * src_a[j];
239      sum_sq_b += src_b[j] * src_b[j];
240      sum_axb += src_a[j] * src_b[j];
241    }
242
243    src_a += stride_a;
244    src_b += stride_b;
245  }
246
247  {
248    const int64 count = 64;
249    // scale the constants by number of pixels
250    const int64 c1 = (cc1 * count * count) >> 12;
251    const int64 c2 = (cc2 * count * count) >> 12;
252
253    const int64 sum_a_x_sum_b = sum_a * sum_b;
254
255    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
256                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
257
258    const int64 sum_a_sq = sum_a*sum_a;
259    const int64 sum_b_sq = sum_b*sum_b;
260
261    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
262                         (count * sum_sq_a - sum_a_sq +
263                          count * sum_sq_b - sum_b_sq + c2);
264
265    if (ssim_d == 0.0) {
266      return DBL_MAX;
267    }
268    return ssim_n * 1.0 / ssim_d;
269  }
270}
271
272// We are using a 8x8 moving window with starting location of each 8x8 window
273// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
274// block boundaries to penalize blocking artifacts.
275LIBYUV_API
276double CalcFrameSsim(const uint8* src_a, int stride_a,
277                     const uint8* src_b, int stride_b,
278                     int width, int height) {
279  int samples = 0;
280  double ssim_total = 0;
281  double (*Ssim8x8)(const uint8* src_a, int stride_a,
282                    const uint8* src_b, int stride_b) = Ssim8x8_C;
283
284  // sample point start with each 4x4 location
285  int i;
286  for (i = 0; i < height - 8; i += 4) {
287    int j;
288    for (j = 0; j < width - 8; j += 4) {
289      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
290      samples++;
291    }
292
293    src_a += stride_a * 4;
294    src_b += stride_b * 4;
295  }
296
297  ssim_total /= samples;
298  return ssim_total;
299}
300
301LIBYUV_API
302double I420Ssim(const uint8* src_y_a, int stride_y_a,
303                const uint8* src_u_a, int stride_u_a,
304                const uint8* src_v_a, int stride_v_a,
305                const uint8* src_y_b, int stride_y_b,
306                const uint8* src_u_b, int stride_u_b,
307                const uint8* src_v_b, int stride_v_b,
308                int width, int height) {
309  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
310                                      src_y_b, stride_y_b, width, height);
311  const int width_uv = (width + 1) >> 1;
312  const int height_uv = (height + 1) >> 1;
313  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
314                                      src_u_b, stride_u_b,
315                                      width_uv, height_uv);
316  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
317                                      src_v_b, stride_v_b,
318                                      width_uv, height_uv);
319  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
320}
321
322#ifdef __cplusplus
323}  // extern "C"
324}  // namespace libyuv
325#endif
326