1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/compare.h"
12
13#include <float.h>
14#include <math.h>
15#ifdef _OPENMP
16#include <omp.h>
17#endif
18
19#include "libyuv/basic_types.h"
20#include "libyuv/cpu_id.h"
21#include "libyuv/row.h"
22#include "libyuv/video_common.h"
23
24#ifdef __cplusplus
25namespace libyuv {
26extern "C" {
27#endif
28
29// hash seed of 5381 recommended.
30// Internal C version of HashDjb2 with int sized count for efficiency.
31uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
32
33// This module is for Visual C x86
34#if !defined(LIBYUV_DISABLE_X86) && \
35    (defined(_M_IX86) || \
36    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
37#define HAS_HASHDJB2_SSE41
38uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
39
40#ifdef VISUALC_HAS_AVX2
41#define HAS_HASHDJB2_AVX2
42uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
43#endif
44
45#endif  // HAS_HASHDJB2_SSE41
46
47// hash seed of 5381 recommended.
48LIBYUV_API
49uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
50  const int kBlockSize = 1 << 15;  // 32768;
51  int remainder;
52  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
53#if defined(HAS_HASHDJB2_SSE41)
54  if (TestCpuFlag(kCpuHasSSE41)) {
55    HashDjb2_SSE = HashDjb2_SSE41;
56  }
57#endif
58#if defined(HAS_HASHDJB2_AVX2)
59  if (TestCpuFlag(kCpuHasAVX2)) {
60    HashDjb2_SSE = HashDjb2_AVX2;
61  }
62#endif
63
64  while (count >= (uint64)(kBlockSize)) {
65    seed = HashDjb2_SSE(src, kBlockSize, seed);
66    src += kBlockSize;
67    count -= kBlockSize;
68  }
69  remainder = (int)(count) & ~15;
70  if (remainder) {
71    seed = HashDjb2_SSE(src, remainder, seed);
72    src += remainder;
73    count -= remainder;
74  }
75  remainder = (int)(count) & 15;
76  if (remainder) {
77    seed = HashDjb2_C(src, remainder, seed);
78  }
79  return seed;
80}
81
82static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
83  int x;
84  for (x = 0; x < width - 1; x += 2) {
85    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
86      return FOURCC_BGRA;
87    }
88    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
89      return FOURCC_ARGB;
90    }
91    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
92      return FOURCC_BGRA;
93    }
94    if (argb[7] != 255) {  // Second pixel 4th byte is not Alpha of 255.
95      return FOURCC_ARGB;
96    }
97    argb += 8;
98  }
99  if (width & 1) {
100    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
101      return FOURCC_BGRA;
102    }
103    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
104      return FOURCC_ARGB;
105    }
106  }
107  return 0;
108}
109
110// Scan an opaque argb image and return fourcc based on alpha offset.
111// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
112LIBYUV_API
113uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
114  uint32 fourcc = 0;
115  int h;
116
117  // Coalesce rows.
118  if (stride_argb == width * 4) {
119    width *= height;
120    height = 1;
121    stride_argb = 0;
122  }
123  for (h = 0; h < height && fourcc == 0; ++h) {
124    fourcc = ARGBDetectRow_C(argb, width);
125    argb += stride_argb;
126  }
127  return fourcc;
128}
129
130uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
131#if !defined(LIBYUV_DISABLE_NEON) && \
132    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
133#define HAS_SUMSQUAREERROR_NEON
134uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
135#endif
136#if !defined(LIBYUV_DISABLE_X86) && \
137    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
138#define HAS_SUMSQUAREERROR_SSE2
139uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
140#endif
141
142#ifdef VISUALC_HAS_AVX2
143#define HAS_SUMSQUAREERROR_AVX2
144uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
145#endif
146
147// TODO(fbarchard): Refactor into row function.
148LIBYUV_API
149uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
150                             int count) {
151  // SumSquareError returns values 0 to 65535 for each squared difference.
152  // Up to 65536 of those can be summed and remain within a uint32.
153  // After each block of 65536 pixels, accumulate into a uint64.
154  const int kBlockSize = 65536;
155  int remainder = count & (kBlockSize - 1) & ~31;
156  uint64 sse = 0;
157  int i;
158  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
159      SumSquareError_C;
160#if defined(HAS_SUMSQUAREERROR_NEON)
161  if (TestCpuFlag(kCpuHasNEON)) {
162    SumSquareError = SumSquareError_NEON;
163  }
164#endif
165#if defined(HAS_SUMSQUAREERROR_SSE2)
166  if (TestCpuFlag(kCpuHasSSE2)) {
167    // Note only used for multiples of 16 so count is not checked.
168    SumSquareError = SumSquareError_SSE2;
169  }
170#endif
171#if defined(HAS_SUMSQUAREERROR_AVX2)
172  if (TestCpuFlag(kCpuHasAVX2)) {
173    // Note only used for multiples of 32 so count is not checked.
174    SumSquareError = SumSquareError_AVX2;
175  }
176#endif
177#ifdef _OPENMP
178#pragma omp parallel for reduction(+: sse)
179#endif
180  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
181    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
182  }
183  src_a += count & ~(kBlockSize - 1);
184  src_b += count & ~(kBlockSize - 1);
185  if (remainder) {
186    sse += SumSquareError(src_a, src_b, remainder);
187    src_a += remainder;
188    src_b += remainder;
189  }
190  remainder = count & 31;
191  if (remainder) {
192    sse += SumSquareError_C(src_a, src_b, remainder);
193  }
194  return sse;
195}
196
197LIBYUV_API
198uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
199                                  const uint8* src_b, int stride_b,
200                                  int width, int height) {
201  uint64 sse = 0;
202  int h;
203  // Coalesce rows.
204  if (stride_a == width &&
205      stride_b == width) {
206    width *= height;
207    height = 1;
208    stride_a = stride_b = 0;
209  }
210  for (h = 0; h < height; ++h) {
211    sse += ComputeSumSquareError(src_a, src_b, width);
212    src_a += stride_a;
213    src_b += stride_b;
214  }
215  return sse;
216}
217
218LIBYUV_API
219double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
220  double psnr;
221  if (sse > 0) {
222    double mse = (double)(count) / (double)(sse);
223    psnr = 10.0 * log10(255.0 * 255.0 * mse);
224  } else {
225    psnr = kMaxPsnr;      // Limit to prevent divide by 0
226  }
227
228  if (psnr > kMaxPsnr)
229    psnr = kMaxPsnr;
230
231  return psnr;
232}
233
234LIBYUV_API
235double CalcFramePsnr(const uint8* src_a, int stride_a,
236                     const uint8* src_b, int stride_b,
237                     int width, int height) {
238  const uint64 samples = width * height;
239  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
240                                                src_b, stride_b,
241                                                width, height);
242  return SumSquareErrorToPsnr(sse, samples);
243}
244
245LIBYUV_API
246double I420Psnr(const uint8* src_y_a, int stride_y_a,
247                const uint8* src_u_a, int stride_u_a,
248                const uint8* src_v_a, int stride_v_a,
249                const uint8* src_y_b, int stride_y_b,
250                const uint8* src_u_b, int stride_u_b,
251                const uint8* src_v_b, int stride_v_b,
252                int width, int height) {
253  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
254                                                  src_y_b, stride_y_b,
255                                                  width, height);
256  const int width_uv = (width + 1) >> 1;
257  const int height_uv = (height + 1) >> 1;
258  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
259                                                  src_u_b, stride_u_b,
260                                                  width_uv, height_uv);
261  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
262                                                  src_v_b, stride_v_b,
263                                                  width_uv, height_uv);
264  const uint64 samples = width * height + 2 * (width_uv * height_uv);
265  const uint64 sse = sse_y + sse_u + sse_v;
266  return SumSquareErrorToPsnr(sse, samples);
267}
268
269static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
270static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
271
272static double Ssim8x8_C(const uint8* src_a, int stride_a,
273                        const uint8* src_b, int stride_b) {
274  int64 sum_a = 0;
275  int64 sum_b = 0;
276  int64 sum_sq_a = 0;
277  int64 sum_sq_b = 0;
278  int64 sum_axb = 0;
279
280  int i;
281  for (i = 0; i < 8; ++i) {
282    int j;
283    for (j = 0; j < 8; ++j) {
284      sum_a += src_a[j];
285      sum_b += src_b[j];
286      sum_sq_a += src_a[j] * src_a[j];
287      sum_sq_b += src_b[j] * src_b[j];
288      sum_axb += src_a[j] * src_b[j];
289    }
290
291    src_a += stride_a;
292    src_b += stride_b;
293  }
294
295  {
296    const int64 count = 64;
297    // scale the constants by number of pixels
298    const int64 c1 = (cc1 * count * count) >> 12;
299    const int64 c2 = (cc2 * count * count) >> 12;
300
301    const int64 sum_a_x_sum_b = sum_a * sum_b;
302
303    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
304                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
305
306    const int64 sum_a_sq = sum_a*sum_a;
307    const int64 sum_b_sq = sum_b*sum_b;
308
309    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
310                         (count * sum_sq_a - sum_a_sq +
311                          count * sum_sq_b - sum_b_sq + c2);
312
313    if (ssim_d == 0.0) {
314      return DBL_MAX;
315    }
316    return ssim_n * 1.0 / ssim_d;
317  }
318}
319
320// We are using a 8x8 moving window with starting location of each 8x8 window
321// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
322// block boundaries to penalize blocking artifacts.
323LIBYUV_API
324double CalcFrameSsim(const uint8* src_a, int stride_a,
325                     const uint8* src_b, int stride_b,
326                     int width, int height) {
327  int samples = 0;
328  double ssim_total = 0;
329  double (*Ssim8x8)(const uint8* src_a, int stride_a,
330                    const uint8* src_b, int stride_b) = Ssim8x8_C;
331
332  // sample point start with each 4x4 location
333  int i;
334  for (i = 0; i < height - 8; i += 4) {
335    int j;
336    for (j = 0; j < width - 8; j += 4) {
337      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
338      samples++;
339    }
340
341    src_a += stride_a * 4;
342    src_b += stride_b * 4;
343  }
344
345  ssim_total /= samples;
346  return ssim_total;
347}
348
349LIBYUV_API
350double I420Ssim(const uint8* src_y_a, int stride_y_a,
351                const uint8* src_u_a, int stride_u_a,
352                const uint8* src_v_a, int stride_v_a,
353                const uint8* src_y_b, int stride_y_b,
354                const uint8* src_u_b, int stride_u_b,
355                const uint8* src_v_b, int stride_v_b,
356                int width, int height) {
357  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
358                                      src_y_b, stride_y_b, width, height);
359  const int width_uv = (width + 1) >> 1;
360  const int height_uv = (height + 1) >> 1;
361  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
362                                      src_u_b, stride_u_b,
363                                      width_uv, height_uv);
364  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
365                                      src_v_b, stride_v_b,
366                                      width_uv, height_uv);
367  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
368}
369
370#ifdef __cplusplus
371}  // extern "C"
372}  // namespace libyuv
373#endif
374