1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/compare.h"
12
13#include <float.h>
14#include <math.h>
15#ifdef _OPENMP
16#include <omp.h>
17#endif
18
19#include "libyuv/basic_types.h"
20#include "libyuv/compare_row.h"
21#include "libyuv/cpu_id.h"
22#include "libyuv/row.h"
23#include "libyuv/video_common.h"
24
25#ifdef __cplusplus
26namespace libyuv {
27extern "C" {
28#endif
29
30// hash seed of 5381 recommended.
31LIBYUV_API
32uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
33  const int kBlockSize = 1 << 15;  // 32768;
34  int remainder;
35  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
36      HashDjb2_C;
37#if defined(HAS_HASHDJB2_SSE41)
38  if (TestCpuFlag(kCpuHasSSE41)) {
39    HashDjb2_SSE = HashDjb2_SSE41;
40  }
41#endif
42#if defined(HAS_HASHDJB2_AVX2)
43  if (TestCpuFlag(kCpuHasAVX2)) {
44    HashDjb2_SSE = HashDjb2_AVX2;
45  }
46#endif
47
48  while (count >= (uint64)(kBlockSize)) {
49    seed = HashDjb2_SSE(src, kBlockSize, seed);
50    src += kBlockSize;
51    count -= kBlockSize;
52  }
53  remainder = (int)(count) & ~15;
54  if (remainder) {
55    seed = HashDjb2_SSE(src, remainder, seed);
56    src += remainder;
57    count -= remainder;
58  }
59  remainder = (int)(count) & 15;
60  if (remainder) {
61    seed = HashDjb2_C(src, remainder, seed);
62  }
63  return seed;
64}
65
66static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
67  int x;
68  for (x = 0; x < width - 1; x += 2) {
69    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
70      return FOURCC_BGRA;
71    }
72    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
73      return FOURCC_ARGB;
74    }
75    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
76      return FOURCC_BGRA;
77    }
78    if (argb[7] != 255) {  // Second pixel 4th byte is not Alpha of 255.
79      return FOURCC_ARGB;
80    }
81    argb += 8;
82  }
83  if (width & 1) {
84    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
85      return FOURCC_BGRA;
86    }
87    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
88      return FOURCC_ARGB;
89    }
90  }
91  return 0;
92}
93
94// Scan an opaque argb image and return fourcc based on alpha offset.
95// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
96LIBYUV_API
97uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
98  uint32 fourcc = 0;
99  int h;
100
101  // Coalesce rows.
102  if (stride_argb == width * 4) {
103    width *= height;
104    height = 1;
105    stride_argb = 0;
106  }
107  for (h = 0; h < height && fourcc == 0; ++h) {
108    fourcc = ARGBDetectRow_C(argb, width);
109    argb += stride_argb;
110  }
111  return fourcc;
112}
113
114// TODO(fbarchard): Refactor into row function.
115LIBYUV_API
116uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
117                             int count) {
118  // SumSquareError returns values 0 to 65535 for each squared difference.
119  // Up to 65536 of those can be summed and remain within a uint32.
120  // After each block of 65536 pixels, accumulate into a uint64.
121  const int kBlockSize = 65536;
122  int remainder = count & (kBlockSize - 1) & ~31;
123  uint64 sse = 0;
124  int i;
125  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
126      SumSquareError_C;
127#if defined(HAS_SUMSQUAREERROR_NEON)
128  if (TestCpuFlag(kCpuHasNEON)) {
129    SumSquareError = SumSquareError_NEON;
130  }
131#endif
132#if defined(HAS_SUMSQUAREERROR_SSE2)
133  if (TestCpuFlag(kCpuHasSSE2)) {
134    // Note only used for multiples of 16 so count is not checked.
135    SumSquareError = SumSquareError_SSE2;
136  }
137#endif
138#if defined(HAS_SUMSQUAREERROR_AVX2)
139  if (TestCpuFlag(kCpuHasAVX2)) {
140    // Note only used for multiples of 32 so count is not checked.
141    SumSquareError = SumSquareError_AVX2;
142  }
143#endif
144#ifdef _OPENMP
145#pragma omp parallel for reduction(+: sse)
146#endif
147  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
148    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
149  }
150  src_a += count & ~(kBlockSize - 1);
151  src_b += count & ~(kBlockSize - 1);
152  if (remainder) {
153    sse += SumSquareError(src_a, src_b, remainder);
154    src_a += remainder;
155    src_b += remainder;
156  }
157  remainder = count & 31;
158  if (remainder) {
159    sse += SumSquareError_C(src_a, src_b, remainder);
160  }
161  return sse;
162}
163
164LIBYUV_API
165uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
166                                  const uint8* src_b, int stride_b,
167                                  int width, int height) {
168  uint64 sse = 0;
169  int h;
170  // Coalesce rows.
171  if (stride_a == width &&
172      stride_b == width) {
173    width *= height;
174    height = 1;
175    stride_a = stride_b = 0;
176  }
177  for (h = 0; h < height; ++h) {
178    sse += ComputeSumSquareError(src_a, src_b, width);
179    src_a += stride_a;
180    src_b += stride_b;
181  }
182  return sse;
183}
184
185LIBYUV_API
186double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
187  double psnr;
188  if (sse > 0) {
189    double mse = (double)(count) / (double)(sse);
190    psnr = 10.0 * log10(255.0 * 255.0 * mse);
191  } else {
192    psnr = kMaxPsnr;      // Limit to prevent divide by 0
193  }
194
195  if (psnr > kMaxPsnr)
196    psnr = kMaxPsnr;
197
198  return psnr;
199}
200
201LIBYUV_API
202double CalcFramePsnr(const uint8* src_a, int stride_a,
203                     const uint8* src_b, int stride_b,
204                     int width, int height) {
205  const uint64 samples = width * height;
206  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
207                                                src_b, stride_b,
208                                                width, height);
209  return SumSquareErrorToPsnr(sse, samples);
210}
211
212LIBYUV_API
213double I420Psnr(const uint8* src_y_a, int stride_y_a,
214                const uint8* src_u_a, int stride_u_a,
215                const uint8* src_v_a, int stride_v_a,
216                const uint8* src_y_b, int stride_y_b,
217                const uint8* src_u_b, int stride_u_b,
218                const uint8* src_v_b, int stride_v_b,
219                int width, int height) {
220  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
221                                                  src_y_b, stride_y_b,
222                                                  width, height);
223  const int width_uv = (width + 1) >> 1;
224  const int height_uv = (height + 1) >> 1;
225  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
226                                                  src_u_b, stride_u_b,
227                                                  width_uv, height_uv);
228  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
229                                                  src_v_b, stride_v_b,
230                                                  width_uv, height_uv);
231  const uint64 samples = width * height + 2 * (width_uv * height_uv);
232  const uint64 sse = sse_y + sse_u + sse_v;
233  return SumSquareErrorToPsnr(sse, samples);
234}
235
236static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
237static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
238
239static double Ssim8x8_C(const uint8* src_a, int stride_a,
240                        const uint8* src_b, int stride_b) {
241  int64 sum_a = 0;
242  int64 sum_b = 0;
243  int64 sum_sq_a = 0;
244  int64 sum_sq_b = 0;
245  int64 sum_axb = 0;
246
247  int i;
248  for (i = 0; i < 8; ++i) {
249    int j;
250    for (j = 0; j < 8; ++j) {
251      sum_a += src_a[j];
252      sum_b += src_b[j];
253      sum_sq_a += src_a[j] * src_a[j];
254      sum_sq_b += src_b[j] * src_b[j];
255      sum_axb += src_a[j] * src_b[j];
256    }
257
258    src_a += stride_a;
259    src_b += stride_b;
260  }
261
262  {
263    const int64 count = 64;
264    // scale the constants by number of pixels
265    const int64 c1 = (cc1 * count * count) >> 12;
266    const int64 c2 = (cc2 * count * count) >> 12;
267
268    const int64 sum_a_x_sum_b = sum_a * sum_b;
269
270    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
271                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
272
273    const int64 sum_a_sq = sum_a*sum_a;
274    const int64 sum_b_sq = sum_b*sum_b;
275
276    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
277                         (count * sum_sq_a - sum_a_sq +
278                          count * sum_sq_b - sum_b_sq + c2);
279
280    if (ssim_d == 0.0) {
281      return DBL_MAX;
282    }
283    return ssim_n * 1.0 / ssim_d;
284  }
285}
286
287// We are using a 8x8 moving window with starting location of each 8x8 window
288// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
289// block boundaries to penalize blocking artifacts.
290LIBYUV_API
291double CalcFrameSsim(const uint8* src_a, int stride_a,
292                     const uint8* src_b, int stride_b,
293                     int width, int height) {
294  int samples = 0;
295  double ssim_total = 0;
296  double (*Ssim8x8)(const uint8* src_a, int stride_a,
297                    const uint8* src_b, int stride_b) = Ssim8x8_C;
298
299  // sample point start with each 4x4 location
300  int i;
301  for (i = 0; i < height - 8; i += 4) {
302    int j;
303    for (j = 0; j < width - 8; j += 4) {
304      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
305      samples++;
306    }
307
308    src_a += stride_a * 4;
309    src_b += stride_b * 4;
310  }
311
312  ssim_total /= samples;
313  return ssim_total;
314}
315
316LIBYUV_API
317double I420Ssim(const uint8* src_y_a, int stride_y_a,
318                const uint8* src_u_a, int stride_u_a,
319                const uint8* src_v_a, int stride_v_a,
320                const uint8* src_y_b, int stride_y_b,
321                const uint8* src_u_b, int stride_u_b,
322                const uint8* src_v_b, int stride_v_b,
323                int width, int height) {
324  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
325                                      src_y_b, stride_y_b, width, height);
326  const int width_uv = (width + 1) >> 1;
327  const int height_uv = (height + 1) >> 1;
328  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
329                                      src_u_b, stride_u_b,
330                                      width_uv, height_uv);
331  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
332                                      src_v_b, stride_v_b,
333                                      width_uv, height_uv);
334  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
335}
336
337#ifdef __cplusplus
338}  // extern "C"
339}  // namespace libyuv
340#endif
341