1/* 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/compare.h" 12 13#include <float.h> 14#include <math.h> 15#ifdef _OPENMP 16#include <omp.h> 17#endif 18 19#include "libyuv/basic_types.h" 20#include "libyuv/cpu_id.h" 21#include "libyuv/row.h" 22 23#ifdef __cplusplus 24namespace libyuv { 25extern "C" { 26#endif 27 28// hash seed of 5381 recommended. 29// Internal C version of HashDjb2 with int sized count for efficiency. 30uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); 31 32// This module is for Visual C x86 33#if !defined(LIBYUV_DISABLE_X86) && \ 34 (defined(_M_IX86) || \ 35 (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))) 36#define HAS_HASHDJB2_SSE41 37uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); 38 39#if _MSC_VER >= 1700 40#define HAS_HASHDJB2_AVX2 41uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); 42#endif 43 44#endif // HAS_HASHDJB2_SSE41 45 46// hash seed of 5381 recommended. 47LIBYUV_API 48uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { 49 const int kBlockSize = 1 << 15; // 32768; 50 int remainder; 51 uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; 52#if defined(HAS_HASHDJB2_SSE41) 53 if (TestCpuFlag(kCpuHasSSE41)) { 54 HashDjb2_SSE = HashDjb2_SSE41; 55 } 56#endif 57#if defined(HAS_HASHDJB2_AVX2) 58 if (TestCpuFlag(kCpuHasAVX2)) { 59 HashDjb2_SSE = HashDjb2_AVX2; 60 } 61#endif 62 63 while (count >= (uint64)(kBlockSize)) { 64 seed = HashDjb2_SSE(src, kBlockSize, seed); 65 src += kBlockSize; 66 count -= kBlockSize; 67 } 68 remainder = (int)(count) & ~15; 69 if (remainder) { 70 seed = HashDjb2_SSE(src, remainder, seed); 71 src += remainder; 72 count -= remainder; 73 } 74 remainder = (int)(count) & 15; 75 if (remainder) { 76 seed = HashDjb2_C(src, remainder, seed); 77 } 78 return seed; 79} 80 81uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); 82#if !defined(LIBYUV_DISABLE_NEON) && \ 83 (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) 84#define HAS_SUMSQUAREERROR_NEON 85uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); 86#endif 87#if !defined(LIBYUV_DISABLE_X86) && \ 88 (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) 89#define HAS_SUMSQUAREERROR_SSE2 90uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); 91#endif 92// Visual C 2012 required for AVX2. 93#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700 94#define HAS_SUMSQUAREERROR_AVX2 95uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); 96#endif 97 98// TODO(fbarchard): Refactor into row function. 99LIBYUV_API 100uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, 101 int count) { 102 // SumSquareError returns values 0 to 65535 for each squared difference. 103 // Up to 65536 of those can be summed and remain within a uint32. 104 // After each block of 65536 pixels, accumulate into a uint64. 105 const int kBlockSize = 65536; 106 int remainder = count & (kBlockSize - 1) & ~31; 107 uint64 sse = 0; 108 int i; 109 uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = 110 SumSquareError_C; 111#if defined(HAS_SUMSQUAREERROR_NEON) 112 if (TestCpuFlag(kCpuHasNEON)) { 113 SumSquareError = SumSquareError_NEON; 114 } 115#endif 116#if defined(HAS_SUMSQUAREERROR_SSE2) 117 if (TestCpuFlag(kCpuHasSSE2) && 118 IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { 119 // Note only used for multiples of 16 so count is not checked. 120 SumSquareError = SumSquareError_SSE2; 121 } 122#endif 123#if defined(HAS_SUMSQUAREERROR_AVX2) 124 if (TestCpuFlag(kCpuHasAVX2)) { 125 // Note only used for multiples of 32 so count is not checked. 126 SumSquareError = SumSquareError_AVX2; 127 } 128#endif 129#ifdef _OPENMP 130#pragma omp parallel for reduction(+: sse) 131#endif 132 for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { 133 sse += SumSquareError(src_a + i, src_b + i, kBlockSize); 134 } 135 src_a += count & ~(kBlockSize - 1); 136 src_b += count & ~(kBlockSize - 1); 137 if (remainder) { 138 sse += SumSquareError(src_a, src_b, remainder); 139 src_a += remainder; 140 src_b += remainder; 141 } 142 remainder = count & 31; 143 if (remainder) { 144 sse += SumSquareError_C(src_a, src_b, remainder); 145 } 146 return sse; 147} 148 149LIBYUV_API 150uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, 151 const uint8* src_b, int stride_b, 152 int width, int height) { 153 uint64 sse = 0; 154 int h; 155 // Coalesce rows. 156 if (stride_a == width && 157 stride_b == width) { 158 width *= height; 159 height = 1; 160 stride_a = stride_b = 0; 161 } 162 for (h = 0; h < height; ++h) { 163 sse += ComputeSumSquareError(src_a, src_b, width); 164 src_a += stride_a; 165 src_b += stride_b; 166 } 167 return sse; 168} 169 170LIBYUV_API 171double SumSquareErrorToPsnr(uint64 sse, uint64 count) { 172 double psnr; 173 if (sse > 0) { 174 double mse = (double)(count) / (double)(sse); 175 psnr = 10.0 * log10(255.0 * 255.0 * mse); 176 } else { 177 psnr = kMaxPsnr; // Limit to prevent divide by 0 178 } 179 180 if (psnr > kMaxPsnr) 181 psnr = kMaxPsnr; 182 183 return psnr; 184} 185 186LIBYUV_API 187double CalcFramePsnr(const uint8* src_a, int stride_a, 188 const uint8* src_b, int stride_b, 189 int width, int height) { 190 const uint64 samples = width * height; 191 const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, 192 src_b, stride_b, 193 width, height); 194 return SumSquareErrorToPsnr(sse, samples); 195} 196 197LIBYUV_API 198double I420Psnr(const uint8* src_y_a, int stride_y_a, 199 const uint8* src_u_a, int stride_u_a, 200 const uint8* src_v_a, int stride_v_a, 201 const uint8* src_y_b, int stride_y_b, 202 const uint8* src_u_b, int stride_u_b, 203 const uint8* src_v_b, int stride_v_b, 204 int width, int height) { 205 const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, 206 src_y_b, stride_y_b, 207 width, height); 208 const int width_uv = (width + 1) >> 1; 209 const int height_uv = (height + 1) >> 1; 210 const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, 211 src_u_b, stride_u_b, 212 width_uv, height_uv); 213 const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, 214 src_v_b, stride_v_b, 215 width_uv, height_uv); 216 const uint64 samples = width * height + 2 * (width_uv * height_uv); 217 const uint64 sse = sse_y + sse_u + sse_v; 218 return SumSquareErrorToPsnr(sse, samples); 219} 220 221static const int64 cc1 = 26634; // (64^2*(.01*255)^2 222static const int64 cc2 = 239708; // (64^2*(.03*255)^2 223 224static double Ssim8x8_C(const uint8* src_a, int stride_a, 225 const uint8* src_b, int stride_b) { 226 int64 sum_a = 0; 227 int64 sum_b = 0; 228 int64 sum_sq_a = 0; 229 int64 sum_sq_b = 0; 230 int64 sum_axb = 0; 231 232 int i; 233 for (i = 0; i < 8; ++i) { 234 int j; 235 for (j = 0; j < 8; ++j) { 236 sum_a += src_a[j]; 237 sum_b += src_b[j]; 238 sum_sq_a += src_a[j] * src_a[j]; 239 sum_sq_b += src_b[j] * src_b[j]; 240 sum_axb += src_a[j] * src_b[j]; 241 } 242 243 src_a += stride_a; 244 src_b += stride_b; 245 } 246 247 { 248 const int64 count = 64; 249 // scale the constants by number of pixels 250 const int64 c1 = (cc1 * count * count) >> 12; 251 const int64 c2 = (cc2 * count * count) >> 12; 252 253 const int64 sum_a_x_sum_b = sum_a * sum_b; 254 255 const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * 256 (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); 257 258 const int64 sum_a_sq = sum_a*sum_a; 259 const int64 sum_b_sq = sum_b*sum_b; 260 261 const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * 262 (count * sum_sq_a - sum_a_sq + 263 count * sum_sq_b - sum_b_sq + c2); 264 265 if (ssim_d == 0.0) { 266 return DBL_MAX; 267 } 268 return ssim_n * 1.0 / ssim_d; 269 } 270} 271 272// We are using a 8x8 moving window with starting location of each 8x8 window 273// on the 4x4 pixel grid. Such arrangement allows the windows to overlap 274// block boundaries to penalize blocking artifacts. 275LIBYUV_API 276double CalcFrameSsim(const uint8* src_a, int stride_a, 277 const uint8* src_b, int stride_b, 278 int width, int height) { 279 int samples = 0; 280 double ssim_total = 0; 281 double (*Ssim8x8)(const uint8* src_a, int stride_a, 282 const uint8* src_b, int stride_b) = Ssim8x8_C; 283 284 // sample point start with each 4x4 location 285 int i; 286 for (i = 0; i < height - 8; i += 4) { 287 int j; 288 for (j = 0; j < width - 8; j += 4) { 289 ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); 290 samples++; 291 } 292 293 src_a += stride_a * 4; 294 src_b += stride_b * 4; 295 } 296 297 ssim_total /= samples; 298 return ssim_total; 299} 300 301LIBYUV_API 302double I420Ssim(const uint8* src_y_a, int stride_y_a, 303 const uint8* src_u_a, int stride_u_a, 304 const uint8* src_v_a, int stride_v_a, 305 const uint8* src_y_b, int stride_y_b, 306 const uint8* src_u_b, int stride_u_b, 307 const uint8* src_v_b, int stride_v_b, 308 int width, int height) { 309 const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, 310 src_y_b, stride_y_b, width, height); 311 const int width_uv = (width + 1) >> 1; 312 const int height_uv = (height + 1) >> 1; 313 const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, 314 src_u_b, stride_u_b, 315 width_uv, height_uv); 316 const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, 317 src_v_b, stride_v_b, 318 width_uv, height_uv); 319 return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); 320} 321 322#ifdef __cplusplus 323} // extern "C" 324} // namespace libyuv 325#endif 326