ssim.cc revision f047e7ca6983218eed7703c7afd51fed7bd3b5c9
1f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang/* 2f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * Copyright 2013 The LibYuv Project Authors. All rights reserved. 3f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * 4f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * Use of this source code is governed by a BSD-style license 5f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * that can be found in the LICENSE file in the root of the source 6f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * tree. An additional intellectual property rights grant can be found 7f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * in the file PATENTS. All contributing project authors may 8f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * be found in the AUTHORS file in the root of the source tree. 9f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang */ 10f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 11f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include "../util/ssim.h" // NOLINT 12f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 13f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <string.h> 14f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 15f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus 16f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangextern "C" { 17f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 18f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 19f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangtypedef unsigned int uint32; // NOLINT 20f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangtypedef unsigned short uint16; // NOLINT 21f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 22f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \ 23f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) 24f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define __SSE2__ 25f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 26f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) 27f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <emmintrin.h> 28f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 29f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 30f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef _OPENMP 31f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <omp.h> 32f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 33f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 34f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// SSIM 35f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangenum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 }; 36f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 37f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i) 38f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// The maximum value (11 x 11) must be less than 128 to avoid sign 39f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// problems during the calls to _mm_mullo_epi16(). 40f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const int K[KERNEL_SIZE] = { 41f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) 42f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}; 43f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const double kiW[KERNEL + 1 + 1] = { 44f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] 45f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] 46f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] 47f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] 48f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] 49f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}; 50f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 51f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) 52f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 53f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product 54f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define MAKE_WEIGHT(L) \ 55f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \ 56f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } } 57f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 58f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// We need this union trick to be able to initialize constant static __m128i 59f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// values. We can't call _mm_set_epi16() for static compile-time initialization. 60f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const struct { 61f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang union { 62f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint16 i16_[8]; 63f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i m_; 64f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } values_; 65f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} W0 = MAKE_WEIGHT(0), 66f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang W1 = MAKE_WEIGHT(1), 67f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang W2 = MAKE_WEIGHT(2), 68f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang W3 = MAKE_WEIGHT(3); 69f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // ... the rest is symmetric. 70f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef MAKE_WEIGHT 71f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef PWEIGHT 72f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 73f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 74f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Common final expression for SSIM, once the weighted sums are known. 75f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic double FinalizeSSIM(double iw, double xm, double ym, 76f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double xxm, double xym, double yym) { 77f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double iwx = xm * iw; 78f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double iwy = ym * iw; 79f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double sxx = xxm * iw - iwx * iwx; 80f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double syy = yym * iw - iwy * iwy; 81f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // small errors are possible, due to rounding. Clamp to zero. 82f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang if (sxx < 0.) sxx = 0.; 83f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang if (syy < 0.) syy = 0.; 84f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double sxsy = sqrt(sxx * syy); 85f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double sxy = xym * iw - iwx * iwy; 86f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang static const double C11 = (0.01 * 0.01) * (255 * 255); 87f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang static const double C22 = (0.03 * 0.03) * (255 * 255); 88f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang static const double C33 = (0.015 * 0.015) * (255 * 255); 89f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); 90f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double c = (2. * sxsy + C22) / (sxx + syy + C22); 91f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double s = (sxy + C33) / (sxsy + C33); 92f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return l * c * s; 93f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 94f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 95f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// GetSSIM() does clipping. GetSSIMFullKernel() does not 96f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 97f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// TODO(skal): use summed tables? 98f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1) 99f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// with a diff of 255, squared. The maximum error is thus 0x4388241, 100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// which fits into 32 bits integers. 101f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble GetSSIM(const uint8 *org, const uint8 *rec, 102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang int xo, int yo, int W, int H, int stride) { 103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; 104f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang org += (yo - KERNEL) * stride; 105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang org += (xo - KERNEL); 106f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang rec += (yo - KERNEL) * stride; 107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang rec += (xo - KERNEL); 108f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) { 109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue; 110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wy = K[y_]; 111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) { 112f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wxy = Wy * K[x_]; 113f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) { 114f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int org_x = org[x_]; 115f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int rec_x = rec[x_]; 116f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ws += Wxy; 117f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xm += Wxy * org_x; 118f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ym += Wxy * rec_x; 119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xxm += Wxy * org_x * org_x; 120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xym += Wxy * org_x * rec_x; 121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yym += Wxy * rec_x * rec_x; 122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 125f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym); 126f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 127f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 128f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble GetSSIMFullKernel(const uint8 *org, const uint8 *rec, 129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang int xo, int yo, int stride, 130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double area_weight) { 131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; 132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__) 134f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 135f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang org += yo * stride + xo; 136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang rec += yo * stride + xo; 137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int y = 1; y <= KERNEL; y++) { 138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int dy1 = y * stride; 139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int dy2 = y * stride; 140f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wy = K[KERNEL + y]; 141f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 142f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int x = 1; x <= KERNEL; x++) { 143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Compute the contributions of upper-left (ul), upper-right (ur) 144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // lower-left (ll) and lower-right (lr) points (see the diagram below). 145f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Symmetric Kernel will have same weight on those points. 146f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 147f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - ul - - - ur - 148f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 149f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - 0 - - - 150f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - ll - - - lr - 152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wxy = Wy * K[KERNEL + x]; 154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ul1 = org[-dy1 - x]; 155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ur1 = org[-dy1 + x]; 156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ll1 = org[dy1 - x]; 157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int lr1 = org[dy1 + x]; 158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ul2 = rec[-dy2 - x]; 160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ur2 = rec[-dy2 + x]; 161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ll2 = rec[dy2 - x]; 162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int lr2 = rec[dy2 + x]; 163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xm += Wxy * (ul1 + ur1 + ll1 + lr1); 165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ym += Wxy * (ul2 + ur2 + ll2 + lr2); 166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1); 167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2); 168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2); 169f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Compute the contributions of up (u), down (d), left (l) and right (r) 172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // points across the main axes (see the diagram below). 173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Symmetric Kernel will have same weight on those points. 174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - u - - - 176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - l - 0 - r - 178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - d - - - 180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wxy = Wy * K[KERNEL]; 182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int u1 = org[-dy1]; 183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int d1 = org[dy1]; 184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int l1 = org[-y]; 185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int r1 = org[y]; 186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int u2 = rec[-dy2]; 188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int d2 = rec[dy2]; 189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int l2 = rec[-y]; 190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int r2 = rec[y]; 191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xm += Wxy * (u1 + d1 + l1 + r1); 193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ym += Wxy * (u2 + d2 + l2 + r2); 194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1); 195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2); 196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2); 197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Lastly the contribution of (x0, y0) point. 200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wxy = K[KERNEL] * K[KERNEL]; 201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int s1 = org[0]; 202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int s2 = rec[0]; 203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xm += Wxy * s1; 205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ym += Wxy * s2; 206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xxm += Wxy * s1 * s1; 207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xym += Wxy * s1 * s2; 208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yym += Wxy * s2 * s2; 209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#else // __SSE2__ 211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang org += (yo - KERNEL) * stride + (xo - KERNEL); 213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang rec += (yo - KERNEL) * stride + (xo - KERNEL); 214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const __m128i zero = _mm_setzero_si128(); 216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i x = zero; 217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i y = zero; 218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i xx = zero; 219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i xy = zero; 220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i yy = zero; 221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 pixels at line #L, and convert to 16bit, perform weighting 223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// and acccumulate. 224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define LOAD_LINE_PAIR(L, WEIGHT) do { \ 225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const __m128i v0 = \ 226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \ 227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const __m128i v1 = \ 228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \ 229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ 230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ 231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ 232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ 233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ 234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ 235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ 236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ 237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ 238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ 239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ 240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} while (0) 241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \ 243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint32 tmp[4]; \ 244f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ 245f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ 246f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} while (0) 247f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 248f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(0, W0); 249f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(1, W1); 250f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(2, W2); 251f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(3, W3); 252f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(4, W2); 253f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(5, W1); 254f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(6, W0); 255f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 256f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(x, xm); 257f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(y, ym); 258f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(xx, xxm); 259f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(xy, xym); 260f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(yy, yym); 261f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 262f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef LOAD_LINE_PAIR 263f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef ADD_AND_STORE_FOUR_EPI32 264f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 265f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 266f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym); 267f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 268f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic int start_max(int x, int y) { return (x > y) ? x : y; } 270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble CalcSSIM(const uint8 *org, const uint8 *rec, 272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int image_width, const int image_height) { 273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double SSIM = 0.; 274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL; 275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL; 276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X); 277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y); 278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int stride = image_width; 279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int j = 0; j < KERNEL_Y; ++j) { 281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int i = 0; i < image_width; ++i) { 282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 284f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 286f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef _OPENMP 287f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang #pragma omp parallel for reduction(+: SSIM) 288f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 289f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) { 290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int i = 0; i < KERNEL_X; ++i) { 291f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 292f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 293f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int i = KERNEL_X; i < start_x; ++i) { 294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]); 295f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 296f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang if (start_x < image_width) { 297f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we 298f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // copy the 8 rightmost pixels on a cache area, and pad this area with 299f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // zeros which won't contribute to the overall SSIM value (but we need 300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // to pass the correct normalizing constant!). By using this cache, we can 301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // still call GetSSIMFullKernel() instead of the slower GetSSIM(). 302f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // NOTE: we could use similar method for the left-most pixels too. 303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int kScratchWidth = 8; 304f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int kScratchStride = kScratchWidth + KERNEL + 1; 305f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 }; 306f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 }; 307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 308f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int k = 0; k < KERNEL_SIZE; ++k) { 309f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int offset = 310f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang (j - KERNEL + k) * stride + image_width - kScratchWidth; 311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth); 312f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth); 313f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 314f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int k = 0; k <= KERNEL_X + 1; ++k) { 315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, 316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang KERNEL + k, KERNEL, kScratchStride, kiW[k]); 317f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 319f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int j = start_y; j < image_height; ++j) { 322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int i = 0; i < image_width; ++i) { 323f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 325f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return SSIM; 327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 328f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble CalcLSSIM(double ssim) { 330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return -10.0 * log10(1.0 - ssim); 331f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 332f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus 334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} // extern "C" 335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 337