1f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang/* 2f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * Copyright 2013 The LibYuv Project Authors. All rights reserved. 3f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * 4f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * Use of this source code is governed by a BSD-style license 5f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * that can be found in the LICENSE file in the root of the source 6f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * tree. An additional intellectual property rights grant can be found 7f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * in the file PATENTS. All contributing project authors may 8f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang * be found in the AUTHORS file in the root of the source tree. 9f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang */ 10f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 11f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include "../util/ssim.h" // NOLINT 12f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 13f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <string.h> 14f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 15f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus 16f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangextern "C" { 17f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 18f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 19b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardtypedef unsigned int uint32; // NOLINT 20b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardtypedef unsigned short uint16; // NOLINT 21f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 22f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \ 23b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) 24f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define __SSE2__ 25f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 26f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) 27f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <emmintrin.h> 28f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 29f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 30f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef _OPENMP 31f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <omp.h> 32f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 33f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 34f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// SSIM 35f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangenum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 }; 36f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 37f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i) 38f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// The maximum value (11 x 11) must be less than 128 to avoid sign 39f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// problems during the calls to _mm_mullo_epi16(). 40f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const int K[KERNEL_SIZE] = { 41b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) 42f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}; 43f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const double kiW[KERNEL + 1 + 1] = { 44b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] 45b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] 46b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] 47b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] 48b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] 49f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}; 50f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 51f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) 52f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 53b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product 54b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define MAKE_WEIGHT(L) \ 55b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard { \ 56b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard { \ 57b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard { \ 58b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard PWEIGHT(L, 0) \ 59b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \ 60b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard PWEIGHT(L, 5), PWEIGHT(L, 6), 0 \ 61b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard } \ 62b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard } \ 63b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard } 64f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 65f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// We need this union trick to be able to initialize constant static __m128i 66f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// values. We can't call _mm_set_epi16() for static compile-time initialization. 67f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const struct { 68f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang union { 69f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint16 i16_[8]; 70f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i m_; 71f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } values_; 72b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2), 73f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang W3 = MAKE_WEIGHT(3); 74b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard// ... the rest is symmetric. 75f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef MAKE_WEIGHT 76f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef PWEIGHT 77f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 78f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 79f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Common final expression for SSIM, once the weighted sums are known. 80b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic double FinalizeSSIM(double iw, 81b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard double xm, 82b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard double ym, 83b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard double xxm, 84b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard double xym, 85b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard double yym) { 86f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double iwx = xm * iw; 87f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double iwy = ym * iw; 88f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double sxx = xxm * iw - iwx * iwx; 89f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double syy = yym * iw - iwy * iwy; 90f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // small errors are possible, due to rounding. Clamp to zero. 91b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard if (sxx < 0.) 92b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard sxx = 0.; 93b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard if (syy < 0.) 94b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard syy = 0.; 95f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double sxsy = sqrt(sxx * syy); 96f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double sxy = xym * iw - iwx * iwy; 97f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang static const double C11 = (0.01 * 0.01) * (255 * 255); 98f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang static const double C22 = (0.03 * 0.03) * (255 * 255); 99f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang static const double C33 = (0.015 * 0.015) * (255 * 255); 100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); 101b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const double c = (2. * sxsy + C22) / (sxx + syy + C22); 102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const double s = (sxy + C33) / (sxsy + C33); 103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return l * c * s; 104f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 106f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// GetSSIM() does clipping. GetSSIMFullKernel() does not 107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 108f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// TODO(skal): use summed tables? 109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1) 110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// with a diff of 255, squared. The maximum error is thus 0x4388241, 111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// which fits into 32 bits integers. 112b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barcharddouble GetSSIM(const uint8* org, 113b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const uint8* rec, 114b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard int xo, 115b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard int yo, 116b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard int W, 117b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard int H, 118b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard int stride) { 119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; 120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang org += (yo - KERNEL) * stride; 121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang org += (xo - KERNEL); 122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang rec += (yo - KERNEL) * stride; 123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang rec += (xo - KERNEL); 124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) { 125b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) 126b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard continue; 127f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wy = K[y_]; 128f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) { 129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wxy = Wy * K[x_]; 130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) { 131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int org_x = org[x_]; 132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int rec_x = rec[x_]; 133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ws += Wxy; 134b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard xm += Wxy * org_x; 135b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard ym += Wxy * rec_x; 136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xxm += Wxy * org_x * org_x; 137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xym += Wxy * org_x * rec_x; 138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yym += Wxy * rec_x * rec_x; 139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 140f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 141f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 142f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym); 143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 145b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barcharddouble GetSSIMFullKernel(const uint8* org, 146b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const uint8* rec, 147b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard int xo, 148b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard int yo, 149b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard int stride, 150f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double area_weight) { 151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; 152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__) 154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang org += yo * stride + xo; 156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang rec += yo * stride + xo; 157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int y = 1; y <= KERNEL; y++) { 158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int dy1 = y * stride; 159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int dy2 = y * stride; 160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wy = K[KERNEL + y]; 161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int x = 1; x <= KERNEL; x++) { 163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Compute the contributions of upper-left (ul), upper-right (ur) 164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // lower-left (ll) and lower-right (lr) points (see the diagram below). 165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Symmetric Kernel will have same weight on those points. 166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - ul - - - ur - 168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 169f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - 0 - - - 170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - ll - - - lr - 172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wxy = Wy * K[KERNEL + x]; 174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ul1 = org[-dy1 - x]; 175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ur1 = org[-dy1 + x]; 176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ll1 = org[dy1 - x]; 177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int lr1 = org[dy1 + x]; 178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ul2 = rec[-dy2 - x]; 180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ur2 = rec[-dy2 + x]; 181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int ll2 = rec[dy2 - x]; 182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int lr2 = rec[dy2 + x]; 183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 184b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard xm += Wxy * (ul1 + ur1 + ll1 + lr1); 185b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard ym += Wxy * (ul2 + ur2 + ll2 + lr2); 186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1); 187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2); 188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2); 189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Compute the contributions of up (u), down (d), left (l) and right (r) 192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // points across the main axes (see the diagram below). 193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Symmetric Kernel will have same weight on those points. 194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - u - - - 196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - l - 0 - r - 198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - d - - - 200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // - - - - - - - 201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wxy = Wy * K[KERNEL]; 202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int u1 = org[-dy1]; 203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int d1 = org[dy1]; 204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int l1 = org[-y]; 205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int r1 = org[y]; 206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int u2 = rec[-dy2]; 208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int d2 = rec[dy2]; 209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int l2 = rec[-y]; 210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int r2 = rec[y]; 211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 212b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard xm += Wxy * (u1 + d1 + l1 + r1); 213b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard ym += Wxy * (u2 + d2 + l2 + r2); 214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1); 215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2); 216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2); 217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // Lastly the contribution of (x0, y0) point. 220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int Wxy = K[KERNEL] * K[KERNEL]; 221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int s1 = org[0]; 222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int s2 = rec[0]; 223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 224b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard xm += Wxy * s1; 225b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard ym += Wxy * s2; 226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xxm += Wxy * s1 * s1; 227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang xym += Wxy * s1 * s2; 228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang yym += Wxy * s2 * s2; 229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 230b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#else // __SSE2__ 231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang org += (yo - KERNEL) * stride + (xo - KERNEL); 233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang rec += (yo - KERNEL) * stride + (xo - KERNEL); 234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const __m128i zero = _mm_setzero_si128(); 236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i x = zero; 237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i y = zero; 238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i xx = zero; 239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i xy = zero; 240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang __m128i yy = zero; 241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 pixels at line #L, and convert to 16bit, perform weighting 243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// and acccumulate. 244b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define LOAD_LINE_PAIR(L, WEIGHT) \ 245b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard do { \ 246b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const __m128i v0 = \ 247b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \ 248b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const __m128i v1 = \ 249b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \ 250b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ 251b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ 252b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ 253b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ 254b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ 255b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ 256b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ 257b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ 258b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ 259b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ 260b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ 261b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard } while (0) 262b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard 263b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define ADD_AND_STORE_FOUR_EPI32(M, OUT) \ 264b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard do { \ 265b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard uint32 tmp[4]; \ 266b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ 267b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ 268b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard } while (0) 269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(0, W0); 271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(1, W1); 272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(2, W2); 273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(3, W3); 274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(4, W2); 275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(5, W1); 276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang LOAD_LINE_PAIR(6, W0); 277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(x, xm); 279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(y, ym); 280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(xx, xxm); 281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(xy, xym); 282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang ADD_AND_STORE_FOUR_EPI32(yy, yym); 283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 284f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef LOAD_LINE_PAIR 285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef ADD_AND_STORE_FOUR_EPI32 286f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 287f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 288f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym); 289f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 291b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic int start_max(int x, int y) { 292b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard return (x > y) ? x : y; 293b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard} 294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 295b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barcharddouble CalcSSIM(const uint8* org, 296b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const uint8* rec, 297b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const int image_width, 298b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard const int image_height) { 299f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang double SSIM = 0.; 300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL; 301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL; 302f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X); 303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y); 304f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int stride = image_width; 305f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 306f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int j = 0; j < KERNEL_Y; ++j) { 307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int i = 0; i < image_width; ++i) { 308f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 309f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 310f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 312f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef _OPENMP 313b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#pragma omp parallel for reduction(+ : SSIM) 314f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) { 316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int i = 0; i < KERNEL_X; ++i) { 317f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 319f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int i = KERNEL_X; i < start_x; ++i) { 320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]); 321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang if (start_x < image_width) { 323f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we 324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // copy the 8 rightmost pixels on a cache area, and pad this area with 325f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // zeros which won't contribute to the overall SSIM value (but we need 326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // to pass the correct normalizing constant!). By using this cache, we can 327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // still call GetSSIMFullKernel() instead of the slower GetSSIM(). 328f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang // NOTE: we could use similar method for the left-most pixels too. 329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int kScratchWidth = 8; 330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int kScratchStride = kScratchWidth + KERNEL + 1; 331b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0}; 332b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0}; 333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int k = 0; k < KERNEL_SIZE; ++k) { 335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang const int offset = 336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang (j - KERNEL + k) * stride + image_width - kScratchWidth; 337f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth); 338f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth); 339f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 340b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard for (int k = 0; k <= KERNEL_X + 1; ++k) { 341b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL, 342b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard kScratchStride, kiW[k]); 343f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 344f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 345f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 346f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 347f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int j = start_y; j < image_height; ++j) { 348f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang for (int i = 0; i < image_width; ++i) { 349f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); 350f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 351f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang } 352f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return SSIM; 353f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 354f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 355f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble CalcLSSIM(double ssim) { 356f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang return -10.0 * log10(1.0 - ssim); 357f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} 358f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang 359f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus 360f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} // extern "C" 361f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif 362