ssim.cc revision f047e7ca6983218eed7703c7afd51fed7bd3b5c9
1f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang/*
2f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *
4f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  Use of this source code is governed by a BSD-style license
5f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  that can be found in the LICENSE file in the root of the source
6f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  tree. An additional intellectual property rights grant can be found
7f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  in the file PATENTS. All contributing project authors may
8f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  be found in the AUTHORS file in the root of the source tree.
9f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang */
10f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
11f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include "../util/ssim.h"  // NOLINT
12f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
13f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <string.h>
14f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
15f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
16f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangextern "C" {
17f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
18f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
19f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangtypedef unsigned int uint32;     // NOLINT
20f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangtypedef unsigned short uint16;   // NOLINT
21f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
22f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
23f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
24f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define __SSE2__
25f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
26f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
27f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <emmintrin.h>
28f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
29f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
30f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef _OPENMP
31f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <omp.h>
32f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
33f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
34f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// SSIM
35f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangenum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
36f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
37f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
38f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// The maximum value (11 x 11) must be less than 128 to avoid sign
39f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// problems during the calls to _mm_mullo_epi16().
40f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const int K[KERNEL_SIZE] = {
41f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  1, 3, 7, 11, 7, 3, 1    // ~11 * exp(-0.3 * i * i)
42f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang};
43f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const double kiW[KERNEL + 1 + 1] = {
44f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  1. / 1089.,   // 1 / sum(i:0..6, j..6) K[i]*K[j]
45f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  1. / 1089.,   // 1 / sum(i:0..6, j..6) K[i]*K[j]
46f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  1. / 1056.,   // 1 / sum(i:0..5, j..6) K[i]*K[j]
47f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  1. / 957.,    // 1 / sum(i:0..4, j..6) K[i]*K[j]
48f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  1. / 726.,    // 1 / sum(i:0..3, j..6) K[i]*K[j]
49f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang};
50f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
51f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
52f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
53f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define PWEIGHT(A, B)  static_cast<uint16>(K[(A)] * K[(B)])   // weight product
54f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define MAKE_WEIGHT(L)                                               \
55f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3),  \
56f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }
57f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
58f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// We need this union trick to be able to initialize constant static __m128i
59f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// values. We can't call _mm_set_epi16() for static compile-time initialization.
60f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const struct {
61f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  union {
62f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    uint16 i16_[8];
63f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    __m128i m_;
64f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  } values_;
65f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} W0 = MAKE_WEIGHT(0),
66f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  W1 = MAKE_WEIGHT(1),
67f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  W2 = MAKE_WEIGHT(2),
68f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  W3 = MAKE_WEIGHT(3);
69f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // ... the rest is symmetric.
70f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef MAKE_WEIGHT
71f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef PWEIGHT
72f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
73f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
74f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Common final expression for SSIM, once the weighted sums are known.
75f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic double FinalizeSSIM(double iw, double xm, double ym,
76f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                           double xxm, double xym, double yym) {
77f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double iwx = xm * iw;
78f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double iwy = ym * iw;
79f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  double sxx = xxm * iw - iwx * iwx;
80f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  double syy = yym * iw - iwy * iwy;
81f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // small errors are possible, due to rounding. Clamp to zero.
82f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  if (sxx < 0.) sxx = 0.;
83f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  if (syy < 0.) syy = 0.;
84f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double sxsy = sqrt(sxx * syy);
85f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double sxy = xym * iw - iwx * iwy;
86f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  static const double C11 = (0.01 * 0.01) * (255 * 255);
87f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  static const double C22 = (0.03 * 0.03) * (255 * 255);
88f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  static const double C33 = (0.015 * 0.015) * (255 * 255);
89f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
90f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
91f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double s = (sxy + C33) / (sxsy + C33);
92f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return l * c * s;
93f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
94f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
95f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// GetSSIM() does clipping.  GetSSIMFullKernel() does not
96f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
97f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// TODO(skal): use summed tables?
98f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
99f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// with a diff of 255, squared. The maximum error is thus 0x4388241,
100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// which fits into 32 bits integers.
101f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble GetSSIM(const uint8 *org, const uint8 *rec,
102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang               int xo, int yo, int W, int H, int stride) {
103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
104f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  org += (yo - KERNEL) * stride;
105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  org += (xo - KERNEL);
106f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  rec += (yo - KERNEL) * stride;
107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  rec += (xo - KERNEL);
108f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int Wy = K[y_];
111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
112f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int Wxy = Wy * K[x_];
113f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
114f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        const int org_x = org[x_];
115f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        const int rec_x = rec[x_];
116f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        ws += Wxy;
117f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        xm  += Wxy * org_x;
118f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        ym  += Wxy * rec_x;
119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        xxm += Wxy * org_x * org_x;
120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        xym += Wxy * org_x * rec_x;
121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        yym += Wxy * rec_x * rec_x;
122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      }
123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
125f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
126f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
127f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
128f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                         int xo, int yo, int stride,
130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                         double area_weight) {
131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
134f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
135f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  org += yo * stride + xo;
136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  rec += yo * stride + xo;
137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int y = 1; y <= KERNEL; y++) {
138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int dy1 = y * stride;
139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int dy2 = y * stride;
140f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int Wy = K[KERNEL + y];
141f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
142f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int x = 1; x <= KERNEL; x++) {
143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // Compute the contributions of upper-left (ul), upper-right (ur)
144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // lower-left (ll) and lower-right (lr) points (see the diagram below).
145f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // Symmetric Kernel will have same weight on those points.
146f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  -  -  -  -
147f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  ul -  -  -  ur -
148f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  -  -  -  -
149f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  0  -  -  -
150f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  -  -  -  -
151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  ll -  -  -  lr -
152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  -  -  -  -
153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int Wxy = Wy * K[KERNEL + x];
154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ul1 = org[-dy1 - x];
155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ur1 = org[-dy1 + x];
156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ll1 = org[dy1 - x];
157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int lr1 = org[dy1 + x];
158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ul2 = rec[-dy2 - x];
160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ur2 = rec[-dy2 + x];
161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ll2 = rec[dy2 - x];
162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int lr2 = rec[dy2 + x];
163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      xm  += Wxy * (ul1 + ur1 + ll1 + lr1);
165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      ym  += Wxy * (ul2 + ur2 + ll2 + lr2);
166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
169f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Compute the contributions of up (u), down (d), left (l) and right (r)
172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // points across the main axes (see the diagram below).
173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Symmetric Kernel will have same weight on those points.
174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  -  -  -  -
175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  u  -  -  -
176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  -  -  -  -
177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  l  -  0  -  r  -
178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  -  -  -  -
179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  d  -  -  -
180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  -  -  -  -
181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int Wxy = Wy * K[KERNEL];
182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int u1 = org[-dy1];
183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int d1 = org[dy1];
184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int l1 = org[-y];
185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int r1 = org[y];
186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int u2 = rec[-dy2];
188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int d2 = rec[dy2];
189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int l2 = rec[-y];
190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int r2 = rec[y];
191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    xm  += Wxy * (u1 + d1 + l1 + r1);
193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    ym  += Wxy * (u2 + d2 + l2 + r2);
194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // Lastly the contribution of (x0, y0) point.
200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int Wxy = K[KERNEL] * K[KERNEL];
201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int s1 = org[0];
202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int s2 = rec[0];
203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  xm  += Wxy * s1;
205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ym  += Wxy * s2;
206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  xxm += Wxy * s1 * s1;
207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  xym += Wxy * s1 * s2;
208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  yym += Wxy * s2 * s2;
209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#else   // __SSE2__
211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  org += (yo - KERNEL) * stride + (xo - KERNEL);
213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  rec += (yo - KERNEL) * stride + (xo - KERNEL);
214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const __m128i zero = _mm_setzero_si128();
216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i x = zero;
217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i y = zero;
218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i xx = zero;
219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i xy = zero;
220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i yy = zero;
221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 pixels at line #L, and convert to 16bit, perform weighting
223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// and acccumulate.
224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define LOAD_LINE_PAIR(L, WEIGHT) do {                                       \
225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const __m128i v0 =                                                         \
226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const __m128i v1 =                                                         \
228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                            \
230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                            \
231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);              \
232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);              \
233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                       \
234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                       \
235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                       \
236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                       \
237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                           \
238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                           \
239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                           \
240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} while (0)
241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do {                                \
243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uint32 tmp[4];                                                             \
244f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M));                    \
245f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];                                 \
246f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang} while (0)
247f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
248f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(0, W0);
249f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(1, W1);
250f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(2, W2);
251f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(3, W3);
252f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(4, W2);
253f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(5, W1);
254f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(6, W0);
255f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
256f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(x, xm);
257f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(y, ym);
258f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
259f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(xy, xym);
260f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(yy, yym);
261f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
262f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef LOAD_LINE_PAIR
263f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef ADD_AND_STORE_FOUR_EPI32
264f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
265f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
266f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
267f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
268f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic int start_max(int x, int y) { return (x > y) ? x : y; }
270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble CalcSSIM(const uint8 *org, const uint8 *rec,
272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                const int image_width, const int image_height) {
273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  double SSIM = 0.;
274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int stride = image_width;
279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int j = 0; j < KERNEL_Y; ++j) {
281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int i = 0; i < image_width; ++i) {
282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
284f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
286f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef _OPENMP
287f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  #pragma omp parallel for reduction(+: SSIM)
288f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
289f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int i = 0; i < KERNEL_X; ++i) {
291f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
292f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
293f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int i = KERNEL_X; i < start_x; ++i) {
294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
295f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
296f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    if (start_x < image_width) {
297f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
298f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // copy the 8 rightmost pixels on a cache area, and pad this area with
299f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // zeros which won't contribute to the overall SSIM value (but we need
300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // to pass the correct normalizing constant!). By using this cache, we can
301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
302f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // NOTE: we could use similar method for the left-most pixels too.
303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int kScratchWidth = 8;
304f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int kScratchStride = kScratchWidth + KERNEL + 1;
305f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
306f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };
307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
308f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      for (int k = 0; k < KERNEL_SIZE; ++k) {
309f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        const int offset =
310f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang            (j - KERNEL + k) * stride + image_width - kScratchWidth;
311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
312f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
313f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      }
314f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      for (int k = 0;  k <= KERNEL_X + 1; ++k) {
315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                  KERNEL + k, KERNEL, kScratchStride, kiW[k]);
317f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      }
318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
319f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int j = start_y; j < image_height; ++j) {
322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int i = 0; i < image_width; ++i) {
323f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
325f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return SSIM;
327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
328f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble CalcLSSIM(double ssim) {
330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return -10.0 * log10(1.0 - ssim);
331f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
332f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}  // extern "C"
335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
337