1f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang/*
2f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *
4f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  Use of this source code is governed by a BSD-style license
5f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  that can be found in the LICENSE file in the root of the source
6f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  tree. An additional intellectual property rights grant can be found
7f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  in the file PATENTS. All contributing project authors may
8f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang *  be found in the AUTHORS file in the root of the source tree.
9f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang */
10f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
11f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include "../util/ssim.h"  // NOLINT
12f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
13f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <string.h>
14f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
15f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
16f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangextern "C" {
17f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
18f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
19b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardtypedef unsigned int uint32;    // NOLINT
20b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardtypedef unsigned short uint16;  // NOLINT
21f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
22f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
23b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
24f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define __SSE2__
25f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
26f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
27f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <emmintrin.h>
28f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
29f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
30f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef _OPENMP
31f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include <omp.h>
32f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
33f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
34f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// SSIM
35f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangenum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
36f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
37f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Symmetric Gaussian kernel:  K[i] = ~11 * exp(-0.3 * i * i)
38f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// The maximum value (11 x 11) must be less than 128 to avoid sign
39f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// problems during the calls to _mm_mullo_epi16().
40f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const int K[KERNEL_SIZE] = {
41b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    1, 3, 7, 11, 7, 3, 1  // ~11 * exp(-0.3 * i * i)
42f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang};
43f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const double kiW[KERNEL + 1 + 1] = {
44b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
45b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    1. / 1089.,  // 1 / sum(i:0..6, j..6) K[i]*K[j]
46b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    1. / 1056.,  // 1 / sum(i:0..5, j..6) K[i]*K[j]
47b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    1. / 957.,   // 1 / sum(i:0..4, j..6) K[i]*K[j]
48b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    1. / 726.,   // 1 / sum(i:0..3, j..6) K[i]*K[j]
49f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang};
50f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
51f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
52f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
53b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)])  // weight product
54b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define MAKE_WEIGHT(L)                                                \
55b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  {                                                                   \
56b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    {                                                                 \
57b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      {                                                               \
58b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        PWEIGHT(L, 0)                                                 \
59b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \
60b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard            PWEIGHT(L, 5), PWEIGHT(L, 6), 0                           \
61b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      }                                                               \
62b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    }                                                                 \
63b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  }
64f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
65f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// We need this union trick to be able to initialize constant static __m128i
66f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// values. We can't call _mm_set_epi16() for static compile-time initialization.
67f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const struct {
68f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  union {
69f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    uint16 i16_[8];
70f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    __m128i m_;
71f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  } values_;
72b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
73f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  W3 = MAKE_WEIGHT(3);
74b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard// ... the rest is symmetric.
75f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef MAKE_WEIGHT
76f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef PWEIGHT
77f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
78f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
79f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Common final expression for SSIM, once the weighted sums are known.
80b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic double FinalizeSSIM(double iw,
81b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           double xm,
82b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           double ym,
83b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           double xxm,
84b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           double xym,
85b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           double yym) {
86f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double iwx = xm * iw;
87f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double iwy = ym * iw;
88f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  double sxx = xxm * iw - iwx * iwx;
89f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  double syy = yym * iw - iwy * iwy;
90f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // small errors are possible, due to rounding. Clamp to zero.
91b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  if (sxx < 0.)
92b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    sxx = 0.;
93b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  if (syy < 0.)
94b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    syy = 0.;
95f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double sxsy = sqrt(sxx * syy);
96f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double sxy = xym * iw - iwx * iwy;
97f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  static const double C11 = (0.01 * 0.01) * (255 * 255);
98f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  static const double C22 = (0.03 * 0.03) * (255 * 255);
99f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  static const double C33 = (0.015 * 0.015) * (255 * 255);
100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
101b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  const double c = (2. * sxsy + C22) / (sxx + syy + C22);
102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const double s = (sxy + C33) / (sxsy + C33);
103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return l * c * s;
104f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
106f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// GetSSIM() does clipping.  GetSSIMFullKernel() does not
107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
108f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// TODO(skal): use summed tables?
109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// with a diff of 255, squared. The maximum error is thus 0x4388241,
111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// which fits into 32 bits integers.
112b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barcharddouble GetSSIM(const uint8* org,
113b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               const uint8* rec,
114b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               int xo,
115b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               int yo,
116b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               int W,
117b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               int H,
118b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               int stride) {
119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  org += (yo - KERNEL) * stride;
121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  org += (xo - KERNEL);
122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  rec += (yo - KERNEL) * stride;
123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  rec += (xo - KERNEL);
124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
125b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H))
126b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      continue;
127f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int Wy = K[y_];
128f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int Wxy = Wy * K[x_];
130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        const int org_x = org[x_];
132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        const int rec_x = rec[x_];
133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        ws += Wxy;
134b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        xm += Wxy * org_x;
135b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        ym += Wxy * rec_x;
136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        xxm += Wxy * org_x * org_x;
137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        xym += Wxy * org_x * rec_x;
138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        yym += Wxy * rec_x * rec_x;
139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      }
140f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
141f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
142f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
145b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barcharddouble GetSSIMFullKernel(const uint8* org,
146b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         const uint8* rec,
147b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int xo,
148b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int yo,
149b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int stride,
150f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                         double area_weight) {
151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  org += yo * stride + xo;
156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  rec += yo * stride + xo;
157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int y = 1; y <= KERNEL; y++) {
158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int dy1 = y * stride;
159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int dy2 = y * stride;
160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int Wy = K[KERNEL + y];
161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int x = 1; x <= KERNEL; x++) {
163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // Compute the contributions of upper-left (ul), upper-right (ur)
164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // lower-left (ll) and lower-right (lr) points (see the diagram below).
165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // Symmetric Kernel will have same weight on those points.
166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  -  -  -  -
167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  ul -  -  -  ur -
168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  -  -  -  -
169f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  0  -  -  -
170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  -  -  -  -
171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  ll -  -  -  lr -
172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      //       -  -  -  -  -  -  -
173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int Wxy = Wy * K[KERNEL + x];
174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ul1 = org[-dy1 - x];
175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ur1 = org[-dy1 + x];
176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ll1 = org[dy1 - x];
177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int lr1 = org[dy1 + x];
178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ul2 = rec[-dy2 - x];
180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ur2 = rec[-dy2 + x];
181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int ll2 = rec[dy2 - x];
182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int lr2 = rec[dy2 + x];
183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
184b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      xm += Wxy * (ul1 + ur1 + ll1 + lr1);
185b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      ym += Wxy * (ul2 + ur2 + ll2 + lr2);
186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Compute the contributions of up (u), down (d), left (l) and right (r)
192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // points across the main axes (see the diagram below).
193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Symmetric Kernel will have same weight on those points.
194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  -  -  -  -
195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  u  -  -  -
196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  -  -  -  -
197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  l  -  0  -  r  -
198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  -  -  -  -
199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  d  -  -  -
200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    //       -  -  -  -  -  -  -
201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int Wxy = Wy * K[KERNEL];
202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int u1 = org[-dy1];
203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int d1 = org[dy1];
204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int l1 = org[-y];
205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int r1 = org[y];
206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int u2 = rec[-dy2];
208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int d2 = rec[dy2];
209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int l2 = rec[-y];
210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    const int r2 = rec[y];
211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
212b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    xm += Wxy * (u1 + d1 + l1 + r1);
213b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    ym += Wxy * (u2 + d2 + l2 + r2);
214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // Lastly the contribution of (x0, y0) point.
220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int Wxy = K[KERNEL] * K[KERNEL];
221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int s1 = org[0];
222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int s2 = rec[0];
223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
224b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  xm += Wxy * s1;
225b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  ym += Wxy * s2;
226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  xxm += Wxy * s1 * s1;
227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  xym += Wxy * s1 * s2;
228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  yym += Wxy * s2 * s2;
229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
230b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#else  // __SSE2__
231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  org += (yo - KERNEL) * stride + (xo - KERNEL);
233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  rec += (yo - KERNEL) * stride + (xo - KERNEL);
234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const __m128i zero = _mm_setzero_si128();
236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i x = zero;
237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i y = zero;
238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i xx = zero;
239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i xy = zero;
240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __m128i yy = zero;
241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 pixels at line #L, and convert to 16bit, perform weighting
243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// and acccumulate.
244b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define LOAD_LINE_PAIR(L, WEIGHT)                                            \
245b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  do {                                                                       \
246b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    const __m128i v0 =                                                       \
247b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L)*stride)); \
248b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    const __m128i v1 =                                                       \
249b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L)*stride)); \
250b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    const __m128i w0 = _mm_unpacklo_epi8(v0, zero);                          \
251b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    const __m128i w1 = _mm_unpacklo_epi8(v1, zero);                          \
252b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_);            \
253b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_);            \
254b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero));                     \
255b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero));                     \
256b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero));                     \
257b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero));                     \
258b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0));                         \
259b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1));                         \
260b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1));                         \
261b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  } while (0)
262b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
263b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define ADD_AND_STORE_FOUR_EPI32(M, OUT)                    \
264b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  do {                                                      \
265b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    uint32 tmp[4];                                          \
266b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
267b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0];              \
268b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  } while (0)
269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(0, W0);
271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(1, W1);
272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(2, W2);
273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(3, W3);
274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(4, W2);
275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(5, W1);
276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  LOAD_LINE_PAIR(6, W0);
277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(x, xm);
279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(y, ym);
280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(xx, xxm);
281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(xy, xym);
282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  ADD_AND_STORE_FOUR_EPI32(yy, yym);
283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
284f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef LOAD_LINE_PAIR
285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#undef ADD_AND_STORE_FOUR_EPI32
286f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
287f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
288f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
289f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
291b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic int start_max(int x, int y) {
292b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  return (x > y) ? x : y;
293b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard}
294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
295b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barcharddouble CalcSSIM(const uint8* org,
296b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                const uint8* rec,
297b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                const int image_width,
298b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                const int image_height) {
299f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  double SSIM = 0.;
300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
302f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
304f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const int stride = image_width;
305f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
306f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int j = 0; j < KERNEL_Y; ++j) {
307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int i = 0; i < image_width; ++i) {
308f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
309f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
310f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
312f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef _OPENMP
313b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#pragma omp parallel for reduction(+ : SSIM)
314f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int i = 0; i < KERNEL_X; ++i) {
317f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
319f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int i = KERNEL_X; i < start_x; ++i) {
320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    if (start_x < image_width) {
323f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // copy the 8 rightmost pixels on a cache area, and pad this area with
325f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // zeros which won't contribute to the overall SSIM value (but we need
326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // to pass the correct normalizing constant!). By using this cache, we can
327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // still call GetSSIMFullKernel() instead of the slower GetSSIM().
328f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      // NOTE: we could use similar method for the left-most pixels too.
329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int kScratchWidth = 8;
330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      const int kScratchStride = kScratchWidth + KERNEL + 1;
331b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0};
332b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0};
333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      for (int k = 0; k < KERNEL_SIZE; ++k) {
335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        const int offset =
336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang            (j - KERNEL + k) * stride + image_width - kScratchWidth;
337f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
338f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang        memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
339f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      }
340b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      for (int k = 0; k <= KERNEL_X + 1; ++k) {
341b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL,
342b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  kScratchStride, kiW[k]);
343f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      }
344f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
345f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
346f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
347f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  for (int j = start_y; j < image_height; ++j) {
348f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    for (int i = 0; i < image_width; ++i) {
349f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
350f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    }
351f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
352f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return SSIM;
353f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
354f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
355f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangdouble CalcLSSIM(double ssim) {
356f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  return -10.0 * log10(1.0 - ssim);
357f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
358f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
359f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
360f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}  // extern "C"
361f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
362