133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp/*
233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *
433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  Use of this source code is governed by a BSD-style license
533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  that can be found in the LICENSE file in the root of the source
633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  tree. An additional intellectual property rights grant can be found
733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  in the file PATENTS.  All contributing project authors may
833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp *  be found in the AUTHORS file in the root of the source tree.
933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp */
1033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/compare.h"
1233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include <float.h>
1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include <math.h>
1533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef _OPENMP
1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include <omp.h>
1733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
1833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/basic_types.h"
2033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/cpu_id.h"
2133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h"
2233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
2333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
2433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampnamespace libyuv {
2533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampextern "C" {
2633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
2733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
2833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// hash seed of 5381 recommended.
2933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Internal C version of HashDjb2 with int sized count for efficiency.
3033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
3133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint32 hash = seed;
3233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < count; ++i) {
3333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    hash += (hash << 5) + src[i];
3433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
3533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return hash;
3633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
3733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
3833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// This module is for Visual C x86
3933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
4033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_HASHDJB2_SSE41
// Multiplier tables for the 16-bytes-per-iteration DJB2 kernel below.
// In the comments "33 ^ n" means 33 raised to the n-th power (not XOR), kept
// to 32 bits; e.g. 33 ^ 4 = 1185921 = 0x00121881.
// kHash16x33 carries 33 ^ 16 in its first element only; the other three
// elements are zero.
4133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
// Weights applied to source bytes 0-3 of each 16-byte group.
4233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHashMul0 = {
4333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x0c3525e1,  // 33 ^ 15
4433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0xa3476dc1,  // 33 ^ 14
4533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x3b4039a1,  // 33 ^ 13
4633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x4f5f0981,  // 33 ^ 12
4733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// Weights applied to source bytes 4-7.
4833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHashMul1 = {
4933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x30f35d61,  // 33 ^ 11
5033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x855cb541,  // 33 ^ 10
5133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x040a9121,  // 33 ^ 9
5233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x747c7101,  // 33 ^ 8
5333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// Weights applied to source bytes 8-11.
5433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHashMul2 = {
5533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0xec41d4e1,  // 33 ^ 7
5633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x4cfa3cc1,  // 33 ^ 6
5733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x025528a1,  // 33 ^ 5
5833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00121881,  // 33 ^ 4
5933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// Weights applied to source bytes 12-15.
6033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHashMul3 = {
6133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00008c61,  // 33 ^ 3
6233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00000441,  // 33 ^ 2
6333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00000021,  // 33 ^ 1
6433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00000001,  // 33 ^ 0
6533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
6633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// pmulld(reg) emits the SSE4.1 pmulld instruction byte by byte: the fixed
// bytes 66 0F 38 40 followed by |reg|, the final encoding byte that selects
// the register pair (e.g. 0xC6 = xmm0,xmm6 per the listing below).
// NOTE(review): presumably done because the inline assembler in use does not
// accept the pmulld mnemonic -- confirm before replacing with the mnemonic.
// Encodings used by HashDjb2_SSE41 (offset: bytes, then the instruction):
6733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
6833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
6933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
7033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
7133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
7233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
7333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    _asm _emit 0x40 _asm _emit reg
7433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// SSE4.1 DJB2 hash for Visual C x86. Each loop iteration consumes 16 bytes:
//   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15] * 33^0
// which equals 16 steps of hash = hash * 33 + src[i] in 32-bit arithmetic.
// Naked function: the arguments are read directly from the stack and the
// result is returned in eax.
// The loop body runs before |count| is tested, so 16 bytes are always read;
// the HashDjb2 wrapper in this file only calls it with non-zero multiples
// of 16.
7533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
7633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
7733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
7833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]    // src
7933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8]    // count
8033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm0, [esp + 12]  // seed
8133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
8233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm7, xmm7        // constant 0 for unpck
    // xmm6 = { 33^16, 0, 0, 0 }: scales the running hash once per 16 bytes.
8333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kHash16x33
8433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
8533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
8633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  wloop:
    // Unaligned load, so |src| needs no particular alignment.
8733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax]       // src[0-15]
8833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]
8933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
    // Widen the 16 bytes to four groups of 4 dwords (zero-extended through
    // xmm7) and multiply each group by its table of powers of 33.
9033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kHashMul0
9133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, xmm1
9233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm7        // src[0-7]
9333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm2
9433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm3, xmm7        // src[0-3]
9533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulld(0xdd)                 // pmulld     xmm3, xmm5
9633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kHashMul1
9733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm2
9833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm4, xmm7        // src[4-7]
9933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulld(0xe5)                 // pmulld     xmm4, xmm5
10033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kHashMul2
10133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm1, xmm7        // src[8-15]
10233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, xmm1
10333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm2, xmm7        // src[8-11]
10433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulld(0xd5)                 // pmulld     xmm2, xmm5
10533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kHashMul3
10633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm7        // src[12-15]
10733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulld(0xcd)                 // pmulld     xmm1, xmm5
10833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm3, xmm4        // add 16 results
10933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm1, xmm2
    // Sets the flags that the jg at the bottom of the loop branches on.
11033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
11133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm1, xmm3
11233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Horizontal add: fold the four dwords of xmm1 into its lowest dword,
    // then add that to the running hash in xmm0.
11333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm2, xmm1, 14    // upper 2 dwords
11433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm1, xmm2
11533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm2, xmm1, 1
11633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm1, xmm2
11733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm1
    // Loop while the remaining count is still positive.
11833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         wloop
11933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
12033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       eax, xmm0        // return hash
12133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
12233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
12333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
12433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
12533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif !defined(YUV_DISABLE_ASM) && \
12633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
12733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// GCC 4.2 on OSX has link error when passing static or const to inline.
12833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
// CONST is the qualifier used for the hash tables below: it expands to nothing
// on Apple builds and to `static const` everywhere else.
12933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __APPLE__
13033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define CONST
13133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#else
13233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define CONST static const
13333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
13433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_HASHDJB2_SSE41
// Multiplier tables for the GCC inline-asm DJB2 kernel below, passed to it as
// memory operands.
// In the comments "33 ^ n" means 33 raised to the n-th power (not XOR), kept
// to 32 bits; e.g. 33 ^ 4 = 1185921 = 0x00121881.
// kHash16x33 carries 33 ^ 16 in its first element only; the other three
// elements are zero.
13533cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
// Weights applied to source bytes 0-3 of each 16-byte group.
13633cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec32 kHashMul0 = {
13733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x0c3525e1,  // 33 ^ 15
13833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0xa3476dc1,  // 33 ^ 14
13933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x3b4039a1,  // 33 ^ 13
14033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x4f5f0981,  // 33 ^ 12
14133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// Weights applied to source bytes 4-7.
14233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec32 kHashMul1 = {
14333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x30f35d61,  // 33 ^ 11
14433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x855cb541,  // 33 ^ 10
14533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x040a9121,  // 33 ^ 9
14633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x747c7101,  // 33 ^ 8
14733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// Weights applied to source bytes 8-11.
14833cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec32 kHashMul2 = {
14933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0xec41d4e1,  // 33 ^ 7
15033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x4cfa3cc1,  // 33 ^ 6
15133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x025528a1,  // 33 ^ 5
15233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00121881,  // 33 ^ 4
15333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// Weights applied to source bytes 12-15.
15433cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec32 kHashMul3 = {
15533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00008c61,  // 33 ^ 3
15633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00000441,  // 33 ^ 2
15733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00000021,  // 33 ^ 1
15833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0x00000001,  // 33 ^ 0
15933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// SSE4.1 DJB2 hash (GCC inline asm). Each loop iteration consumes 16 bytes:
//   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15] * 33^0
// which equals 16 steps of hash = hash * 33 + src[i] in 32-bit arithmetic.
// The loop body runs before |count| is tested, so 16 bytes are always read;
// the HashDjb2 wrapper in this file only calls it with non-zero multiples
// of 16. Returns the updated hash.
16033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
16133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint32 hash;
16233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm0 = running hash (seed), xmm7 = 0 for the unpack steps,
    // xmm6 = { 33^16, 0, 0, 0 }.
16333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %2,%%xmm0                       \n"
16433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm7,%%xmm7                   \n"
16533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm6                       \n"
16633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
16733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Unaligned load of src[0-15], advance src, then hash *= 33^16.
16833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm1                     \n"
16933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
17033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulld    %%xmm6,%%xmm0                   \n"
    // src[0-3] (zero-extended to dwords) * kHashMul0 -> xmm3.
17133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %5,%%xmm5                       \n"
17233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,%%xmm2                   \n"
17333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm7,%%xmm2                   \n"
17433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm3                   \n"
17533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm7,%%xmm3                   \n"
17633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulld    %%xmm5,%%xmm3                   \n"
    // src[4-7] * kHashMul1 -> xmm4.
17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %6,%%xmm5                       \n"
17833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm4                   \n"
17933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm7,%%xmm4                   \n"
18033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulld    %%xmm5,%%xmm4                   \n"
    // src[8-11] * kHashMul2 -> xmm2.
18133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %7,%%xmm5                       \n"
18233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm7,%%xmm1                   \n"
18333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,%%xmm2                   \n"
18433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm7,%%xmm2                   \n"
18533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulld    %%xmm5,%%xmm2                   \n"
    // src[12-15] * kHashMul3 -> xmm1.
18633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %8,%%xmm5                       \n"
18733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm7,%%xmm1                   \n"
18833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulld    %%xmm5,%%xmm1                   \n"
    // Add the four product vectors together. The sub on count sets the flags
    // that the jg at the bottom of the loop branches on.
18933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm4,%%xmm3                   \n"
19033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm2,%%xmm1                   \n"
19133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%1                        \n"
19233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm3,%%xmm1                   \n"
    // Horizontal add of the four dwords in xmm1, then accumulate into xmm0.
19333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
19433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm2,%%xmm1                   \n"
19533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
19633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm2,%%xmm1                   \n"
19733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm1,%%xmm0                   \n"
    // Loop while the remaining count is still positive, then store the hash.
19833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
19933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%3                       \n"
20033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),        // %0
20133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(count),      // %1
20233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(seed),      // %2
20333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "=g"(hash)        // %3
20433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kHash16x33),  // %4
20533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kHashMul0),   // %5
20633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kHashMul1),   // %6
20733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kHashMul2),   // %7
20833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kHashMul3)    // %8
20933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // The xmm clobbers are only listed when the compiler targets SSE2.
21033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
21133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
21233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
21333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
21433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return hash;
21533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
21633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_HASHDJB2_SSE41
21733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
21833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// hash seed of 5381 recommended.
21933cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
22033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampuint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
22133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
22233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_HASHDJB2_SSE41)
22333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSE41)) {
22433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    HashDjb2_SSE = HashDjb2_SSE41;
22533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
22633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
22733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
22833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int kBlockSize = 1 << 15;  // 32768;
22933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  while (count >= static_cast<uint64>(kBlockSize)) {
23033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    seed = HashDjb2_SSE(src, kBlockSize, seed);
23133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src += kBlockSize;
23233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    count -= kBlockSize;
23333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
23433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int remainder = static_cast<int>(count) & ~15;
23533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (remainder) {
23633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    seed = HashDjb2_SSE(src, remainder, seed);
23733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src += remainder;
23833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    count -= remainder;
23933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
24033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  remainder = static_cast<int>(count) & 15;
24133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (remainder) {
24233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    seed = HashDjb2_C(src, remainder, seed);
24333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
24433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return seed;
24533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
24633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
24733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
// Marks the NEON sum-of-squared-error kernel as available to
// ComputeSumSquareError.
24833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_SUMSQUAREERROR_NEON
24933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Declaration only; no definition is visible in this part of the file.
// NOTE(review): presumably implemented in a separate NEON source file and
// returning the same sum as SumSquareError_C -- confirm against that file.
25033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampuint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
25133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
25233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
25333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_SUMSQUAREERROR_SSE2
// SSE2 sum of squared byte differences between |src_a| and |src_b| for
// Visual C x86. Naked function: the arguments are read directly from the
// stack and the 32-bit sum is returned in eax.
// Both pointers are loaded with movdqa, so they must be 16-byte aligned (the
// caller in this file checks this). Each iteration consumes 16 bytes and the
// loop body runs before |count| is tested.
25433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
25533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
25633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                  int count) {
25733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
25833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]    // src_a
25933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]    // src_b
26033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]   // count
    // xmm0 = four dword partial sums, xmm5 = 0 for the unpack steps.
26133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm0, xmm0
26233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm5, xmm5
    // edx = src_b - src_a, so [eax + edx] addresses src_b while only eax
    // is advanced.
26333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edx, eax
26433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
26533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
26633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  wloop:
26733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax]
26833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax + edx]
26933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 16]
    // Sets the flags that the jg at the bottom of the loop branches on.
27033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
    // |a - b| per byte: saturating a - b OR saturating b - a (one is zero).
27133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm1  // abs trick
27233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubusb    xmm1, xmm2
27333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubusb    xmm2, xmm3
27433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm1, xmm2
    // Widen the differences to 16 bits, then pmaddwd squares them and adds
    // adjacent pairs into dwords.
27533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, xmm1
27633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm1, xmm5
27733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm2, xmm5
27833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddwd    xmm1, xmm1
27933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddwd    xmm2, xmm2
28033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm1
28133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm2
    // Loop while the remaining count is still positive.
28233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         wloop
28333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Horizontal add of the four dword partial sums into the lowest dword.
28433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm1, xmm0, 0EEh
28533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm1
28633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm1, xmm0, 01h
28733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm1
28833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       eax, xmm0
28933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
29033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
29133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
29233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
29333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
29433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_SUMSQUAREERROR_SSE2
29533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
29633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                  int count) {
29733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint32 sse;
29833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
29933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm0,%%xmm0                   \n"
30033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm5,%%xmm5                   \n"
30133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
30233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
30333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "1:                                        \n"
30433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm1                     \n"
30533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0,%1,1),%%xmm2                \n"
30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
30733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
30833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,%%xmm3                   \n"
30933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubusb   %%xmm2,%%xmm1                   \n"
31033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubusb   %%xmm3,%%xmm2                   \n"
31133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm1                   \n"
31233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,%%xmm2                   \n"
31333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm1                   \n"
31433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm5,%%xmm2                   \n"
31533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddwd   %%xmm1,%%xmm1                   \n"
31633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddwd   %%xmm2,%%xmm2                   \n"
31733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm1,%%xmm0                   \n"
31833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm2,%%xmm0                   \n"
31933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
32033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
32133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
32233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm1,%%xmm0                   \n"
32333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
32433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm1,%%xmm0                   \n"
32533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%3                       \n"
32633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
32733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_a),      // %0
32833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(src_b),      // %1
32933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(count),      // %2
33033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "=g"(sse)         // %3
33133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
33233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
33333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
33433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm5"
33533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
33633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return sse;
33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               int count) {
34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint32 sse = 0u;
34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < count; ++i) {
34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    int diff = src_a[i] - src_b[i];
34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sse += static_cast<uint32>(diff * diff);
34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return sse;
34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampuint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                             int count) {
35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      SumSquareError_C;
35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_SUMSQUAREERROR_NEON)
35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasNEON)) {
35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    SumSquareError = SumSquareError_NEON;
35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif defined(HAS_SUMSQUAREERROR_SSE2)
36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSE2) &&
36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Note only used for multiples of 16 so count is not checked.
36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    SumSquareError = SumSquareError_SSE2;
36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // 32K values will fit a 32bit int return value from SumSquareError.
36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // After each block of 32K, accumulate into 64 bit int.
36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int kBlockSize = 1 << 15;  // 32768;
37033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint64 sse = 0;
37133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef _OPENMP
37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#pragma omp parallel for reduction(+: sse)
37333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
37433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
37533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
37633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
37733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  src_a += count & ~(kBlockSize - 1);
37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  src_b += count & ~(kBlockSize - 1);
37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int remainder = count & (kBlockSize - 1) & ~15;
38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (remainder) {
38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sse += SumSquareError(src_a, src_b, remainder);
38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_a += remainder;
38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_b += remainder;
38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  remainder = count & 15;
38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (remainder) {
38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sse += SumSquareError_C(src_a, src_b, remainder);
38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return sse;
39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampuint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                  const uint8* src_b, int stride_b,
39533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                  int width, int height) {
39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      SumSquareError_C;
39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_SUMSQUAREERROR_NEON)
39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasNEON)) {
40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    SumSquareError = SumSquareError_NEON;
40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif defined(HAS_SUMSQUAREERROR_SSE2)
40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    SumSquareError = SumSquareError_SSE2;
40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint64 sse = 0;
41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int h = 0; h < height; ++h) {
41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sse += SumSquareError(src_a, src_b, width);
41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_a += stride_a;
41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_b += stride_b;
41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return sse;
41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble SumSquareErrorToPsnr(uint64 sse, uint64 count) {
42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  double psnr;
42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (sse > 0) {
42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    double mse = static_cast<double>(count) / static_cast<double>(sse);
42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psnr = 10.0 * log10(255.0 * 255.0 * mse);
42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  } else {
42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psnr = kMaxPsnr;      // Limit to prevent divide by 0
42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
43033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (psnr > kMaxPsnr)
43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psnr = kMaxPsnr;
43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return psnr;
43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble CalcFramePsnr(const uint8* src_a, int stride_a,
43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     const uint8* src_b, int stride_b,
43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     int width, int height) {
44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const uint64 samples = width * height;
44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                                src_b, stride_b,
44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                                width, height);
44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return SumSquareErrorToPsnr(sse, samples);
44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble I420Psnr(const uint8* src_y_a, int stride_y_a,
44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_u_a, int stride_u_a,
45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_v_a, int stride_v_a,
45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_y_b, int stride_y_b,
45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_u_b, int stride_u_b,
45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_v_b, int stride_v_b,
45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                int width, int height) {
45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                                  src_y_b, stride_y_b,
45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                                  width, height);
45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int width_uv = (width + 1) >> 1;
45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int height_uv = (height + 1) >> 1;
46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                                  src_u_b, stride_u_b,
46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                                  width_uv, height_uv);
46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                                  src_v_b, stride_v_b,
46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                                  width_uv, height_uv);
46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const uint64 samples = width * height + 2 * (width_uv * height_uv);
46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const uint64 sse = sse_y + sse_u + sse_v;
46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return SumSquareErrorToPsnr(sse, samples);
46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const int64 cc1 =  26634;  // (64^2*(.01*255)^2
47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const int64 cc2 = 239708;  // (64^2*(.03*255)^2
47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic double Ssim8x8_C(const uint8* src_a, int stride_a,
47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        const uint8* src_b, int stride_b) {
47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int64 sum_a = 0;
47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int64 sum_b = 0;
47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int64 sum_sq_a = 0;
47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int64 sum_sq_b = 0;
48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int64 sum_axb = 0;
48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < 8; ++i) {
48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    for (int j = 0; j < 8; ++j) {
48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      sum_a += src_a[j];
48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      sum_b += src_b[j];
48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      sum_sq_a += src_a[j] * src_a[j];
48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      sum_sq_b += src_b[j] * src_b[j];
48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      sum_axb += src_a[j] * src_b[j];
48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    }
49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_a += stride_a;
49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_b += stride_b;
49333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
49533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int64 count = 64;
49633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // scale the constants by number of pixels
49733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int64 c1 = (cc1 * count * count) >> 12;
49833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int64 c2 = (cc2 * count * count) >> 12;
49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
50033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int64 sum_a_x_sum_b = sum_a * sum_b;
50133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int64 sum_a_sq = sum_a*sum_a;
50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int64 sum_b_sq = sum_b*sum_b;
50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       (count * sum_sq_a - sum_a_sq +
51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        count * sum_sq_b - sum_b_sq + c2);
51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (ssim_d == 0.0)
51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    return DBL_MAX;
51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return ssim_n * 1.0 / ssim_d;
51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// We are using a 8x8 moving window with starting location of each 8x8 window
51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// block boundaries to penalize blocking artifacts.
52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble CalcFrameSsim(const uint8* src_a, int stride_a,
52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     const uint8* src_b, int stride_b,
52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     int width, int height) {
52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int samples = 0;
52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  double ssim_total = 0;
52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  double (*Ssim8x8)(const uint8* src_a, int stride_a,
52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                    const uint8* src_b, int stride_b);
52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  Ssim8x8 = Ssim8x8_C;
53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // sample point start with each 4x4 location
53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < height - 8; i += 4) {
53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    for (int j = 0; j < width - 8; j += 4) {
53533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
53633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      samples++;
53733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    }
53833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
53933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_a += stride_a * 4;
54033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_b += stride_b * 4;
54133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
54233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
54333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  ssim_total /= samples;
54433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return ssim_total;
54533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
54633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
54733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
54833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble I420Ssim(const uint8* src_y_a, int stride_y_a,
54933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_u_a, int stride_u_a,
55033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_v_a, int stride_v_a,
55133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_y_b, int stride_y_b,
55233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_u_b, int stride_u_b,
55333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                const uint8* src_v_b, int stride_v_b,
55433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                int width, int height) {
55533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
55633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                      src_y_b, stride_y_b, width, height);
55733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int width_uv = (width + 1) >> 1;
55833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const int height_uv = (height + 1) >> 1;
55933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
56033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                      src_u_b, stride_u_b,
56133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                      width_uv, height_uv);
56233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
56333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                      src_v_b, stride_v_b,
56433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                      width_uv, height_uv);
56533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
56633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
56733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
56833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
56933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // extern "C"
57033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // namespace libyuv
57133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
572