133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp/* 233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * Copyright 2011 The LibYuv Project Authors. All rights reserved. 333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * 433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * Use of this source code is governed by a BSD-style license 533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * that can be found in the LICENSE file in the root of the source 633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * tree. An additional intellectual property rights grant can be found 733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * in the file PATENTS. All contributing project authors may 833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * be found in the AUTHORS file in the root of the source tree. 933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp */ 1033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 1133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/compare.h" 1233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 1333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include <float.h> 1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include <math.h> 1533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef _OPENMP 1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include <omp.h> 1733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 1833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 1933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/basic_types.h" 2033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/cpu_id.h" 2133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h" 2233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 2333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus 2433cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkampnamespace libyuv { 2533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampextern "C" { 2633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 2733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 2833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// hash seed of 5381 recommended. 2933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Internal C version of HashDjb2 with int sized count for efficiency. 3033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { 3133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint32 hash = seed; 3233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp for (int i = 0; i < count; ++i) { 3333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp hash += (hash << 5) + src[i]; 3433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 3533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return hash; 3633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 3733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 3833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// This module is for Visual C x86 3933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) 4033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_HASHDJB2_SSE41 4133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 4233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHashMul0 = { 4333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x0c3525e1, // 33 ^ 15 4433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0xa3476dc1, // 33 ^ 14 4533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x3b4039a1, // 33 ^ 13 4633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x4f5f0981, // 33 ^ 12 4733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 
4833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHashMul1 = { 4933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x30f35d61, // 33 ^ 11 5033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x855cb541, // 33 ^ 10 5133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x040a9121, // 33 ^ 9 5233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x747c7101, // 33 ^ 8 5333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 5433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHashMul2 = { 5533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0xec41d4e1, // 33 ^ 7 5633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x4cfa3cc1, // 33 ^ 6 5733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x025528a1, // 33 ^ 5 5833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00121881, // 33 ^ 4 5933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 6033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec32 kHashMul3 = { 6133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00008c61, // 33 ^ 3 6233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00000441, // 33 ^ 2 6333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00000021, // 33 ^ 1 6433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00000001, // 33 ^ 0 6533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 6633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 6733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 6833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 6933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 7033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 7133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 
// Emits the SSE4.1 pmulld instruction as raw bytes: the fixed prefix
// 66 0F 38 40 followed by |reg|, the operand byte naming the register pair.
// NOTE(review): presumably hand-encoded because the targeted Visual C inline
// assembler did not accept the pmulld mnemonic - confirm before replacing.
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
    _asm _emit 0x40 _asm _emit reg

// Visual C x86 SSE4.1 DJB2 hash over |count| bytes starting at |src|.
// Each loop pass consumes 16 bytes and computes
//   hash = hash * 33^16 + sum(src[k] * 33^(15 - k)), k = 0..15
// The loop condition is tested at the bottom, so 16 bytes are always read
// even when count <= 0; the count is consumed in steps of 16.
// The function is naked: arguments are read straight off the stack and the
// result is returned in eax.
__declspec(naked) __declspec(align(16))
static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
    movd       xmm0, [esp + 12]  // seed

    pxor       xmm7, xmm7        // constant 0 for unpck
    movdqa     xmm6, kHash16x33  // 33 ^ 16 in lane 0, zero elsewhere

    align      16
  wloop:
    movdqu     xmm1, [eax]       // src[0-15]
    lea        eax, [eax + 16]
    pmulld(0xc6)                 // pmulld      xmm0,xmm6 hash *= 33 ^ 16
    movdqa     xmm5, kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7        // src[0-7]
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7        // src[0-3]
    pmulld(0xdd)                 // pmulld     xmm3, xmm5
    movdqa     xmm5, kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7        // src[4-7]
    pmulld(0xe5)                 // pmulld     xmm4, xmm5
    movdqa     xmm5, kHashMul2
    punpckhbw  xmm1, xmm7        // src[8-15]
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7        // src[8-11]
    pmulld(0xd5)                 // pmulld     xmm2, xmm5
    movdqa     xmm5, kHashMul3
    punpckhwd  xmm1, xmm7        // src[12-15]
    pmulld(0xcd)                 // pmulld     xmm1, xmm5
    paddd      xmm3, xmm4        // add 16 results
    paddd      xmm1, xmm2
    sub        ecx, 16           // sets the flags that jg tests below
    paddd      xmm1, xmm3

    pshufd     xmm2, xmm1, 14    // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 1     // dword 1 down to lane 0
    paddd      xmm1, xmm2        // lane 0 now holds the sum of all 4 dwords
    paddd      xmm0, xmm1        // fold the 16 byte group into the hash
    jg         wloop             // loop while the remaining count is > 0

    movd       eax, xmm0         // return hash
    ret
  }
}

// GCC style inline assembly build for x86-64, or for 32 bit x86 when not
// compiled as position independent code.
#elif !defined(YUV_DISABLE_ASM) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
#define HAS_HASHDJB2_SSE41
// Scales the running hash once per 16 byte step.  Only lane 0 is non-zero.
CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
// Weights for the first four bytes of each 16 byte group.
CONST uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
  0xa3476dc1,  // 33 ^ 14
  0x3b4039a1,  // 33 ^ 13
  0x4f5f0981,  // 33 ^ 12
};
14233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec32 kHashMul1 = { 14333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x30f35d61, // 33 ^ 11 14433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x855cb541, // 33 ^ 10 14533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x040a9121, // 33 ^ 9 14633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x747c7101, // 33 ^ 8 14733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 14833cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec32 kHashMul2 = { 14933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0xec41d4e1, // 33 ^ 7 15033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x4cfa3cc1, // 33 ^ 6 15133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x025528a1, // 33 ^ 5 15233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00121881, // 33 ^ 4 15333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 15433cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec32 kHashMul3 = { 15533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00008c61, // 33 ^ 3 15633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00000441, // 33 ^ 2 15733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00000021, // 33 ^ 1 15833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0x00000001, // 33 ^ 0 15933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 16033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 16133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint32 hash; 16233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 16333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movd %2,%%xmm0 \n" 16433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pxor %%xmm7,%%xmm7 \n" 16533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %4,%%xmm6 \n" 
16633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ".p2align 4 \n" 16733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 16833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqu (%0),%%xmm1 \n" 16933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "lea 0x10(%0),%0 \n" 17033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pmulld %%xmm6,%%xmm0 \n" 17133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %5,%%xmm5 \n" 17233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %%xmm1,%%xmm2 \n" 17333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "punpcklbw %%xmm7,%%xmm2 \n" 17433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %%xmm2,%%xmm3 \n" 17533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "punpcklwd %%xmm7,%%xmm3 \n" 17633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pmulld %%xmm5,%%xmm3 \n" 17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %6,%%xmm5 \n" 17833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %%xmm2,%%xmm4 \n" 17933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "punpckhwd %%xmm7,%%xmm4 \n" 18033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pmulld %%xmm5,%%xmm4 \n" 18133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %7,%%xmm5 \n" 18233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "punpckhbw %%xmm7,%%xmm1 \n" 18333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %%xmm1,%%xmm2 \n" 18433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "punpcklwd %%xmm7,%%xmm2 \n" 18533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pmulld %%xmm5,%%xmm2 \n" 18633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %8,%%xmm5 \n" 18733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "punpckhwd %%xmm7,%%xmm1 \n" 18833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pmulld %%xmm5,%%xmm1 \n" 18933cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp "paddd %%xmm4,%%xmm3 \n" 19033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm2,%%xmm1 \n" 19133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "sub $0x10,%1 \n" 19233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm3,%%xmm1 \n" 19333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pshufd $0xe,%%xmm1,%%xmm2 \n" 19433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm2,%%xmm1 \n" 19533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pshufd $0x1,%%xmm1,%%xmm2 \n" 19633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm2,%%xmm1 \n" 19733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm1,%%xmm0 \n" 19833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "jg 1b \n" 19933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movd %%xmm0,%3 \n" 20033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src), // %0 20133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(count), // %1 20233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+rm"(seed), // %2 20333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "=g"(hash) // %3 20433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "m"(kHash16x33), // %4 20533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "m"(kHashMul0), // %5 20633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "m"(kHashMul1), // %6 20733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "m"(kHashMul2), // %7 20833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "m"(kHashMul3) // %8 20933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "memory", "cc" 21033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__) 21133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 21233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 21333cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp ); 21433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return hash; 21533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 21633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_HASHDJB2_SSE41 21733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 21833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// hash seed of 5381 recommended. 21933cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 22033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampuint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { 22133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; 22233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_HASHDJB2_SSE41) 22333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (TestCpuFlag(kCpuHasSSE41)) { 22433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp HashDjb2_SSE = HashDjb2_SSE41; 22533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 22633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 22733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 22833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int kBlockSize = 1 << 15; // 32768; 22933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp while (count >= static_cast<uint64>(kBlockSize)) { 23033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp seed = HashDjb2_SSE(src, kBlockSize, seed); 23133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src += kBlockSize; 23233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp count -= kBlockSize; 23333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 23433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int remainder = static_cast<int>(count) & ~15; 23533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (remainder) { 23633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp seed = HashDjb2_SSE(src, remainder, 
seed); 23733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src += remainder; 23833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp count -= remainder; 23933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 24033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp remainder = static_cast<int>(count) & 15; 24133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (remainder) { 24233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp seed = HashDjb2_C(src, remainder, seed); 24333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 24433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return seed; 24533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 24633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 24733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) 24833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_SUMSQUAREERROR_NEON 24933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 25033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampuint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); 25133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 25233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86) 25333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_SUMSQUAREERROR_SSE2 25433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 25533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, 25633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int count) { 25733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 25833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_a 25933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // 
src_b 26033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // count 26133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm0, xmm0 26233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm5, xmm5 26333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 26433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 26533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 26633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp wloop: 26733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax] 26833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax + edx] 26933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 27033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 27133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm1 // abs trick 27233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubusb xmm1, xmm2 27333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubusb xmm2, xmm3 27433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm2 27533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm1 27633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm1, xmm5 27733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm2, xmm5 27833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddwd xmm1, xmm1 27933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddwd xmm2, xmm2 28033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm1 28133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm2 28233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg wloop 28333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 28433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm1, xmm0, 0EEh 28533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm1 
28633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm1, xmm0, 01h 28733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm1 28833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd eax, xmm0 28933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 29033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 29133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 29233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 29333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) 29433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define HAS_SUMSQUAREERROR_SSE2 29533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, 29633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int count) { 29733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint32 sse; 29833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp asm volatile ( 29933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pxor %%xmm0,%%xmm0 \n" 30033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pxor %%xmm5,%%xmm5 \n" 30133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "sub %0,%1 \n" 30233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ".p2align 4 \n" 30333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "1: \n" 30433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa (%0),%%xmm1 \n" 30533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa (%0,%1,1),%%xmm2 \n" 30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "lea 0x10(%0),%0 \n" 30733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "sub $0x10,%2 \n" 30833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %%xmm1,%%xmm3 \n" 30933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "psubusb %%xmm2,%%xmm1 \n" 
31033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "psubusb %%xmm3,%%xmm2 \n" 31133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "por %%xmm2,%%xmm1 \n" 31233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movdqa %%xmm1,%%xmm2 \n" 31333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "punpcklbw %%xmm5,%%xmm1 \n" 31433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "punpckhbw %%xmm5,%%xmm2 \n" 31533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pmaddwd %%xmm1,%%xmm1 \n" 31633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pmaddwd %%xmm2,%%xmm2 \n" 31733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm1,%%xmm0 \n" 31833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm2,%%xmm0 \n" 31933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "jg 1b \n" 32033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 32133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pshufd $0xee,%%xmm0,%%xmm1 \n" 32233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm1,%%xmm0 \n" 32333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "pshufd $0x1,%%xmm0,%%xmm1 \n" 32433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "paddd %%xmm1,%%xmm0 \n" 32533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "movd %%xmm0,%3 \n" 32633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 32733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "+r"(src_a), // %0 32833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(src_b), // %1 32933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "+r"(count), // %2 33033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp "=g"(sse) // %3 33133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : 33233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp : "memory", "cc" 33333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__) 33433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp , 
"xmm0", "xmm1", "xmm2", "xmm5" 33533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 33633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ); 33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return sse; 33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, 34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int count) { 34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint32 sse = 0u; 34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp for (int i = 0; i < count; ++i) { 34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int diff = src_a[i] - src_b[i]; 34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sse += static_cast<uint32>(diff * diff); 34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return sse; 34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampuint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, 35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int count) { 35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = 35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp SumSquareError_C; 35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_SUMSQUAREERROR_NEON) 35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (TestCpuFlag(kCpuHasNEON)) { 35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp SumSquareError = 
SumSquareError_NEON; 35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif defined(HAS_SUMSQUAREERROR_SSE2) 36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (TestCpuFlag(kCpuHasSSE2) && 36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { 36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Note only used for multiples of 16 so count is not checked. 36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp SumSquareError = SumSquareError_SSE2; 36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 32K values will fit a 32bit int return value from SumSquareError. 36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // After each block of 32K, accumulate into 64 bit int. 36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int kBlockSize = 1 << 15; // 32768; 37033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint64 sse = 0; 37133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef _OPENMP 37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#pragma omp parallel for reduction(+: sse) 37333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 37433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { 37533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sse += SumSquareError(src_a + i, src_b + i, kBlockSize); 37633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 37733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_a += count & ~(kBlockSize - 1); 37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_b += count & ~(kBlockSize - 1); 37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int remainder = count & (kBlockSize - 1) & 
~15; 38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (remainder) { 38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sse += SumSquareError(src_a, src_b, remainder); 38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_a += remainder; 38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_b += remainder; 38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp remainder = count & 15; 38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (remainder) { 38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sse += SumSquareError_C(src_a, src_b, remainder); 38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return sse; 39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampuint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, 39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_b, int stride_b, 39533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width, int height) { 39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = 39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp SumSquareError_C; 39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_SUMSQUAREERROR_NEON) 39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (TestCpuFlag(kCpuHasNEON)) { 40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp SumSquareError = SumSquareError_NEON; 40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif defined(HAS_SUMSQUAREERROR_SSE2) 
40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && 40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) && 40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) { 40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp SumSquareError = SumSquareError_SSE2; 40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint64 sse = 0; 41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp for (int h = 0; h < height; ++h) { 41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sse += SumSquareError(src_a, src_b, width); 41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_a += stride_a; 41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_b += stride_b; 41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return sse; 41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble SumSquareErrorToPsnr(uint64 sse, uint64 count) { 42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp double psnr; 42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (sse > 0) { 42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp double mse = static_cast<double>(count) / static_cast<double>(sse); 42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psnr = 10.0 * log10(255.0 * 255.0 * mse); 42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
} else { 42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psnr = kMaxPsnr; // Limit to prevent divide by 0 42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 43033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (psnr > kMaxPsnr) 43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psnr = kMaxPsnr; 43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return psnr; 43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble CalcFramePsnr(const uint8* src_a, int stride_a, 43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_b, int stride_b, 43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width, int height) { 44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint64 samples = width * height; 44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, 44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_b, stride_b, 44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp width, height); 44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return SumSquareErrorToPsnr(sse, samples); 44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble I420Psnr(const uint8* src_y_a, int stride_y_a, 44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_u_a, int stride_u_a, 45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_v_a, int stride_v_a, 
45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_y_b, int stride_y_b, 45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_u_b, int stride_u_b, 45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_v_b, int stride_v_b, 45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width, int height) { 45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, 45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_y_b, stride_y_b, 45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp width, height); 45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int width_uv = (width + 1) >> 1; 45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int height_uv = (height + 1) >> 1; 46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, 46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_u_b, stride_u_b, 46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp width_uv, height_uv); 46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, 46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_v_b, stride_v_b, 46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp width_uv, height_uv); 46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint64 samples = width * height + 2 * (width_uv * height_uv); 46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint64 sse = sse_y + sse_u + sse_v; 46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return SumSquareErrorToPsnr(sse, samples); 46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const int64 cc1 = 26634; // 
(64^2*(.01*255)^2 47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const int64 cc2 = 239708; // (64^2*(.03*255)^2 47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic double Ssim8x8_C(const uint8* src_a, int stride_a, 47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_b, int stride_b) { 47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int64 sum_a = 0; 47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int64 sum_b = 0; 47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int64 sum_sq_a = 0; 47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int64 sum_sq_b = 0; 48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int64 sum_axb = 0; 48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp for (int i = 0; i < 8; ++i) { 48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp for (int j = 0; j < 8; ++j) { 48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sum_a += src_a[j]; 48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sum_b += src_b[j]; 48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sum_sq_a += src_a[j] * src_a[j]; 48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sum_sq_b += src_b[j] * src_b[j]; 48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sum_axb += src_a[j] * src_b[j]; 48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_a += stride_a; 49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_b += stride_b; 49333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 49533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int64 count = 64; 
49633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // scale the constants by number of pixels 49733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int64 c1 = (cc1 * count * count) >> 12; 49833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int64 c2 = (cc2 * count * count) >> 12; 49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 50033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int64 sum_a_x_sum_b = sum_a * sum_b; 50133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * 50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); 50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int64 sum_a_sq = sum_a*sum_a; 50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int64 sum_b_sq = sum_b*sum_b; 50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * 50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp (count * sum_sq_a - sum_a_sq + 51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp count * sum_sq_b - sum_b_sq + c2); 51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp if (ssim_d == 0.0) 51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return DBL_MAX; 51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return ssim_n * 1.0 / ssim_d; 51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// We are using a 8x8 moving window with starting location of each 8x8 window 51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// on the 4x4 
pixel grid. Such arrangement allows the windows to overlap 51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// block boundaries to penalize blocking artifacts. 52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble CalcFrameSsim(const uint8* src_a, int stride_a, 52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_b, int stride_b, 52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width, int height) { 52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int samples = 0; 52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp double ssim_total = 0; 52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp double (*Ssim8x8)(const uint8* src_a, int stride_a, 52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_b, int stride_b); 52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp Ssim8x8 = Ssim8x8_C; 53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // sample point start with each 4x4 location 53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp for (int i = 0; i < height - 8; i += 4) { 53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp for (int j = 0; j < width - 8; j += 4) { 53533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); 53633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp samples++; 53733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 53833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 53933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_a += stride_a * 4; 54033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_b += stride_b * 4; 
54133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 54233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 54333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ssim_total /= samples; 54433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return ssim_total; 54533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 54633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 54733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 54833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampdouble I420Ssim(const uint8* src_y_a, int stride_y_a, 54933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_u_a, int stride_u_a, 55033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_v_a, int stride_v_a, 55133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_y_b, int stride_y_b, 55233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_u_b, int stride_u_b, 55333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* src_v_b, int stride_v_b, 55433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width, int height) { 55533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, 55633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_y_b, stride_y_b, width, height); 55733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int width_uv = (width + 1) >> 1; 55833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int height_uv = (height + 1) >> 1; 55933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, 56033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_u_b, stride_u_b, 56133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp width_uv, height_uv); 56233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, 
56333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp src_v_b, stride_v_b, 56433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp width_uv, height_uv); 56533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); 56633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 56733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 56833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus 56933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} // extern "C" 57033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} // namespace libyuv 57133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 572