1/* 2 * Copyright 2012 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/basic_types.h" 12#include "libyuv/row.h" 13 14#ifdef __cplusplus 15namespace libyuv { 16extern "C" { 17#endif 18 19#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 20 21__declspec(naked) __declspec(align(16)) 22uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { 23 __asm { 24 mov eax, [esp + 4] // src_a 25 mov edx, [esp + 8] // src_b 26 mov ecx, [esp + 12] // count 27 pxor xmm0, xmm0 28 pxor xmm5, xmm5 29 30 align 4 31 wloop: 32 movdqa xmm1, [eax] 33 lea eax, [eax + 16] 34 movdqa xmm2, [edx] 35 lea edx, [edx + 16] 36 sub ecx, 16 37 movdqa xmm3, xmm1 // abs trick 38 psubusb xmm1, xmm2 39 psubusb xmm2, xmm3 40 por xmm1, xmm2 41 movdqa xmm2, xmm1 42 punpcklbw xmm1, xmm5 43 punpckhbw xmm2, xmm5 44 pmaddwd xmm1, xmm1 45 pmaddwd xmm2, xmm2 46 paddd xmm0, xmm1 47 paddd xmm0, xmm2 48 jg wloop 49 50 pshufd xmm1, xmm0, 0xee 51 paddd xmm0, xmm1 52 pshufd xmm1, xmm0, 0x01 53 paddd xmm0, xmm1 54 movd eax, xmm0 55 ret 56 } 57} 58 59// Visual C 2012 required for AVX2. 60#if _MSC_VER >= 1700 61// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 62#pragma warning(disable: 4752) 63__declspec(naked) __declspec(align(16)) 64uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { 65 __asm { 66 mov eax, [esp + 4] // src_a 67 mov edx, [esp + 8] // src_b 68 mov ecx, [esp + 12] // count 69 vpxor ymm0, ymm0, ymm0 // sum 70 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck 71 sub edx, eax 72 73 align 4 74 wloop: 75 vmovdqu ymm1, [eax] 76 vmovdqu ymm2, [eax + edx] 77 lea eax, [eax + 32] 78 sub ecx, 32 79 vpsubusb ymm3, ymm1, ymm2 // abs difference trick 80 vpsubusb ymm2, ymm2, ymm1 81 vpor ymm1, ymm2, ymm3 82 vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. 83 vpunpckhbw ymm1, ymm1, ymm5 84 vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. 85 vpmaddwd ymm1, ymm1, ymm1 86 vpaddd ymm0, ymm0, ymm1 87 vpaddd ymm0, ymm0, ymm2 88 jg wloop 89 90 vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. 91 vpaddd ymm0, ymm0, ymm1 92 vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. 93 vpaddd ymm0, ymm0, ymm1 94 vpermq ymm1, ymm0, 0x02 // high + low lane. 95 vpaddd ymm0, ymm0, ymm1 96 vmovd eax, xmm0 97 vzeroupper 98 ret 99 } 100} 101#endif // _MSC_VER >= 1700 102 103#define HAS_HASHDJB2_SSE41 104static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 105static uvec32 kHashMul0 = { 106 0x0c3525e1, // 33 ^ 15 107 0xa3476dc1, // 33 ^ 14 108 0x3b4039a1, // 33 ^ 13 109 0x4f5f0981, // 33 ^ 12 110}; 111static uvec32 kHashMul1 = { 112 0x30f35d61, // 33 ^ 11 113 0x855cb541, // 33 ^ 10 114 0x040a9121, // 33 ^ 9 115 0x747c7101, // 33 ^ 8 116}; 117static uvec32 kHashMul2 = { 118 0xec41d4e1, // 33 ^ 7 119 0x4cfa3cc1, // 33 ^ 6 120 0x025528a1, // 33 ^ 5 121 0x00121881, // 33 ^ 4 122}; 123static uvec32 kHashMul3 = { 124 0x00008c61, // 33 ^ 3 125 0x00000441, // 33 ^ 2 126 0x00000021, // 33 ^ 1 127 0x00000001, // 33 ^ 0 128}; 129 130// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 131// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 132// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 133// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 134// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 135#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ 136 _asm _emit 0x40 _asm _emit reg 137 138__declspec(naked) __declspec(align(16)) 139uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { 140 __asm { 141 mov eax, [esp + 4] // src 142 mov ecx, [esp + 8] // count 143 movd xmm0, [esp + 12] // seed 144 145 pxor xmm7, xmm7 // constant 0 for unpck 146 movdqa xmm6, kHash16x33 147 148 align 4 149 wloop: 150 movdqu xmm1, [eax] // src[0-15] 151 lea eax, [eax + 16] 152 pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 153 movdqa xmm5, kHashMul0 154 movdqa xmm2, xmm1 155 punpcklbw xmm2, xmm7 // src[0-7] 156 movdqa xmm3, xmm2 157 punpcklwd xmm3, xmm7 // src[0-3] 158 pmulld(0xdd) // pmulld xmm3, xmm5 159 movdqa xmm5, kHashMul1 160 movdqa xmm4, xmm2 161 punpckhwd xmm4, xmm7 // src[4-7] 162 pmulld(0xe5) // pmulld xmm4, xmm5 163 movdqa xmm5, kHashMul2 164 punpckhbw xmm1, xmm7 // src[8-15] 165 movdqa xmm2, xmm1 166 punpcklwd xmm2, xmm7 // src[8-11] 167 pmulld(0xd5) // pmulld xmm2, xmm5 168 movdqa xmm5, kHashMul3 169 punpckhwd xmm1, xmm7 // src[12-15] 170 pmulld(0xcd) // pmulld xmm1, xmm5 171 paddd xmm3, xmm4 // add 16 results 172 paddd xmm1, xmm2 173 sub ecx, 16 174 paddd xmm1, xmm3 175 176 pshufd xmm2, xmm1, 0x0e // upper 2 dwords 177 paddd xmm1, xmm2 178 pshufd xmm2, xmm1, 0x01 179 paddd xmm1, xmm2 180 paddd xmm0, xmm1 181 jg wloop 182 183 movd eax, xmm0 // return hash 184 ret 185 } 186} 187 188// Visual C 2012 required for AVX2. 189#if _MSC_VER >= 1700 190__declspec(naked) __declspec(align(16)) 191uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { 192 __asm { 193 mov eax, [esp + 4] // src 194 mov ecx, [esp + 8] // count 195 movd xmm0, [esp + 12] // seed 196 movdqa xmm6, kHash16x33 197 198 align 4 199 wloop: 200 vpmovzxbd xmm3, dword ptr [eax] // src[0-3] 201 pmulld xmm0, xmm6 // hash *= 33 ^ 16 202 vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] 203 pmulld xmm3, kHashMul0 204 vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] 205 pmulld xmm4, kHashMul1 206 vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] 207 pmulld xmm2, kHashMul2 208 lea eax, [eax + 16] 209 pmulld xmm1, kHashMul3 210 paddd xmm3, xmm4 // add 16 results 211 paddd xmm1, xmm2 212 sub ecx, 16 213 paddd xmm1, xmm3 214 pshufd xmm2, xmm1, 0x0e // upper 2 dwords 215 paddd xmm1, xmm2 216 pshufd xmm2, xmm1, 0x01 217 paddd xmm1, xmm2 218 paddd xmm0, xmm1 219 jg wloop 220 221 movd eax, xmm0 // return hash 222 ret 223 } 224} 225#endif // _MSC_VER >= 1700 226 227#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) 228 229#ifdef __cplusplus 230} // extern "C" 231} // namespace libyuv 232#endif 233