1/*
2 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/basic_types.h"
12#include "libyuv/row.h"
13
14#ifdef __cplusplus
15namespace libyuv {
16extern "C" {
17#endif
18
19#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
20
21__declspec(naked) __declspec(align(16))
22uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
23  __asm {
24    mov        eax, [esp + 4]    // src_a
25    mov        edx, [esp + 8]    // src_b
26    mov        ecx, [esp + 12]   // count
27    pxor       xmm0, xmm0
28    pxor       xmm5, xmm5
29
30    align      4
31  wloop:
32    movdqa     xmm1, [eax]
33    lea        eax,  [eax + 16]
34    movdqa     xmm2, [edx]
35    lea        edx,  [edx + 16]
36    sub        ecx, 16
37    movdqa     xmm3, xmm1  // abs trick
38    psubusb    xmm1, xmm2
39    psubusb    xmm2, xmm3
40    por        xmm1, xmm2
41    movdqa     xmm2, xmm1
42    punpcklbw  xmm1, xmm5
43    punpckhbw  xmm2, xmm5
44    pmaddwd    xmm1, xmm1
45    pmaddwd    xmm2, xmm2
46    paddd      xmm0, xmm1
47    paddd      xmm0, xmm2
48    jg         wloop
49
50    pshufd     xmm1, xmm0, 0xee
51    paddd      xmm0, xmm1
52    pshufd     xmm1, xmm0, 0x01
53    paddd      xmm0, xmm1
54    movd       eax, xmm0
55    ret
56  }
57}
58
59// Visual C 2012 required for AVX2.
60#if _MSC_VER >= 1700
61// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
62#pragma warning(disable: 4752)
63__declspec(naked) __declspec(align(16))
64uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
65  __asm {
66    mov        eax, [esp + 4]    // src_a
67    mov        edx, [esp + 8]    // src_b
68    mov        ecx, [esp + 12]   // count
69    vpxor      ymm0, ymm0, ymm0  // sum
70    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
71    sub        edx, eax
72
73    align      4
74  wloop:
75    vmovdqu    ymm1, [eax]
76    vmovdqu    ymm2, [eax + edx]
77    lea        eax,  [eax + 32]
78    sub        ecx, 32
79    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
80    vpsubusb   ymm2, ymm2, ymm1
81    vpor       ymm1, ymm2, ymm3
82    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
83    vpunpckhbw ymm1, ymm1, ymm5
84    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
85    vpmaddwd   ymm1, ymm1, ymm1
86    vpaddd     ymm0, ymm0, ymm1
87    vpaddd     ymm0, ymm0, ymm2
88    jg         wloop
89
90    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
91    vpaddd     ymm0, ymm0, ymm1
92    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
93    vpaddd     ymm0, ymm0, ymm1
94    vpermq     ymm1, ymm0, 0x02  // high + low lane.
95    vpaddd     ymm0, ymm0, ymm1
96    vmovd      eax, xmm0
97    vzeroupper
98    ret
99  }
100}
101#endif  // _MSC_VER >= 1700
102
// Feature macro for the SSE4.1 hash routine defined below.
// NOTE(review): presumably tested by callers to select HashDjb2_SSE41 -
// confirm against the dispatch code.
#define HAS_HASHDJB2_SSE41

// Multiplier tables for the djb2 hash routines below, which fold 16 source
// bytes into the hash per loop iteration:
//   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15]
// Every entry is a power of 33 truncated to 32 bits (i.e. modulo 2^32).
// The tables are read by the inline assembly with movdqa / pmulld memory
// operands. NOTE(review): that assumes uvec32 is 16-byte aligned - confirm
// against its declaration.

// Scales the running hash. Only lane 0 is non-zero, so the multiply also
// clears lanes 1-3 of the hash register on every iteration.
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
// Weights for src[0-3].
static uvec32 kHashMul0 = {
  0x0c3525e1,  // 33 ^ 15
  0xa3476dc1,  // 33 ^ 14
  0x3b4039a1,  // 33 ^ 13
  0x4f5f0981,  // 33 ^ 12
};
// Weights for src[4-7].
static uvec32 kHashMul1 = {
  0x30f35d61,  // 33 ^ 11
  0x855cb541,  // 33 ^ 10
  0x040a9121,  // 33 ^ 9
  0x747c7101,  // 33 ^ 8
};
// Weights for src[8-11].
static uvec32 kHashMul2 = {
  0xec41d4e1,  // 33 ^ 7
  0x4cfa3cc1,  // 33 ^ 6
  0x025528a1,  // 33 ^ 5
  0x00121881,  // 33 ^ 4
};
// Weights for src[12-15].
static uvec32 kHashMul3 = {
  0x00008c61,  // 33 ^ 3
  0x00000441,  // 33 ^ 2
  0x00000021,  // 33 ^ 1
  0x00000001,  // 33 ^ 0
};
129
// pmulld (SSE4.1 packed 32-bit multiply, low result) is emitted byte by byte
// with _emit rather than written as a mnemonic.
// NOTE(review): presumably because some supported MSVC inline assemblers do
// not recognize the mnemonic - confirm.
// The encoding is 66 0F 38 40 followed by a ModRM byte, which is the macro
// argument. For the register-to-register form used here it is
//   0xC0 | (destination xmm number << 3) | source xmm number
// Reference encodings for the five uses in HashDjb2_SSE41:
// 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
// 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
// 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
// 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
// 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
    _asm _emit 0x40 _asm _emit reg
137
// Computes a djb2-style hash (hash = hash * 33 + byte) over src, 16 bytes per
// loop iteration, starting from seed (SSE4.1).
//
// src:   input bytes; read with an unaligned load.
// count: number of bytes to hash. The loop body runs once before count is
//        tested and consumes 16 bytes per pass, so count is expected to be a
//        positive multiple of 16; any other value reads past count bytes.
// seed:  initial hash value.
// Returns the 32-bit hash in eax.
//
// Naked cdecl function: no prologue is generated, so the arguments are read
// directly from the stack. The register numbers are baked into the pmulld()
// ModRM bytes, so the register allocation below must not be changed without
// updating those bytes.
__declspec(naked) __declspec(align(16))
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov        eax, [esp + 4]    // src
    mov        ecx, [esp + 8]    // count
    movd       xmm0, [esp + 12]  // seed

    pxor       xmm7, xmm7        // constant 0 for unpck
    movdqa     xmm6, kHash16x33

    align      4
  wloop:
    movdqu     xmm1, [eax]       // src[0-15]
    lea        eax, [eax + 16]
    pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
    movdqa     xmm5, kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7        // src[0-7]
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7        // src[0-3]
    pmulld(0xdd)                 // pmulld     xmm3, xmm5
    movdqa     xmm5, kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7        // src[4-7]
    pmulld(0xe5)                 // pmulld     xmm4, xmm5
    movdqa     xmm5, kHashMul2
    punpckhbw  xmm1, xmm7        // src[8-15]
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7        // src[8-11]
    pmulld(0xd5)                 // pmulld     xmm2, xmm5
    movdqa     xmm5, kHashMul3
    punpckhwd  xmm1, xmm7        // src[12-15]
    pmulld(0xcd)                 // pmulld     xmm1, xmm5
    paddd      xmm3, xmm4        // add 16 results
    paddd      xmm1, xmm2
    // Sets the flags tested by jg below; the SSE instructions in between do
    // not modify EFLAGS.
    sub        ecx, 16
    paddd      xmm1, xmm3

    // Horizontal add of the four partial sums. Only lane 0 ends up holding
    // the complete sum; lanes 1-3 are don't-care because the next pmulld by
    // kHash16x33 zeroes them and only lane 0 is returned.
    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1
    jg         wloop

    movd       eax, xmm0         // return hash
    ret
  }
}
187
188// Visual C 2012 required for AVX2.
189#if _MSC_VER >= 1700
190__declspec(naked) __declspec(align(16))
191uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
192  __asm {
193    mov        eax, [esp + 4]    // src
194    mov        ecx, [esp + 8]    // count
195    movd       xmm0, [esp + 12]  // seed
196    movdqa     xmm6, kHash16x33
197
198    align      4
199  wloop:
200    vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
201    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
202    vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
203    pmulld     xmm3, kHashMul0
204    vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
205    pmulld     xmm4, kHashMul1
206    vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
207    pmulld     xmm2, kHashMul2
208    lea        eax, [eax + 16]
209    pmulld     xmm1, kHashMul3
210    paddd      xmm3, xmm4        // add 16 results
211    paddd      xmm1, xmm2
212    sub        ecx, 16
213    paddd      xmm1, xmm3
214    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
215    paddd      xmm1, xmm2
216    pshufd     xmm2, xmm1, 0x01
217    paddd      xmm1, xmm2
218    paddd      xmm0, xmm1
219    jg         wloop
220
221    movd       eax, xmm0         // return hash
222    ret
223  }
224}
225#endif  // _MSC_VER >= 1700
226
227#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
228
229#ifdef __cplusplus
230}  // extern "C"
231}  // namespace libyuv
232#endif
233