1/*
2 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/basic_types.h"
12
13#include "libyuv/compare_row.h"
14#include "libyuv/row.h"
15
16#ifdef __cplusplus
17namespace libyuv {
18extern "C" {
19#endif
20
21// This module is for 32 bit Visual C x86 and clangcl
22#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
23
24__declspec(naked) uint32
25    SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
26  __asm {
27    mov        eax, [esp + 4]  // src_a
28    mov        edx, [esp + 8]  // src_b
29    mov        ecx, [esp + 12]  // count
30    pxor       xmm0, xmm0
31    pxor       xmm5, xmm5
32
33  wloop:
34    movdqu     xmm1, [eax]
35    lea        eax,  [eax + 16]
36    movdqu     xmm2, [edx]
37    lea        edx,  [edx + 16]
38    movdqa     xmm3, xmm1  // abs trick
39    psubusb    xmm1, xmm2
40    psubusb    xmm2, xmm3
41    por        xmm1, xmm2
42    movdqa     xmm2, xmm1
43    punpcklbw  xmm1, xmm5
44    punpckhbw  xmm2, xmm5
45    pmaddwd    xmm1, xmm1
46    pmaddwd    xmm2, xmm2
47    paddd      xmm0, xmm1
48    paddd      xmm0, xmm2
49    sub        ecx, 16
50    jg         wloop
51
52    pshufd     xmm1, xmm0, 0xee
53    paddd      xmm0, xmm1
54    pshufd     xmm1, xmm0, 0x01
55    paddd      xmm0, xmm1
56    movd       eax, xmm0
57    ret
58  }
59}
60
61// Visual C 2012 required for AVX2.
62#if _MSC_VER >= 1700
63// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
64#pragma warning(disable : 4752)
65__declspec(naked) uint32
66    SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
67  __asm {
68    mov        eax, [esp + 4]  // src_a
69    mov        edx, [esp + 8]  // src_b
70    mov        ecx, [esp + 12]  // count
71    vpxor      ymm0, ymm0, ymm0  // sum
72    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
73    sub        edx, eax
74
75  wloop:
76    vmovdqu    ymm1, [eax]
77    vmovdqu    ymm2, [eax + edx]
78    lea        eax,  [eax + 32]
79    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
80    vpsubusb   ymm2, ymm2, ymm1
81    vpor       ymm1, ymm2, ymm3
82    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
83    vpunpckhbw ymm1, ymm1, ymm5
84    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
85    vpmaddwd   ymm1, ymm1, ymm1
86    vpaddd     ymm0, ymm0, ymm1
87    vpaddd     ymm0, ymm0, ymm2
88    sub        ecx, 32
89    jg         wloop
90
91    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
92    vpaddd     ymm0, ymm0, ymm1
93    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
94    vpaddd     ymm0, ymm0, ymm1
95    vpermq     ymm1, ymm0, 0x02  // high + low lane.
96    vpaddd     ymm0, ymm0, ymm1
97    vmovd      eax, xmm0
98    vzeroupper
99    ret
100  }
101}
102#endif  // _MSC_VER >= 1700
103
104uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
105uvec32 kHashMul0 = {
106    0x0c3525e1,  // 33 ^ 15
107    0xa3476dc1,  // 33 ^ 14
108    0x3b4039a1,  // 33 ^ 13
109    0x4f5f0981,  // 33 ^ 12
110};
111uvec32 kHashMul1 = {
112    0x30f35d61,  // 33 ^ 11
113    0x855cb541,  // 33 ^ 10
114    0x040a9121,  // 33 ^ 9
115    0x747c7101,  // 33 ^ 8
116};
117uvec32 kHashMul2 = {
118    0xec41d4e1,  // 33 ^ 7
119    0x4cfa3cc1,  // 33 ^ 6
120    0x025528a1,  // 33 ^ 5
121    0x00121881,  // 33 ^ 4
122};
123uvec32 kHashMul3 = {
124    0x00008c61,  // 33 ^ 3
125    0x00000441,  // 33 ^ 2
126    0x00000021,  // 33 ^ 1
127    0x00000001,  // 33 ^ 0
128};
129
130__declspec(naked) uint32
131    HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
132  __asm {
133    mov        eax, [esp + 4]  // src
134    mov        ecx, [esp + 8]  // count
135    movd       xmm0, [esp + 12]  // seed
136
137    pxor       xmm7, xmm7  // constant 0 for unpck
138    movdqa     xmm6, xmmword ptr kHash16x33
139
140  wloop:
141    movdqu     xmm1, [eax]  // src[0-15]
142    lea        eax, [eax + 16]
143    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
144    movdqa     xmm5, xmmword ptr kHashMul0
145    movdqa     xmm2, xmm1
146    punpcklbw  xmm2, xmm7  // src[0-7]
147    movdqa     xmm3, xmm2
148    punpcklwd  xmm3, xmm7  // src[0-3]
149    pmulld     xmm3, xmm5
150    movdqa     xmm5, xmmword ptr kHashMul1
151    movdqa     xmm4, xmm2
152    punpckhwd  xmm4, xmm7  // src[4-7]
153    pmulld     xmm4, xmm5
154    movdqa     xmm5, xmmword ptr kHashMul2
155    punpckhbw  xmm1, xmm7  // src[8-15]
156    movdqa     xmm2, xmm1
157    punpcklwd  xmm2, xmm7  // src[8-11]
158    pmulld     xmm2, xmm5
159    movdqa     xmm5, xmmword ptr kHashMul3
160    punpckhwd  xmm1, xmm7  // src[12-15]
161    pmulld     xmm1, xmm5
162    paddd      xmm3, xmm4  // add 16 results
163    paddd      xmm1, xmm2
164    paddd      xmm1, xmm3
165
166    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
167    paddd      xmm1, xmm2
168    pshufd     xmm2, xmm1, 0x01
169    paddd      xmm1, xmm2
170    paddd      xmm0, xmm1
171    sub        ecx, 16
172    jg         wloop
173
174    movd       eax, xmm0  // return hash
175    ret
176  }
177}
178
179// Visual C 2012 required for AVX2.
180#if _MSC_VER >= 1700
181__declspec(naked) uint32
182    HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
183  __asm {
184    mov        eax, [esp + 4]  // src
185    mov        ecx, [esp + 8]  // count
186    vmovd      xmm0, [esp + 12]  // seed
187
188  wloop:
189    vpmovzxbd  xmm3, [eax]  // src[0-3]
190    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
191    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
192    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
193    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
194    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
195    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
196    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
197    lea        eax, [eax + 16]
198    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
199    vpaddd     xmm3, xmm3, xmm4  // add 16 results
200    vpaddd     xmm1, xmm1, xmm2
201    vpaddd     xmm1, xmm1, xmm3
202    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
203    vpaddd     xmm1, xmm1,xmm2
204    vpshufd    xmm2, xmm1, 0x01
205    vpaddd     xmm1, xmm1, xmm2
206    vpaddd     xmm0, xmm0, xmm1
207    sub        ecx, 16
208    jg         wloop
209
210    vmovd      eax, xmm0  // return hash
211    vzeroupper
212    ret
213  }
214}
215#endif  // _MSC_VER >= 1700
216
217#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
218
219#ifdef __cplusplus
220}  // extern "C"
221}  // namespace libyuv
222#endif
223