/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ---- includes ----------------------------------------------------------- */

#include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */

#if ( defined( WIN64 ) || defined( HW_SSE2 ) )

#include <emmintrin.h>

/* disable warning "local variable 'x' used without having been initialized" */
#pragma warning( disable : 4700 )
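
/* the SSE registers below are deliberately cleared by XOR-ing them with
   themselves before first use, which the compiler would otherwise report
   as a use of an uninitialized variable */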


/** Using half register (64-bit) in SSE2 to calculate dot product.
 *  This is an SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
 *  Dependencies: input vectors need to be 16-bit aligned
 *  Return Value: int32 containing the result of the dot product
 */
int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;

	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize registers to 0 */
	m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
	m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;
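	/* sizeA now counts blocks of 16 int16 elements; alignOffSetL holds the
	   0..15 leftover elements handled by the fall-through switch at the end
	   of the function */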

	if( sizeA )
	{
		while( sizeA > 0 )
		{
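			/* m_XMM4 and m_XMM6 still hold the madd results of the previous
			   iteration (zero on the first pass); folding them into the
			   accumulator here overlaps the additions with the loads below
			   (software pipelining) */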
			m_XMM0 = _mm_loadl_epi64( (__m128i *)&0[vec1L] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

			m_XMM1 = _mm_loadl_epi64( (__m128i *)&0[vec2L] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

			m_XMM2 = _mm_loadl_epi64( (__m128i *)&4[vec1L] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 );

			m_XMM3 = _mm_loadl_epi64( (__m128i *)&4[vec2L] );
			m_XMM4 = _mm_loadl_epi64( (__m128i *)&8[vec1L] );

			m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 );

			m_XMM5 = _mm_loadl_epi64( (__m128i *)&8[vec2L] );

			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

			m_XMM6 = _mm_loadl_epi64( (__m128i *)&12[vec1L] );

			m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 );

			m_XMM8 = _mm_loadl_epi64( (__m128i *)&12[vec2L] );
			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 );

			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

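		/* only the two low 32-bit lanes of m_XMM7 carry data, because the
		   64-bit loads above leave the upper half of every register zero;
		   fold lane 1 into lane 0 and extract the scalar sum */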
		m_XMM0 = _mm_loadl_epi64( (__m128i *)&m_XMM7 );

		m_XMM0 = _mm_srli_epi64( m_XMM0, 32 );

		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

		resultL = _mm_cvtsi128_si32( m_XMM7 );
	}

	/* handle the remaining elements; the cases intentionally fall through,
	   and a switch statement produces faster code than a loop here */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}
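
/* Illustrative usage only (not part of the original file): vec1A and vec2A
 * are ordinary int16 arrays and sizeA is the element count, e.g.
 *
 *   int16 aL[ 100 ];
 *   int16 bL[ 100 ];
 *   int32 dotL;
 *   ...fill aL and bL...
 *   dotL = bbs_dotProduct_64SSE2( aL, bL, 100 );
 */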

/* ------------------------------------------------------------------------- */

/** Using full register (128-bit) in SSE2 to calculate dot product.
 *  Dependencies: input vectors need to be 16-byte aligned
 *  Return Value: int32 containing the dot product
 */
int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;

	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		while( sizeA > 0 )
		{
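			/* same software-pipelined pattern as bbs_dotProduct_64SSE2: the
			   previous iteration's madd result in m_XMM6 is folded into the
			   accumulator while the current loads are issued */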
			m_XMM0 = _mm_load_si128( (__m128i *)&0[vec1L] );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

			m_XMM2 = _mm_load_si128( (__m128i *)&0[vec2L] );

			m_XMM6 = _mm_load_si128( (__m128i *)&8[vec1L] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

			m_XMM3 = _mm_load_si128( (__m128i *)&8[vec2L] );

			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

		m_XMM0 = _mm_load_si128( (__m128i *)&m_XMM5 );

		resultL = _mm_cvtsi128_si32( m_XMM0 );	/* 1st 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 2nd 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 3rd 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 4th 32bits */
	}

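	/* handle the remaining 0..15 elements; the cases intentionally fall through */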
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}

/* ------------------------------------------------------------------------- */


/** Using full register (128-bit) in SSE2 to calculate dot product (unaligned version).
 *  Dependencies: memory does not need to be 16-byte aligned
 *  Return Value: int32 containing the dot product
 */
int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;
	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize registers to 0 */
	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );


	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		while( sizeA > 0 )
		{
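			/* identical to bbs_dotProduct_128SSE2 except that the unaligned
			   load _mm_loadu_si128 is used instead of _mm_load_si128 */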
			m_XMM0 = _mm_loadu_si128( (__m128i *)&0[vec1L] );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

			m_XMM2 = _mm_loadu_si128( (__m128i *)&0[vec2L] );

			m_XMM6 = _mm_loadu_si128( (__m128i *)&8[vec1L] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

			m_XMM3 = _mm_loadu_si128( (__m128i *)&8[vec2L] );

			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

		m_XMM0 = _mm_loadu_si128( (__m128i *)&m_XMM5 );

		resultL = _mm_cvtsi128_si32( m_XMM0 );	/* 1st 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 2nd 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 3rd 32bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );	/* 4th 32bits */
	}


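	/* process the remaining 0..15 elements; fall-through is intentional */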
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}

/* ------------------------------------------------------------------------- */

#endif /* defined( WIN64 ) || defined( HW_SSE2 ) */
