/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ---- includes ----------------------------------------------------------- */

#include "b_BasicEm/Basic.h" /* to disable some warnings in VC++ */

#if ( defined( WIN64 ) || defined( HW_SSE2 ) )

#include "emmintrin.h"

/* disable warning "local variable 'x' used without having been initialized" */
#pragma warning( disable : 4700 )


/** Uses half registers (64-bit loads) in SSE2 to calculate the dot product.
 * This is an SSE2 reimplementation of bbs_dotProduct_intelMMX16 in Math.c.
 * Dependencies: input vectors need to be 16-bit aligned; the 64-bit loads used here do not require 16-byte alignment
 * Return Value: int32 containing the result of the dot product
 */
int32 bbs_dotProduct_64SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM1, m_XMM2, m_XMM3, m_XMM4, m_XMM5, m_XMM6, m_XMM7, m_XMM8;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;

	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize registers to 0 */
	m_XMM4 = _mm_xor_si128( m_XMM4, m_XMM4 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );
	m_XMM7 = _mm_xor_si128( m_XMM7, m_XMM7 );

	/* number of leftover elements (< 16) handled by the scalar tail below */
	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* process 16 int16 elements per iteration; the madd results kept in
		   m_XMM4 and m_XMM6 are folded into the accumulator m_XMM7 at the top
		   of the next iteration (they are zero on the first pass) */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_loadl_epi64( ( __m128i* )&vec1L[ 0 ] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

			m_XMM1 = _mm_loadl_epi64( ( __m128i* )&vec2L[ 0 ] );
			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

			m_XMM2 = _mm_loadl_epi64( ( __m128i* )&vec1L[ 4 ] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM1 );

			m_XMM3 = _mm_loadl_epi64( ( __m128i* )&vec2L[ 4 ] );
			m_XMM4 = _mm_loadl_epi64( ( __m128i* )&vec1L[ 8 ] );

			m_XMM2 = _mm_madd_epi16( m_XMM2, m_XMM3 );

			m_XMM5 = _mm_loadl_epi64( ( __m128i* )&vec2L[ 8 ] );

			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

			m_XMM6 = _mm_loadl_epi64( ( __m128i* )&vec1L[ 12 ] );

			m_XMM4 = _mm_madd_epi16( m_XMM4, m_XMM5 );

			m_XMM8 = _mm_loadl_epi64( ( __m128i* )&vec2L[ 12 ] );
			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM8 );

			m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM2 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators */
		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM4 );

		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM6 );

		/* the low 64 bits of m_XMM7 hold two 32-bit partial sums; add them together */
		m_XMM0 = _mm_loadl_epi64( ( __m128i* )&m_XMM7 );

		m_XMM0 = _mm_srli_epi64( m_XMM0, 32 );

		m_XMM7 = _mm_add_epi32( m_XMM7, m_XMM0 );

		resultL = _mm_cvtsi128_si32( m_XMM7 );
	}

	/* a switch statement produces faster code than a loop; cases fall through intentionally */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}

/* ------------------------------------------------------------------------- */
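/* The SSE2 routines in this file all compute the same quantity: the sum of
 * element-wise products of two int16 vectors, accumulated in 32-bit integers
 * (pmaddwd widens each product before adding). The block below is a minimal
 * scalar reference sketch of that computation, kept inert with #if 0; it is
 * not part of the library, and the name bbs_dotProduct_scalarRef is
 * hypothetical. It assumes the b_BasicEm fixed-width types int16/int32/uint32
 * declared via Basic.h.
 */
#if 0
int32 bbs_dotProduct_scalarRef( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	int32 resultL = 0;
	uint32 iL;
	for( iL = 0; iL < sizeA; iL++ )
	{
		/* widen each product to 32 bits before accumulating */
		resultL += ( int32 )vec1A[ iL ] * vec2A[ iL ];
	}
	return resultL;
}
#endif

/* ------------------------------------------------------------------------- */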
/** Uses the full register width (128-bit loads) in SSE2 to calculate the dot product.
 * Dependencies: input vectors must be 16-byte aligned (required by _mm_load_si128)
 * Return Value: int32 containing the dot product
 */
int32 bbs_dotProduct_128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;

	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize accumulators to 0 */
	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	/* number of leftover elements (< 16) handled by the scalar tail below */
	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* process 16 int16 elements per iteration; the madd result kept in
		   m_XMM6 is folded into the accumulator m_XMM5 at the top of the next
		   iteration (it is zero on the first pass) */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_load_si128( ( __m128i* )&vec1L[ 0 ] );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

			m_XMM2 = _mm_load_si128( ( __m128i* )&vec2L[ 0 ] );

			m_XMM6 = _mm_load_si128( ( __m128i* )&vec1L[ 8 ] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

			m_XMM3 = _mm_load_si128( ( __m128i* )&vec2L[ 8 ] );

			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators: add the four 32-bit lanes of m_XMM5 */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

		m_XMM0 = _mm_load_si128( ( __m128i* )&m_XMM5 );

		resultL = _mm_cvtsi128_si32( m_XMM0 );		/* 1st 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );		/* 2nd 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );		/* 3rd 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );		/* 4th 32 bits */
	}

	/* scalar tail; cases fall through intentionally */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}

/* ------------------------------------------------------------------------- */
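/* A minimal usage sketch for the aligned variant above, kept inert with #if 0.
 * The 16-byte alignment required by _mm_load_si128 is obtained here with the
 * MSVC-specific __declspec( align( 16 ) ) attribute; the buffers, values and
 * the function name exampleL are illustrative assumptions, not part of the
 * library.
 */
#if 0
static int32 exampleL( void )
{
	static __declspec( align( 16 ) ) const int16 aL[ 32 ] = { 1, 2, 3 };	/* remaining elements are 0 */
	static __declspec( align( 16 ) ) const int16 bL[ 32 ] = { 4, 5, 6 };

	/* 32 elements: two 16-element SIMD iterations, no scalar tail */
	return bbs_dotProduct_128SSE2( aL, bL, 32 );	/* 1*4 + 2*5 + 3*6 = 32 */
}
#endif

/* ------------------------------------------------------------------------- */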
/** Uses the full register width (128-bit loads) in SSE2 to calculate the dot product (non-aligned version).
 * Dependencies: memory does not need to be 16-byte aligned
 * Return Value: int32 containing the dot product
 */
int32 bbs_dotProduct_u128SSE2( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	__m128i m_XMM0, m_XMM2, m_XMM3, m_XMM5, m_XMM6;
	int16* vec1L = ( int16* )vec1A;
	int16* vec2L = ( int16* )vec2A;
	int32 resultL = 0;
	uint32 alignOffSetL = 0;

	/* initialize accumulators to 0 */
	m_XMM5 = _mm_xor_si128( m_XMM5, m_XMM5 );
	m_XMM6 = _mm_xor_si128( m_XMM6, m_XMM6 );

	/* number of leftover elements (< 16) handled by the scalar tail below */
	alignOffSetL = sizeA % 16;
	sizeA >>= 4;

	if( sizeA )
	{
		/* same loop as bbs_dotProduct_128SSE2, but with unaligned loads */
		while( sizeA > 0 )
		{
			m_XMM0 = _mm_loadu_si128( ( __m128i* )&vec1L[ 0 ] );
			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

			m_XMM2 = _mm_loadu_si128( ( __m128i* )&vec2L[ 0 ] );

			m_XMM6 = _mm_loadu_si128( ( __m128i* )&vec1L[ 8 ] );

			m_XMM0 = _mm_madd_epi16( m_XMM0, m_XMM2 );

			m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM0 );

			m_XMM3 = _mm_loadu_si128( ( __m128i* )&vec2L[ 8 ] );

			m_XMM6 = _mm_madd_epi16( m_XMM6, m_XMM3 );

			vec1L += 16;
			vec2L += 16;
			sizeA--;
		}

		/* sum up accumulators: add the four 32-bit lanes of m_XMM5 */
		m_XMM5 = _mm_add_epi32( m_XMM5, m_XMM6 );

		m_XMM0 = _mm_loadu_si128( ( __m128i* )&m_XMM5 );

		resultL = _mm_cvtsi128_si32( m_XMM0 );		/* 1st 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );		/* 2nd 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );		/* 3rd 32 bits */

		m_XMM0 = _mm_srli_si128( m_XMM0, 4 );

		resultL += _mm_cvtsi128_si32( m_XMM0 );		/* 4th 32 bits */
	}

	/* scalar tail; cases fall through intentionally */
	switch( alignOffSetL )
	{
		case 15:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 14:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 13:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 12:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 11:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 10:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 9:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 8:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 7:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 6:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 5:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 4:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 3:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 2:
			resultL += ( int32 )*vec1L++ * *vec2L++;
		case 1:
			resultL += ( int32 )*vec1L++ * *vec2L++;
	}

	return resultL;
}

/* ------------------------------------------------------------------------- */

#endif /* defined( WIN64 ) || defined( HW_SSE2 ) */
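/* Selection between the aligned and unaligned variants above is up to the
 * caller. The sketch below, kept inert with #if 0, illustrates one way to
 * choose at run time by testing the low four address bits of both input
 * pointers; it is an illustrative assumption, not the dispatch actually used
 * by the library (see bbs_dotProduct_intelMMX16 and its callers in Math.c).
 */
#if 0
static int32 bbs_dotProduct_dispatchL( const int16* vec1A, const int16* vec2A, uint32 sizeA )
{
	/* uintptr_t from <stdint.h> */
	if( ( ( ( uintptr_t )vec1A | ( uintptr_t )vec2A ) & 0x0F ) == 0 )
	{
		/* both pointers are 16-byte aligned: the aligned loads of _mm_load_si128 are safe */
		return bbs_dotProduct_128SSE2( vec1A, vec2A, sizeA );
	}
	/* otherwise fall back to the unaligned-load version */
	return bbs_dotProduct_u128SSE2( vec1A, vec2A, sizeA );
}
#endif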