1// This file is part of the ustl library, an STL implementation. 2// 3// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net> 4// This file is free software, distributed under the MIT License. 5// 6/// \file simd.h 7/// \brief SIMD-type algorithms, with hardware acceleration, if available. 8/// 9/// All algorithms are container-based because iterator syntax is just too 10/// damn verbose and because the specializations need to be able to tell 11/// how many elements are in the container in order to choose proper SIMD 12/// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!) 13/// Specializations are only for the tuple template because the container 14/// must be of a fixed and compile-time-known size for the compiler to be 15/// able to choose the specialization. 16/// 17 18#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9 19#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9 20 21#include "uassert.h" 22#include "ulimits.h" 23#if HAVE_MATH_H 24 #include <math.h> 25#endif 26 27#if PLATFORM_ANDROID 28#include <stdio.h> 29#undef CPU_HAS_MMX 30#endif 31 32namespace ustl { 33namespace simd { 34 35//---------------------------------------------------------------------- 36// Generic algorithms 37//---------------------------------------------------------------------- 38 39/// Applies \p op to each element in \p op1. 40template <typename Ctr, typename UnaryOperation> 41inline void packop (Ctr& op1, UnaryOperation op) 42{ 43 foreach (typename Ctr::iterator, i, op1) 44 op (*i); 45} 46 47/// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2. 48template <typename Ctr, typename BinaryOperation> 49inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op) 50{ 51 assert (op2.size() <= op1.size()); 52 typename Ctr::const_iterator i1 (op1.begin()); 53 typename Ctr::iterator i2 (op2.begin()); 54 for (; i2 != op2.end(); ++i1, ++i2) 55 *i2 = op (*i2, *i1); 56} 57 58/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result. 59template <typename Ctr, typename BinaryOperation> 60inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op) 61{ 62 assert (op1.size() <= op2.size() && op1.size() <= result.size()); 63 passign (op1, result); 64 packop (op2, result); 65} 66 67/// Copies \p op1 into \p result. 68template <typename Ctr> 69inline void passign (const Ctr& op1, Ctr& result) 70{ 71 assert (op1.size() <= result.size()); 72 typename Ctr::iterator d (result.begin()); 73 foreach (typename Ctr::const_iterator, s, op1) 74 *d++ = *s; 75} 76 77/// Copies \p result.size() elements from \p op1 to \p result. 78template <typename Ctr> 79inline void ipassign (typename Ctr::const_iterator op1, Ctr& result) 80{ 81 foreach (typename Ctr::iterator, d, result) 82 *d = *op1++; 83} 84 85template <typename Ctr1, typename Ctr2, typename ConvertFunction> 86inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f) 87{ 88 assert (op1.size() <= op2.size()); 89 typename Ctr1::const_iterator i1 (op1.begin()); 90 typename Ctr2::iterator i2 (op2.begin()); 91 for (; i1 != op1.end(); ++i1, ++i2) 92 *i2 = f (*i1); 93} 94 95// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc. 96STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b)) 97STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b)) 98STD_BINARY_FUNCTOR (fpshl, T, (a << b)) 99STD_BINARY_FUNCTOR (fpshr, T, (a >> b)) 100STD_BINARY_FUNCTOR (fpmin, T, (min (a, b))) 101STD_BINARY_FUNCTOR (fpmax, T, (max (a, b))) 102STD_BINARY_FUNCTOR (fpavg, T, ((a + b + 1) / 2)) 103STD_CONVERSION_FUNCTOR (fcast, (D(a))) 104#if HAVE_MATH_H 105STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a)) 106STD_UNARY_FUNCTOR (fpsqrt, T, (reset_mmx(), T (sqrt (a)))) 107STD_UNARY_FUNCTOR (fprecipsqrt, T, (reset_mmx(), 1 / T(sqrt (a)))) 108STD_UNARY_FUNCTOR (fsin, T, (reset_mmx(), T (sin (a)))) 109STD_UNARY_FUNCTOR (fcos, T, (reset_mmx(), T (cos (a)))) 110STD_UNARY_FUNCTOR (ftan, T, (reset_mmx(), T (tan (a)))) 111#if HAVE_RINTF 112STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a)))) 113#else 114STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a)))) 115#endif 116template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); } 117#endif 118template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); } 119template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); } 120 121#define SIMD_PACKEDOP1(name, operation) \ 122template <typename Ctr> \ 123inline void name (Ctr& op1) \ 124{ \ 125 typedef typename Ctr::value_type value_t; \ 126 packop (op1, operation<value_t>()); \ 127} 128#define SIMD_PACKEDOP2(name, operation) \ 129template <typename Ctr> \ 130inline void name (const Ctr& op1, Ctr& op2) \ 131{ \ 132 typedef typename Ctr::value_type value_t; \ 133 packop (op1, op2, operation<value_t>()); \ 134} 135#define SIMD_PACKEDOP3(name, operation) \ 136template <typename Ctr> \ 137inline void name (const Ctr& op1, const Ctr& op2, Ctr& result) \ 138{ \ 139 typedef typename Ctr::value_type value_t; \ 140 packop (op1, op2, result, operation<value_t>()); \ 141} 142#define SIMD_SINGLEOP1(name, operation) \ 143template <typename T> \ 144inline T name (T op) \ 145{ \ 146 operation<T> obj; \ 147 return (obj(op)); \ 148} 149#define SIMD_CONVERTOP(name, operation) \ 150template <typename Ctr1, typename Ctr2> \ 151inline void name (const Ctr1& op1, Ctr2& op2) \ 152{ \ 153 typedef typename Ctr1::value_type value1_t; \ 154 typedef typename Ctr2::value_type value2_t; \ 155 pconvert (op1, op2, operation<value1_t, value2_t>());\ 156} 157 158SIMD_PACKEDOP2 (padd, plus) 159SIMD_PACKEDOP2 (psub, minus) 160SIMD_PACKEDOP2 (pmul, multiplies) 161SIMD_PACKEDOP2 (pdiv, divides) 162SIMD_PACKEDOP2 (pand, bitwise_and) 163SIMD_PACKEDOP2 (por, bitwise_or) 164SIMD_PACKEDOP2 (pxor, bitwise_xor) 165SIMD_PACKEDOP2 (pshl, fpshl) 166SIMD_PACKEDOP2 (pshr, fpshr) 167SIMD_PACKEDOP2 (psubs, fpsubs) 168SIMD_PACKEDOP2 (pmin, fpmin) 169SIMD_PACKEDOP2 (pmax, fpmax) 170SIMD_PACKEDOP2 (pavg, fpavg) 171 172SIMD_PACKEDOP3 (padd, plus) 173SIMD_PACKEDOP3 (psub, minus) 174SIMD_PACKEDOP3 (pmul, multiplies) 175SIMD_PACKEDOP3 (pdiv, divides) 176SIMD_PACKEDOP3 (pand, bitwise_and) 177SIMD_PACKEDOP3 (por, bitwise_or) 178SIMD_PACKEDOP3 (pxor, bitwise_xor) 179SIMD_PACKEDOP3 (pshl, fpshl) 180SIMD_PACKEDOP3 (pshr, fpshr) 181SIMD_PACKEDOP3 (padds, fpadds) 182SIMD_PACKEDOP3 (psubs, fpsubs) 183SIMD_PACKEDOP3 (pmin, fpmin) 184SIMD_PACKEDOP3 (pmax, fpmax) 185SIMD_PACKEDOP3 (pavg, fpavg) 186 187#if HAVE_MATH_H 188SIMD_PACKEDOP1 (precip, fpreciprocal) 189SIMD_PACKEDOP1 (psqrt, fpsqrt) 190SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt) 191SIMD_PACKEDOP1 (psin, fsin) 192SIMD_PACKEDOP1 (pcos, fcos) 193SIMD_PACKEDOP1 (ptan, ftan) 194 195SIMD_SINGLEOP1 (srecip, fpreciprocal) 196SIMD_SINGLEOP1 (ssqrt, fpsqrt) 197SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt) 198SIMD_SINGLEOP1 (ssin, fsin) 199SIMD_SINGLEOP1 (scos, fcos) 200SIMD_SINGLEOP1 (stan, ftan) 201 202SIMD_CONVERTOP (pround, fround) 203 204template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); } 205#endif 206 207#undef SIMD_SINGLEOP1 208#undef SIMD_PACKEDOP3 209#undef SIMD_PACKEDOP2 210#undef SIMD_PACKEDOP1 211 212//---------------------------------------------------------------------- 213// Vector types to cast tuple data to 214//---------------------------------------------------------------------- 215 216#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4 217#define VECTOR_ATTRIBUTE(mode,vs) __attribute__((vector_size(vs))) 218#else 219#define VECTOR_ATTRIBUTE(mode,vs) 220#endif 221typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8); 222typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8); 223typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16); 224typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8); 225typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16); 226#if HAVE_INT64_T 227typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8); 228#endif 229typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8); 230typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16); 231typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16); 232#undef VECTOR_ATTRIBUTE 233 234//---------------------------------------------------------------------- 235// Hardware accelerated specializations 236//---------------------------------------------------------------------- 237 238#define SIMD_PKOP2_SPEC(n, type, optype) \ 239template <> \ 240inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>) 241#define SIMD_PASSIGN_SPEC(n, type) \ 242template <> \ 243inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout) 244#define SIMD_IPASSIGN_SPEC(n, type) \ 245template <> \ 246inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout) 247#define SIMD_CONVERT_SPEC(n, type1, type2, optype) \ 248template <> \ 249inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>) 250 251#if CPU_HAS_MMX 252#define STD_MMX_ARGS "=m"(oout[0]) : "m"(oin[0]) : "mm0", "st", "memory" 253#define DBL_MMX_ARGS "=m"(oout[0]), "=m"(oout[2]) : "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory" 254#define MMX_PKOP2_SPEC(n,type,optype,instruction) \ 255SIMD_PKOP2_SPEC(n,type,optype) \ 256{ asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); } 257#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction) \ 258SIMD_PKOP2_SPEC(n,type,optype) \ 259{ asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); } 260#define MMX_PASSIGN_SPEC(n,type) \ 261SIMD_PASSIGN_SPEC(n,type) \ 262{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); } 263#define MMX_DBL_PASSIGN_SPEC(n,type) \ 264SIMD_PASSIGN_SPEC(n,type) \ 265{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); } 266#define MMX_IPASSIGN_SPEC(n,type) \ 267SIMD_IPASSIGN_SPEC(n,type) \ 268{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); } 269#define MMX_DBL_IPASSIGN_SPEC(n,type) \ 270SIMD_IPASSIGN_SPEC(n,type) \ 271{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); } 272 273MMX_PASSIGN_SPEC(8,uint8_t) 274MMX_PKOP2_SPEC(8,uint8_t,plus,paddb) 275MMX_PKOP2_SPEC(8,uint8_t,minus,psubb) 276MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand) 277MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por) 278MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor) 279MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb) 280MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb) 281 282MMX_PASSIGN_SPEC(8,int8_t) 283MMX_PKOP2_SPEC(8,int8_t,plus,paddb) 284MMX_PKOP2_SPEC(8,int8_t,minus,psubb) 285MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand) 286MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por) 287MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor) 288MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb) 289MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb) 290 291MMX_PASSIGN_SPEC(4,uint16_t) 292MMX_PKOP2_SPEC(4,uint16_t,plus,paddw) 293MMX_PKOP2_SPEC(4,uint16_t,minus,psubw) 294MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand) 295MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por) 296MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor) 297/// \todo psllw does not work like other operations, it uses the first element for shift count. 298//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw) 299//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw) 300MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw) 301MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw) 302 303MMX_PASSIGN_SPEC(4,int16_t) 304MMX_PKOP2_SPEC(4,int16_t,plus,paddw) 305MMX_PKOP2_SPEC(4,int16_t,minus,psubw) 306MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand) 307MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por) 308MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor) 309//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw) 310//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw) 311MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw) 312MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw) 313 314MMX_PASSIGN_SPEC(2,uint32_t) 315MMX_PKOP2_SPEC(2,uint32_t,plus,paddd) 316MMX_PKOP2_SPEC(2,uint32_t,minus,psubd) 317MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand) 318MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por) 319MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor) 320//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld) 321//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld) 322 323MMX_PASSIGN_SPEC(2,int32_t) 324MMX_PKOP2_SPEC(2,int32_t,plus,paddd) 325MMX_PKOP2_SPEC(2,int32_t,minus,psubd) 326MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand) 327MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por) 328MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor) 329//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld) 330//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld) 331 332MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd) 333MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd) 334MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand) 335MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por) 336MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor) 337//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld) 338//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld) 339 340MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd) 341MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd) 342MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand) 343MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por) 344MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor) 345//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld) 346//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld) 347 348#if CPU_HAS_SSE || CPU_HAS_3DNOW 349MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb) 350MMX_PKOP2_SPEC(8,int8_t,fpavg,pavgb) 351MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw) 352MMX_PKOP2_SPEC(4,int16_t,fpavg,pavgw) 353MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub) 354MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub) 355MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw) 356MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw) 357#endif // CPU_HAS_SSE || CPU_HAS_3DNOW 358 359#if CPU_HAS_3DNOW 360MMX_PASSIGN_SPEC(2,float) 361MMX_PKOP2_SPEC(2,float,plus,pfadd) 362MMX_PKOP2_SPEC(2,float,minus,pfsub) 363MMX_PKOP2_SPEC(2,float,multiplies,pfmul) 364MMX_PKOP2_SPEC(2,float,fpmin,pfmin) 365MMX_PKOP2_SPEC(2,float,fpmax,pfmax) 366#ifndef CPU_HAS_SSE 367MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd) 368MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub) 369MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul) 370MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin) 371MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax) 372#endif 373#endif // CPU_HAS_3DNOW 374 375MMX_IPASSIGN_SPEC(8,uint8_t) 376MMX_IPASSIGN_SPEC(4,uint16_t) 377MMX_IPASSIGN_SPEC(2,uint32_t) 378MMX_IPASSIGN_SPEC(2,float) 379 380#ifndef CPU_HAS_SSE 381MMX_DBL_PASSIGN_SPEC(4,float) 382MMX_DBL_PASSIGN_SPEC(4,uint32_t) 383MMX_DBL_PASSIGN_SPEC(4,int32_t) 384MMX_DBL_IPASSIGN_SPEC(4,float) 385MMX_DBL_IPASSIGN_SPEC(4,uint32_t) 386MMX_DBL_IPASSIGN_SPEC(4,int32_t) 387#endif 388 389#undef MMX_IPASSIGN_SPEC 390#undef MMX_PASSIGN_SPEC 391#undef MMX_PKOP2_SPEC 392#undef STD_MMX_ARGS 393#endif // CPU_HAS_MMX 394 395#if CPU_HAS_SSE 396#define STD_SSE_ARGS "=m"(oout[0]) : "m"(oin[0]) : "xmm0", "memory" 397#define SSE_PKOP2_SPEC(n,type,optype,instruction) \ 398SIMD_PKOP2_SPEC(n,type,optype) \ 399{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);} 400#define SSE_PASSIGN_SPEC(n,type) \ 401SIMD_PASSIGN_SPEC(n,type) \ 402{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);} 403#define SSE_IPASSIGN_SPEC(n,type) \ 404SIMD_IPASSIGN_SPEC(n,type) \ 405{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);} 406SSE_PASSIGN_SPEC(4,float) 407SSE_PASSIGN_SPEC(4,int32_t) 408SSE_PASSIGN_SPEC(4,uint32_t) 409SSE_PKOP2_SPEC(4,float,plus,addps) 410SSE_PKOP2_SPEC(4,float,minus,subps) 411SSE_PKOP2_SPEC(4,float,multiplies,mulps) 412SSE_PKOP2_SPEC(4,float,divides,divps) 413SSE_PKOP2_SPEC(4,float,bitwise_and,andps) 414SSE_PKOP2_SPEC(4,float,bitwise_or,orps) 415SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps) 416SSE_PKOP2_SPEC(4,float,fpmax,maxps) 417SSE_PKOP2_SPEC(4,float,fpmin,minps) 418 419SIMD_CONVERT_SPEC(4,float,int32_t,fround) { 420 asm ("cvtps2pi %2, %%mm0\n\t" 421 "cvtps2pi %3, %%mm1\n\t" 422 "movq %%mm0, %0\n\t" 423 "movq %%mm1, %1" 424 : DBL_MMX_ARGS); 425 reset_mmx(); 426} 427SIMD_CONVERT_SPEC(4,int32_t,float,fround) { 428 asm ("cvtpi2ps %2, %%xmm0\n\t" 429 "shufps $0x4E,%%xmm0,%%xmm0\n\t" 430 "cvtpi2ps %1, %%xmm0\n\t" 431 "movups %%xmm0, %0" 432 : "=m"(oout[0]) : "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory"); 433} 434template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const { 435 register int32_t rv; 436 asm ("movss %1, %%xmm0\n\t" 437 "cvtss2si %%xmm0, %0" 438 : "=r"(rv) : "m"(a) : "xmm0" ); 439 return (rv); 440} 441template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const { 442 register uint32_t rv; 443 asm ("movss %1, %%xmm0\n\t" 444 "cvtss2si %%xmm0, %0" 445 : "=r"(rv) : "m"(a) : "xmm0" ); 446 return (rv); 447} 448 449SSE_IPASSIGN_SPEC(4,float) 450SSE_IPASSIGN_SPEC(4,int32_t) 451SSE_IPASSIGN_SPEC(4,uint32_t) 452 453#undef SSE_IPASSIGN_SPEC 454#undef SSE_PASSIGN_SPEC 455#undef SSE_PKOP2_SPEC 456#undef STD_SSE_ARGS 457#endif // CPU_HAS_SSE 458 459#undef SIMD_PACKEDOP_SPEC 460 461} // namespace simd 462} // namespace ustl 463 464#endif 465 466