1// This file is part of the ustl library, an STL implementation.
2//
3// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
4// This file is free software, distributed under the MIT License.
5//
6/// \file simd.h
7/// \brief SIMD-type algorithms, with hardware acceleration, if available.
8///
9/// All algorithms are container-based because iterator syntax is just too
10/// damn verbose and because the specializations need to be able to tell
11/// how many elements are in the container in order to choose proper SIMD
12/// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!)
13/// Specializations are only for the tuple template because the container
14/// must be of a fixed and compile-time-known size for the compiler to be
15/// able to choose the specialization.
16///
17
18#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
19#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
20
21#include "uassert.h"
22#include "ulimits.h"
23#if HAVE_MATH_H
24    #include <math.h>
25#endif
26
27#if PLATFORM_ANDROID
28#include <stdio.h>
29#undef CPU_HAS_MMX
30#endif
31
32namespace ustl {
33namespace simd {
34
35//----------------------------------------------------------------------
36// Generic algorithms
37//----------------------------------------------------------------------
38
/// \brief Applies \p op to each element of \p op1, in place.
///
/// Every element \c e of \p op1 is replaced with <tt>op (e)</tt>. The unary
/// functors this header passes in (fpreciprocal, fpsqrt, fsin, ...) hand
/// their result back by value, so it has to be stored into the element;
/// merely calling \p op would leave \p op1 unchanged.
///
/// \param op1	Container to transform; needs \c iterator, begin() and end().
/// \param op	Unary callable whose result is assignable to an element of \p op1.
template <typename Ctr, typename UnaryOperation>
inline void packop (Ctr& op1, UnaryOperation op)
{
    for (typename Ctr::iterator i = op1.begin(); i != op1.end(); ++i)
	*i = op (*i);
}
46
47/// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2.
48template <typename Ctr, typename BinaryOperation>
49inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
50{
51    assert (op2.size() <= op1.size());
52    typename Ctr::const_iterator i1 (op1.begin());
53    typename Ctr::iterator i2 (op2.begin());
54    for (; i2 != op2.end(); ++i1, ++i2)
55	*i2 = op (*i2, *i1);
56}
57
58/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
59template <typename Ctr, typename BinaryOperation>
60inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
61{
62    assert (op1.size() <= op2.size() && op1.size() <= result.size());
63    passign (op1, result);
64    packop (op2, result);
65}
66
67/// Copies \p op1 into \p result.
68template <typename Ctr>
69inline void passign (const Ctr& op1, Ctr& result)
70{
71    assert (op1.size() <= result.size());
72    typename Ctr::iterator d (result.begin());
73    foreach (typename Ctr::const_iterator, s, op1)
74	*d++ = *s;
75}
76
77/// Copies \p result.size() elements from \p op1 to \p result.
78template <typename Ctr>
79inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
80{
81    foreach (typename Ctr::iterator, d, result)
82	*d = *op1++;
83}
84
85template <typename Ctr1, typename Ctr2, typename ConvertFunction>
86inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
87{
88    assert (op1.size() <= op2.size());
89    typename Ctr1::const_iterator i1 (op1.begin());
90    typename Ctr2::iterator i2 (op2.begin());
91    for (; i1 != op1.end(); ++i1, ++i2)
92	*i2 = f (*i1);
93}
94
// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
//
// The STD_*_FUNCTOR macros are defined outside this header. Judging by the
// explicit specializations at the end of this list, each one produces a class
// template with a const operator() that takes its argument(s) by const
// reference (a, and b for the binary ones) and returns the given expression.
// The conversion functors take two template arguments, source and
// destination; D in their expression presumably names the destination type
// (the fround<double,int32_t> specialization returns int32_t) -- confirm
// against the macro definition.

// Saturating add/subtract: clamp to numeric_limits<T>::max()/min() instead of
// going past them.
// NOTE(review): "max() - a" and "min() + b" can themselves overflow when T is
// a signed type as wide as int and a (resp. b) is negative -- confirm callers
// only depend on the unsigned / non-negative case.
STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
// Element-wise shifts, minimum and maximum.
STD_BINARY_FUNCTOR (fpshl,  T, (a << b))
STD_BINARY_FUNCTOR (fpshr,  T, (a >> b))
STD_BINARY_FUNCTOR (fpmin,  T, (min (a, b)))
STD_BINARY_FUNCTOR (fpmax,  T, (max (a, b)))
// Average with a +1 bias before halving; float and double are specialized
// below to the plain mean.
STD_BINARY_FUNCTOR (fpavg,  T, ((a + b + 1) / 2))
// Conversion by plain cast.
STD_CONVERSION_FUNCTOR (fcast, (D(a)))
#if HAVE_MATH_H
// <math.h>-based functors. reset_mmx() is declared elsewhere and is called
// before each library call; presumably it clears MMX state so that the
// floating point unit is usable again -- confirm.
STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))
STD_UNARY_FUNCTOR (fpsqrt,	T, (reset_mmx(), T (sqrt (a))))
STD_UNARY_FUNCTOR (fprecipsqrt,	T, (reset_mmx(), 1 / T(sqrt (a))))
STD_UNARY_FUNCTOR (fsin,	T, (reset_mmx(), T (sin (a))))
STD_UNARY_FUNCTOR (fcos,	T, (reset_mmx(), T (cos (a))))
STD_UNARY_FUNCTOR (ftan,	T, (reset_mmx(), T (tan (a))))
// Rounding conversion: uses rintf when the platform has it, rint otherwise.
#if HAVE_RINTF
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
#else
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
#endif
// double -> int32_t always rounds with the double-precision rint, regardless
// of HAVE_RINTF.
template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }
#endif
// Floating point averages need no rounding bias.
template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }
template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }
120
// Generators for the public entry points. Each one wraps a packop/pconvert
// overload (or a direct functor call) and instantiates the functor template
// \p operation with the element type of the container it is given.

// name (op1): runs the unary functor over op1 through packop.
#define SIMD_PACKEDOP1(name, operation)				\
template <typename Ctr>						\
inline void name (Ctr& op1)					\
{								\
    packop (op1, operation<typename Ctr::value_type>());	\
}
// name (op1, op2): binary functor over op1 and op2, result kept in op2.
#define SIMD_PACKEDOP2(name, operation)				\
template <typename Ctr>						\
inline void name (const Ctr& op1, Ctr& op2)			\
{								\
    packop (op1, op2, operation<typename Ctr::value_type>());	\
}
// name (op1, op2, result): binary functor over op1 and op2, stored in result.
#define SIMD_PACKEDOP3(name, operation)					\
template <typename Ctr>							\
inline void name (const Ctr& op1, const Ctr& op2, Ctr& result)		\
{									\
    packop (op1, op2, result, operation<typename Ctr::value_type>());	\
}
// name (op): applies the unary functor to one scalar and returns the result.
#define SIMD_SINGLEOP1(name, operation)		\
template <typename T>				\
inline T name (T op)				\
{						\
    return (operation<T>() (op));		\
}
// name (op1, op2): converts the elements of op1 into op2 with the two-type
// functor operation<source element, destination element>.
#define SIMD_CONVERTOP(name, operation)					\
template <typename Ctr1, typename Ctr2>					\
inline void name (const Ctr1& op1, Ctr2& op2)				\
{									\
    pconvert (op1, op2,							\
	      operation<typename Ctr1::value_type, typename Ctr2::value_type>());\
}
157
158SIMD_PACKEDOP2 (padd, plus)
159SIMD_PACKEDOP2 (psub, minus)
160SIMD_PACKEDOP2 (pmul, multiplies)
161SIMD_PACKEDOP2 (pdiv, divides)
162SIMD_PACKEDOP2 (pand, bitwise_and)
163SIMD_PACKEDOP2 (por, bitwise_or)
164SIMD_PACKEDOP2 (pxor, bitwise_xor)
165SIMD_PACKEDOP2 (pshl, fpshl)
166SIMD_PACKEDOP2 (pshr, fpshr)
167SIMD_PACKEDOP2 (psubs, fpsubs)
168SIMD_PACKEDOP2 (pmin, fpmin)
169SIMD_PACKEDOP2 (pmax, fpmax)
170SIMD_PACKEDOP2 (pavg, fpavg)
171
// Three-container forms: name (op1, op2, result) takes op1 and op2 as inputs
// and writes to result. Each line pairs the public function name with the
// functor template that is instantiated for the container's value_type.
SIMD_PACKEDOP3 (padd, plus)
SIMD_PACKEDOP3 (psub, minus)
SIMD_PACKEDOP3 (pmul, multiplies)
SIMD_PACKEDOP3 (pdiv, divides)
SIMD_PACKEDOP3 (pand, bitwise_and)
SIMD_PACKEDOP3 (por, bitwise_or)
SIMD_PACKEDOP3 (pxor, bitwise_xor)
SIMD_PACKEDOP3 (pshl, fpshl)
SIMD_PACKEDOP3 (pshr, fpshr)
// Saturating variants.
SIMD_PACKEDOP3 (padds, fpadds)
SIMD_PACKEDOP3 (psubs, fpsubs)
SIMD_PACKEDOP3 (pmin, fpmin)
SIMD_PACKEDOP3 (pmax, fpmax)
SIMD_PACKEDOP3 (pavg, fpavg)
186
// Math entry points; only available when the <math.h> functors exist.
#if HAVE_MATH_H
// Container forms: name (op1) hands every element of op1 to the functor.
SIMD_PACKEDOP1 (precip, fpreciprocal)
SIMD_PACKEDOP1 (psqrt, fpsqrt)
SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)
SIMD_PACKEDOP1 (psin, fsin)
SIMD_PACKEDOP1 (pcos, fcos)
SIMD_PACKEDOP1 (ptan, ftan)

// Scalar forms: name (x) returns the functor's result for a single value.
SIMD_SINGLEOP1 (srecip, fpreciprocal)
SIMD_SINGLEOP1 (ssqrt, fpsqrt)
SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)
SIMD_SINGLEOP1 (ssin, fsin)
SIMD_SINGLEOP1 (scos, fcos)
SIMD_SINGLEOP1 (stan, ftan)

// pround (op1, op2): rounding conversion of op1's elements into op2.
SIMD_CONVERTOP (pround, fround)

/// Rounds the scalar \p op to int32_t using fround.
template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); }
#endif
206
// The generator macros are implementation details of this header. Remove all
// five of them, SIMD_CONVERTOP included, so that none leaks into code that
// includes this file.
#undef SIMD_CONVERTOP
#undef SIMD_SINGLEOP1
#undef SIMD_PACKEDOP3
#undef SIMD_PACKEDOP2
#undef SIMD_PACKEDOP1
211
212//----------------------------------------------------------------------
213// Vector types to cast tuple data to
214//----------------------------------------------------------------------
215
// VECTOR_ATTRIBUTE (mode, vs) expands to GCC's vector_size attribute with a
// size of vs bytes when vector extensions are enabled and the compiler is
// GCC 4 or newer. Otherwise it expands to nothing and the typedefs below are
// plain aliases of the scalar element type. The mode argument is not used by
// either expansion; it only labels the line.
#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
#define VECTOR_ATTRIBUTE(mode,vs)	__attribute__((vector_size(vs)))
#else
#define VECTOR_ATTRIBUTE(mode,vs)
#endif
// 8-byte and 16-byte integer vectors.
typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
#if HAVE_INT64_T
typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
#endif
// Floating point vectors.
typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
#undef VECTOR_ATTRIBUTE
233
234//----------------------------------------------------------------------
235// Hardware accelerated specializations
236//----------------------------------------------------------------------
237
// Signature helpers for the explicit specializations below. Each expands to
// the "template <> inline void ..." header only; the macro user supplies the
// function body. All of them name the source operand oin and the destination
// oout, which is what the inline assembly operand lists refer to.

// In-place packop on tuple<n,type> for the functor optype<type>. The functor
// parameter is unnamed: only its type selects the specialization.
#define SIMD_PKOP2_SPEC(n, type, optype)	\
template <>					\
inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)
// passign from one tuple<n,type> to another.
#define SIMD_PASSIGN_SPEC(n, type)		\
template <>					\
inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)
// ipassign from a tuple<n,type>::const_iterator into a tuple<n,type>.
#define SIMD_IPASSIGN_SPEC(n, type)		\
template <>					\
inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)
// pconvert from tuple<n,type1> to tuple<n,type2> for the functor
// optype<type1,type2>.
#define SIMD_CONVERT_SPEC(n, type1, type2, optype)	\
template <>					\
inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)
250
251#if CPU_HAS_MMX
// Operand lists for the MMX inline assembly (the text after the first colon
// of the asm statement).
//   STD_MMX_ARGS: %0 = oout[0] (memory, output), %1 = oin[0] (memory, input);
//		   declares mm0, st and memory as clobbered.
//   DBL_MMX_ARGS: %0 = oout[0], %1 = oout[2] (outputs), %2 = oin[0],
//		   %3 = oin[2] (inputs); declares mm0, mm1, st, st(1) and
//		   memory as clobbered. It is used with 4-element tuples of
//		   32-bit types, where elements [0] and [2] start the two
//		   8-byte halves.
#define STD_MMX_ARGS	"=m"(oout[0]) : "m"(oin[0]) : "mm0", "st", "memory"
#define DBL_MMX_ARGS	"=m"(oout[0]), "=m"(oout[2]) : "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"
// Every body below ends with reset_mmx(), which is declared elsewhere;
// presumably it leaves MMX state so that x87 code can run afterwards --
// confirm against its definition.
//
// packop specialization on one 8-byte vector: load oout into mm0, apply
// "instruction oin, mm0", store mm0 back to oout (oout = oout <op> oin).
// NOTE(review): %0 is read by the first movq although it is declared "=m"
// (write-only); the "memory" clobber is presumably what keeps that safe --
// confirm, "+m" would state the intent directly.
#define MMX_PKOP2_SPEC(n,type,optype,instruction)	\
SIMD_PKOP2_SPEC(n,type,optype)		\
{ asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
// Same operation on a 16-byte tuple, processed as two 8-byte halves in mm0
// and mm1.
#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction)	\
SIMD_PKOP2_SPEC(n,type,optype)		\
{ asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
// passign specialization: copies 8 bytes from oin to oout through mm0.
#define MMX_PASSIGN_SPEC(n,type)	\
SIMD_PASSIGN_SPEC(n,type)		\
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
// passign specialization for 16 bytes: two 8-byte moves through mm0 and mm1.
#define MMX_DBL_PASSIGN_SPEC(n,type)	\
SIMD_PASSIGN_SPEC(n,type)		\
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
// ipassign specializations: same instruction sequences as the passign ones,
// but oin is an iterator here, so oin[0]/oin[2] index from that position.
#define MMX_IPASSIGN_SPEC(n,type)	\
SIMD_IPASSIGN_SPEC(n,type)		\
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_IPASSIGN_SPEC(n,type)	\
SIMD_IPASSIGN_SPEC(n,type)		\
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
// MMX specializations for integer tuples. Each line binds one functor, for
// one tuple shape, to the named MMX instruction; passign becomes a single
// 8-byte move. Wrapping add/subtract and the bitwise operations use the same
// instruction for signed and unsigned element types; only the saturating
// forms differ (paddus*/psubus* for unsigned, padds*/psubs* for signed).

// 8 x uint8_t
MMX_PASSIGN_SPEC(8,uint8_t)
MMX_PKOP2_SPEC(8,uint8_t,plus,paddb)
MMX_PKOP2_SPEC(8,uint8_t,minus,psubb)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb)
MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb)

// 8 x int8_t
MMX_PASSIGN_SPEC(8,int8_t)
MMX_PKOP2_SPEC(8,int8_t,plus,paddb)
MMX_PKOP2_SPEC(8,int8_t,minus,psubb)
MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb)
MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb)

// 4 x uint16_t
MMX_PASSIGN_SPEC(4,uint16_t)
MMX_PKOP2_SPEC(4,uint16_t,plus,paddw)
MMX_PKOP2_SPEC(4,uint16_t,minus,psubw)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor)
/// \todo psllw does not work like other operations, it uses the first element for shift count.
// For that reason all shift specializations in this section stay disabled
// and shifts go through the generic element-wise packop.
//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw)
MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw)

// 4 x int16_t
MMX_PASSIGN_SPEC(4,int16_t)
MMX_PKOP2_SPEC(4,int16_t,plus,paddw)
MMX_PKOP2_SPEC(4,int16_t,minus,psubw)
MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw)
MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw)

// 2 x uint32_t (no saturating forms are bound for 32-bit elements)
MMX_PASSIGN_SPEC(2,uint32_t)
MMX_PKOP2_SPEC(2,uint32_t,plus,paddd)
MMX_PKOP2_SPEC(2,uint32_t,minus,psubd)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

// 2 x int32_t
MMX_PASSIGN_SPEC(2,int32_t)
MMX_PKOP2_SPEC(2,int32_t,plus,paddd)
MMX_PKOP2_SPEC(2,int32_t,minus,psubd)
MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)

// 4 x uint32_t, handled as two 8-byte halves
MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

// 4 x int32_t, handled as two 8-byte halves
MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)
347
// Integer average/min/max instructions that need the SSE or 3DNow!-era MMX
// extensions.
#if CPU_HAS_SSE || CPU_HAS_3DNOW
// pavgb/pavgw compute the average of *unsigned* lanes, so they are bound to
// the unsigned tuples only. On signed elements they give wrong results as
// soon as a lane is negative (for int8_t, -1 and 1 would average to -128),
// so int8_t and int16_t use the generic fpavg functor instead.
MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
// Min/max: unsigned byte and signed word are the only element types bound
// here.
MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW
358
// 3DNow! specializations for float tuples.
#if CPU_HAS_3DNOW
// 2 x float: one 8-byte vector.
MMX_PASSIGN_SPEC(2,float)
MMX_PKOP2_SPEC(2,float,plus,pfadd)
MMX_PKOP2_SPEC(2,float,minus,pfsub)
MMX_PKOP2_SPEC(2,float,multiplies,pfmul)
MMX_PKOP2_SPEC(2,float,fpmin,pfmin)
MMX_PKOP2_SPEC(2,float,fpmax,pfmax)
// 4 x float as two halves. Skipped whenever CPU_HAS_SSE is defined, because
// the SSE section then provides the tuple<4,float> specializations.
#ifndef CPU_HAS_SSE
MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)
MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)
MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)
MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)
MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)
#endif
#endif // CPU_HAS_3DNOW
374
// Iterator-source copies for the 8-byte tuple shapes.
MMX_IPASSIGN_SPEC(8,uint8_t)
MMX_IPASSIGN_SPEC(4,uint16_t)
MMX_IPASSIGN_SPEC(2,uint32_t)
MMX_IPASSIGN_SPEC(2,float)

// 16-byte copies done as two 8-byte moves. Skipped whenever CPU_HAS_SSE is
// defined, because the SSE section then specializes the same tuple types.
#ifndef CPU_HAS_SSE
MMX_DBL_PASSIGN_SPEC(4,float)
MMX_DBL_PASSIGN_SPEC(4,uint32_t)
MMX_DBL_PASSIGN_SPEC(4,int32_t)
MMX_DBL_IPASSIGN_SPEC(4,float)
MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
MMX_DBL_IPASSIGN_SPEC(4,int32_t)
#endif
388
// Retire the MMX helper macros, the double-width ones included, so none of
// them leaks out of this header. DBL_MMX_ARGS is deliberately kept: the SSE
// float -> int32_t conversion further down still expands it.
#undef MMX_DBL_IPASSIGN_SPEC
#undef MMX_DBL_PASSIGN_SPEC
#undef MMX_DBL_PKOP2_SPEC
#undef MMX_IPASSIGN_SPEC
#undef MMX_PASSIGN_SPEC
#undef MMX_PKOP2_SPEC
#undef STD_MMX_ARGS
393#endif // CPU_HAS_MMX
394
395#if CPU_HAS_SSE
// Operand list shared by the SSE copy specializations:
//   %0 = oout[0] (memory, output), %1 = oin[0] (memory, input);
//   xmm0 is the only scratch register they use.
#define STD_SSE_ARGS	"=m"(oout[0]) : "m"(oin[0]) : "xmm0", "memory"
// packop specialization on one 16-byte vector: oout = oout <instruction> oin.
// Both operands are staged in registers with unaligned loads, so xmm1 is
// overwritten as well as xmm0. It has to appear in the clobber list too,
// otherwise the compiler is free to keep a live value in xmm1 across the asm
// statement. The operand list is therefore spelled out here rather than
// taken from STD_SSE_ARGS.
#define SSE_PKOP2_SPEC(n,type,optype,instruction)	\
SIMD_PKOP2_SPEC(n,type,optype)		\
{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : "=m"(oout[0]) : "m"(oin[0]) : "xmm0", "xmm1", "memory");}
// passign specialization: 16-byte unaligned copy through xmm0.
#define SSE_PASSIGN_SPEC(n,type)			\
SIMD_PASSIGN_SPEC(n,type)		\
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
// ipassign specialization: same copy, with oin being an iterator.
#define SSE_IPASSIGN_SPEC(n,type)	\
SIMD_IPASSIGN_SPEC(n,type)		\
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
// 16-byte copies for the 4-element tuples of 32-bit types.
SSE_PASSIGN_SPEC(4,float)
SSE_PASSIGN_SPEC(4,int32_t)
SSE_PASSIGN_SPEC(4,uint32_t)
// 4 x float arithmetic. The bitwise functors are bound to the packed-single
// logical instructions, which operate on the raw bits of the floats.
SSE_PKOP2_SPEC(4,float,plus,addps)
SSE_PKOP2_SPEC(4,float,minus,subps)
SSE_PKOP2_SPEC(4,float,multiplies,mulps)
SSE_PKOP2_SPEC(4,float,divides,divps)
SSE_PKOP2_SPEC(4,float,bitwise_and,andps)
SSE_PKOP2_SPEC(4,float,bitwise_or,orps)
SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)
SSE_PKOP2_SPEC(4,float,fpmax,maxps)
SSE_PKOP2_SPEC(4,float,fpmin,minps)
418
// pround for 4 floats -> 4 int32_t. cvtps2pi converts two floats at a time
// into an MMX register, so the tuple is handled as two halves:
// oin[0..1] -> mm0 -> oout[0..1] and oin[2..3] -> mm1 -> oout[2..3].
// The operand numbering comes from DBL_MMX_ARGS, which this file defines
// only inside the CPU_HAS_MMX section.
SIMD_CONVERT_SPEC(4,float,int32_t,fround) {
    asm ("cvtps2pi %2, %%mm0\n\t"
	 "cvtps2pi %3, %%mm1\n\t"
	 "movq %%mm0, %0\n\t"
	 "movq %%mm1, %1"
	 : DBL_MMX_ARGS);
    reset_mmx();
}
// pround for 4 int32_t -> 4 floats. Operands: %0 = oout[0], %1 = oin[0],
// %2 = oin[2]. cvtpi2ps fills only the low two lanes of xmm0, so the upper
// input pair is converted first, moved to the high half by the shufps, and
// the lower pair is then converted into the low half before the store.
SIMD_CONVERT_SPEC(4,int32_t,float,fround) {
    asm ("cvtpi2ps %2, %%xmm0\n\t"
	 "shufps $0x4E,%%xmm0,%%xmm0\n\t"
	 "cvtpi2ps %1, %%xmm0\n\t"
	 "movups %%xmm0, %0"
	 : "=m"(oout[0]) : "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");
}
434template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
435    register int32_t rv;
436    asm ("movss %1, %%xmm0\n\t"
437	 "cvtss2si %%xmm0, %0"
438	 : "=r"(rv) : "m"(a) : "xmm0" );
439    return (rv);
440}
441template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
442    register uint32_t rv;
443    asm ("movss %1, %%xmm0\n\t"
444	 "cvtss2si %%xmm0, %0"
445	 : "=r"(rv) : "m"(a) : "xmm0" );
446    return (rv);
447}
448
// Iterator-source 16-byte copies for the 4-element tuples of 32-bit types.
SSE_IPASSIGN_SPEC(4,float)
SSE_IPASSIGN_SPEC(4,int32_t)
SSE_IPASSIGN_SPEC(4,uint32_t)

// The SSE helper macros are not needed past this point.
#undef SSE_IPASSIGN_SPEC
#undef SSE_PASSIGN_SPEC
#undef SSE_PKOP2_SPEC
#undef STD_SSE_ARGS
457#endif // CPU_HAS_SSE
458
// Remove the specialization-signature helpers this header defines, so they
// do not leak into including code. SIMD_PACKEDOP_SPEC is not defined anywhere
// in this file; its #undef is harmless and is kept for compatibility.
#undef SIMD_CONVERT_SPEC
#undef SIMD_IPASSIGN_SPEC
#undef SIMD_PASSIGN_SPEC
#undef SIMD_PKOP2_SPEC
#undef SIMD_PACKEDOP_SPEC
460
461} // namespace simd
462} // namespace ustl
463
464#endif
465
466