util/ustl-1.0/simd.h

// This file is part of the ustl library, an STL implementation.
//
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
//
/// \file simd.h
/// \brief SIMD-type algorithms, with hardware acceleration, if available.
///
/// All algorithms are container-based because iterator syntax is just too
/// damn verbose and because the specializations need to be able to tell
/// how many elements are in the container in order to choose proper SIMD
/// instruction set (i.e.: 4 floats select SSE, while 2 floats select 3dNow!)
/// Specializations are only for the tuple template because the container
/// must be of a fixed and compile-time-known size for the compiler to be
/// able to choose the specialization.
///

#ifndef SIMD_H_39BE2D970DF4BD00508CCFFB482496F9
#define SIMD_H_39BE2D970DF4BD00508CCFFB482496F9

#include "uassert.h"
#include "ulimits.h"
#if HAVE_MATH_H
    #include <math.h>
#endif

#if PLATFORM_ANDROID
#include <stdio.h>
#undef CPU_HAS_MMX
#endif

namespace ustl {
namespace simd {

//----------------------------------------------------------------------
// Generic algorithms
//----------------------------------------------------------------------

/// Applies \p op to each element in \p op1.
template <typename Ctr, typename UnaryOperation>
inline void packop (Ctr& op1, UnaryOperation op)
{
    foreach (typename Ctr::iterator, i, op1)
	op (*i);
}

/// Applies \p op to each element in \p op1 and \p op2 and stores in \p op2.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, Ctr& op2, BinaryOperation op)
{
    assert (op2.size() <= op1.size());
    typename Ctr::const_iterator i1 (op1.begin());
    typename Ctr::iterator i2 (op2.begin());
    for (; i2 != op2.end(); ++i1, ++i2)
	*i2 = op (*i2, *i1);
}

/// Applies \p op to corresponding elements in \p op1 and \p op2 and stores in \p result.
template <typename Ctr, typename BinaryOperation>
inline void packop (const Ctr& op1, const Ctr& op2, Ctr& result, BinaryOperation op)
{
    assert (op1.size() <= op2.size() && op1.size() <= result.size());
    passign (op1, result);
    packop (op2, result);
}

/// Copies \p op1 into \p result.
template <typename Ctr>
inline void passign (const Ctr& op1, Ctr& result)
{
    assert (op1.size() <= result.size());
    typename Ctr::iterator d (result.begin());
    foreach (typename Ctr::const_iterator, s, op1)
	*d++ = *s;
}

/// Copies \p result.size() elements from \p op1 to \p result.
template <typename Ctr>
inline void ipassign (typename Ctr::const_iterator op1, Ctr& result)
{
    foreach (typename Ctr::iterator, d, result)
	*d = *op1++;
}

template <typename Ctr1, typename Ctr2, typename ConvertFunction>
inline void pconvert (const Ctr1& op1, Ctr2& op2, ConvertFunction f)
{
    assert (op1.size() <= op2.size());
    typename Ctr1::const_iterator i1 (op1.begin());
    typename Ctr2::iterator i2 (op2.begin());
    for (; i1 != op1.end(); ++i1, ++i2)
	*i2 = f (*i1);
}

// Functionoids for SIMD operations, like saturation arithmetic, shifts, etc.
STD_BINARY_FUNCTOR (fpadds, T, ((b > numeric_limits<T>::max() - a) ? numeric_limits<T>::max() : a + b))
STD_BINARY_FUNCTOR (fpsubs, T, ((a < numeric_limits<T>::min() + b) ? numeric_limits<T>::min() : a - b))
STD_BINARY_FUNCTOR (fpshl,  T, (a << b))
STD_BINARY_FUNCTOR (fpshr,  T, (a >> b))
STD_BINARY_FUNCTOR (fpmin,  T, (min (a, b)))
STD_BINARY_FUNCTOR (fpmax,  T, (max (a, b)))
STD_BINARY_FUNCTOR (fpavg,  T, ((a + b + 1) / 2))
STD_CONVERSION_FUNCTOR (fcast, (D(a)))
#if HAVE_MATH_H
STD_UNARY_FUNCTOR (fpreciprocal,T, (1 / a))
STD_UNARY_FUNCTOR (fpsqrt,	T, (reset_mmx(), T (sqrt (a))))
STD_UNARY_FUNCTOR (fprecipsqrt,	T, (reset_mmx(), 1 / T(sqrt (a))))
STD_UNARY_FUNCTOR (fsin,	T, (reset_mmx(), T (sin (a))))
STD_UNARY_FUNCTOR (fcos,	T, (reset_mmx(), T (cos (a))))
STD_UNARY_FUNCTOR (ftan,	T, (reset_mmx(), T (tan (a))))
#if HAVE_RINTF
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rintf(a))))
#else
STD_CONVERSION_FUNCTOR (fround, (reset_mmx(), D(rint(a))))
#endif
template <> inline int32_t fround<double,int32_t>::operator()(const double& a) const { reset_mmx(); return (int32_t(rint(a))); }
#endif
template <> inline float fpavg<float>::operator()(const float& a, const float& b) const { return ((a + b) / 2); }
template <> inline double fpavg<double>::operator()(const double& a, const double& b) const { return ((a + b) / 2); }

#define SIMD_PACKEDOP1(name, operation)		\
template <typename Ctr>				\
inline void name (Ctr& op1)			\
{						\
    typedef typename Ctr::value_type value_t;	\
    packop (op1, operation<value_t>());		\
}
#define SIMD_PACKEDOP2(name, operation)		\
template <typename Ctr>				\
inline void name (const Ctr& op1, Ctr& op2)	\
{						\
    typedef typename Ctr::value_type value_t;	\
    packop (op1, op2, operation<value_t>());	\
}
#define SIMD_PACKEDOP3(name, operation)			\
template <typename Ctr>					\
inline void name (const Ctr& op1, const Ctr& op2, Ctr& result)	\
{							\
    typedef typename Ctr::value_type value_t;		\
    packop (op1, op2, result, operation<value_t>());	\
}
#define SIMD_SINGLEOP1(name, operation)		\
template <typename T>				\
inline T name (T op)				\
{						\
    operation<T> obj;				\
    return (obj(op));				\
}
#define SIMD_CONVERTOP(name, operation)		\
template <typename Ctr1, typename Ctr2>		\
inline void name (const Ctr1& op1, Ctr2& op2)	\
{						\
    typedef typename Ctr1::value_type value1_t;	\
    typedef typename Ctr2::value_type value2_t;	\
    pconvert (op1, op2, operation<value1_t, value2_t>());\
}

SIMD_PACKEDOP2 (padd, plus)
SIMD_PACKEDOP2 (psub, minus)
SIMD_PACKEDOP2 (pmul, multiplies)
SIMD_PACKEDOP2 (pdiv, divides)
SIMD_PACKEDOP2 (pand, bitwise_and)
SIMD_PACKEDOP2 (por, bitwise_or)
SIMD_PACKEDOP2 (pxor, bitwise_xor)
SIMD_PACKEDOP2 (pshl, fpshl)
SIMD_PACKEDOP2 (pshr, fpshr)
SIMD_PACKEDOP2 (psubs, fpsubs)
SIMD_PACKEDOP2 (pmin, fpmin)
SIMD_PACKEDOP2 (pmax, fpmax)
SIMD_PACKEDOP2 (pavg, fpavg)

SIMD_PACKEDOP3 (padd, plus)
SIMD_PACKEDOP3 (psub, minus)
SIMD_PACKEDOP3 (pmul, multiplies)
SIMD_PACKEDOP3 (pdiv, divides)
SIMD_PACKEDOP3 (pand, bitwise_and)
SIMD_PACKEDOP3 (por, bitwise_or)
SIMD_PACKEDOP3 (pxor, bitwise_xor)
SIMD_PACKEDOP3 (pshl, fpshl)
SIMD_PACKEDOP3 (pshr, fpshr)
SIMD_PACKEDOP3 (padds, fpadds)
SIMD_PACKEDOP3 (psubs, fpsubs)
SIMD_PACKEDOP3 (pmin, fpmin)
SIMD_PACKEDOP3 (pmax, fpmax)
SIMD_PACKEDOP3 (pavg, fpavg)

#if HAVE_MATH_H
SIMD_PACKEDOP1 (precip, fpreciprocal)
SIMD_PACKEDOP1 (psqrt, fpsqrt)
SIMD_PACKEDOP1 (precipsqrt, fprecipsqrt)
SIMD_PACKEDOP1 (psin, fsin)
SIMD_PACKEDOP1 (pcos, fcos)
SIMD_PACKEDOP1 (ptan, ftan)

SIMD_SINGLEOP1 (srecip, fpreciprocal)
SIMD_SINGLEOP1 (ssqrt, fpsqrt)
SIMD_SINGLEOP1 (srecipsqrt, fprecipsqrt)
SIMD_SINGLEOP1 (ssin, fsin)
SIMD_SINGLEOP1 (scos, fcos)
SIMD_SINGLEOP1 (stan, ftan)

SIMD_CONVERTOP (pround, fround)

template <typename T> inline int32_t sround (T op) { fround<T,int32_t> obj; return (obj (op)); }
#endif

#undef SIMD_SINGLEOP1
#undef SIMD_PACKEDOP3
#undef SIMD_PACKEDOP2
#undef SIMD_PACKEDOP1

//----------------------------------------------------------------------
// Vector types to cast tuple data to
//----------------------------------------------------------------------

#if HAVE_VECTOR_EXTENSIONS && __GNUC__ >= 4
#define VECTOR_ATTRIBUTE(mode,vs)	__attribute__((vector_size(vs)))
#else
#define VECTOR_ATTRIBUTE(mode,vs)
#endif
typedef uint8_t v8qi_t VECTOR_ATTRIBUTE (V8QI,8);
typedef uint16_t v4hi_t VECTOR_ATTRIBUTE (V4HI,8);
typedef uint16_t v8hi_t VECTOR_ATTRIBUTE (V8HI,16);
typedef uint32_t v2si_t VECTOR_ATTRIBUTE (V2SI,8);
typedef uint32_t v4si_t VECTOR_ATTRIBUTE (V4SI,16);
#if HAVE_INT64_T
typedef uint64_t v1di_t VECTOR_ATTRIBUTE (V1DI,8);
#endif
typedef float v2sf_t VECTOR_ATTRIBUTE (V2SF,8);
typedef float v4sf_t VECTOR_ATTRIBUTE (V4SF,16);
typedef double v2df_t VECTOR_ATTRIBUTE (V2DF,16);
#undef VECTOR_ATTRIBUTE

//----------------------------------------------------------------------
// Hardware accelerated specializations
//----------------------------------------------------------------------

#define SIMD_PKOP2_SPEC(n, type, optype)	\
template <>					\
inline void packop (const tuple<n,type>& oin, tuple<n,type>& oout, optype<type>)
#define SIMD_PASSIGN_SPEC(n, type)		\
template <>					\
inline void passign (const tuple<n,type>& oin, tuple<n,type>& oout)
#define SIMD_IPASSIGN_SPEC(n, type)		\
template <>					\
inline void ipassign (tuple<n,type>::const_iterator oin, tuple<n,type>& oout)
#define SIMD_CONVERT_SPEC(n, type1, type2, optype)	\
template <>					\
inline void pconvert (const tuple<n,type1>& oin, tuple<n,type2>& oout, optype<type1,type2>)

#if CPU_HAS_MMX
#define STD_MMX_ARGS	"=m"(oout[0]) : "m"(oin[0]) : "mm0", "st", "memory"
#define DBL_MMX_ARGS	"=m"(oout[0]), "=m"(oout[2]) : "m"(oin[0]), "m"(oin[2]) : "mm0", "mm1", "st", "st(1)", "memory"
#define MMX_PKOP2_SPEC(n,type,optype,instruction)	\
SIMD_PKOP2_SPEC(n,type,optype)		\
{ asm ("movq %0, %%mm0\n\t" #instruction " %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_PKOP2_SPEC(n,type,optype,instruction)	\
SIMD_PKOP2_SPEC(n,type,optype)		\
{ asm ("movq %0, %%mm0\n\tmovq %1, %%mm1\n\t" #instruction " %2, %%mm0\n\t" #instruction " %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
#define MMX_PASSIGN_SPEC(n,type)	\
SIMD_PASSIGN_SPEC(n,type)		\
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_PASSIGN_SPEC(n,type)	\
SIMD_PASSIGN_SPEC(n,type)		\
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }
#define MMX_IPASSIGN_SPEC(n,type)	\
SIMD_IPASSIGN_SPEC(n,type)		\
{ asm ("movq %1, %%mm0\n\tmovq %%mm0, %0" : STD_MMX_ARGS); reset_mmx(); }
#define MMX_DBL_IPASSIGN_SPEC(n,type)	\
SIMD_IPASSIGN_SPEC(n,type)		\
{ asm ("movq %2, %%mm0\n\tmovq %3, %%mm1\n\tmovq %%mm0, %0\n\tmovq %%mm1, %1" : DBL_MMX_ARGS); reset_mmx(); }

MMX_PASSIGN_SPEC(8,uint8_t)
MMX_PKOP2_SPEC(8,uint8_t,plus,paddb)
MMX_PKOP2_SPEC(8,uint8_t,minus,psubb)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,uint8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,uint8_t,fpadds,paddusb)
MMX_PKOP2_SPEC(8,uint8_t,fpsubs,psubusb)

MMX_PASSIGN_SPEC(8,int8_t)
MMX_PKOP2_SPEC(8,int8_t,plus,paddb)
MMX_PKOP2_SPEC(8,int8_t,minus,psubb)
MMX_PKOP2_SPEC(8,int8_t,bitwise_and,pand)
MMX_PKOP2_SPEC(8,int8_t,bitwise_or,por)
MMX_PKOP2_SPEC(8,int8_t,bitwise_xor,pxor)
MMX_PKOP2_SPEC(8,int8_t,fpadds,paddsb)
MMX_PKOP2_SPEC(8,int8_t,fpsubs,psubsb)

MMX_PASSIGN_SPEC(4,uint16_t)
MMX_PKOP2_SPEC(4,uint16_t,plus,paddw)
MMX_PKOP2_SPEC(4,uint16_t,minus,psubw)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,uint16_t,bitwise_xor,pxor)
/// \todo psllw does not work like other operations, it uses the first element for shift count.
//MMX_PKOP2_SPEC(4,uint16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,uint16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,uint16_t,fpadds,paddusw)
MMX_PKOP2_SPEC(4,uint16_t,fpsubs,psubusw)

MMX_PASSIGN_SPEC(4,int16_t)
MMX_PKOP2_SPEC(4,int16_t,plus,paddw)
MMX_PKOP2_SPEC(4,int16_t,minus,psubw)
MMX_PKOP2_SPEC(4,int16_t,bitwise_and,pand)
MMX_PKOP2_SPEC(4,int16_t,bitwise_or,por)
MMX_PKOP2_SPEC(4,int16_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(4,int16_t,fpshl,psllw)
//MMX_PKOP2_SPEC(4,int16_t,fpshr,psrlw)
MMX_PKOP2_SPEC(4,int16_t,fpadds,paddsw)
MMX_PKOP2_SPEC(4,int16_t,fpsubs,psubsw)

MMX_PASSIGN_SPEC(2,uint32_t)
MMX_PKOP2_SPEC(2,uint32_t,plus,paddd)
MMX_PKOP2_SPEC(2,uint32_t,minus,psubd)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,uint32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

MMX_PASSIGN_SPEC(2,int32_t)
MMX_PKOP2_SPEC(2,int32_t,plus,paddd)
MMX_PKOP2_SPEC(2,int32_t,minus,psubd)
MMX_PKOP2_SPEC(2,int32_t,bitwise_and,pand)
MMX_PKOP2_SPEC(2,int32_t,bitwise_or,por)
MMX_PKOP2_SPEC(2,int32_t,bitwise_xor,pxor)
//MMX_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_PKOP2_SPEC(2,int32_t,fpshr,psrld)

MMX_DBL_PKOP2_SPEC(4,uint32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,uint32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,uint32_t,fpshr,psrld)

MMX_DBL_PKOP2_SPEC(4,int32_t,plus,paddd)
MMX_DBL_PKOP2_SPEC(4,int32_t,minus,psubd)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_and,pand)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_or,por)
MMX_DBL_PKOP2_SPEC(4,int32_t,bitwise_xor,pxor)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshl,pslld)
//MMX_DBL_PKOP2_SPEC(2,int32_t,fpshr,psrld)

#if CPU_HAS_SSE || CPU_HAS_3DNOW
MMX_PKOP2_SPEC(8,uint8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(8,int8_t,fpavg,pavgb)
MMX_PKOP2_SPEC(4,uint16_t,fpavg,pavgw)
MMX_PKOP2_SPEC(4,int16_t,fpavg,pavgw)
MMX_PKOP2_SPEC(8,uint8_t,fpmin,pminub)
MMX_PKOP2_SPEC(8,uint8_t,fpmax,pmaxub)
MMX_PKOP2_SPEC(4,int16_t,fpmax,pmaxsw)
MMX_PKOP2_SPEC(4,int16_t,fpmin,pminsw)
#endif // CPU_HAS_SSE || CPU_HAS_3DNOW

#if CPU_HAS_3DNOW
MMX_PASSIGN_SPEC(2,float)
MMX_PKOP2_SPEC(2,float,plus,pfadd)
MMX_PKOP2_SPEC(2,float,minus,pfsub)
MMX_PKOP2_SPEC(2,float,multiplies,pfmul)
MMX_PKOP2_SPEC(2,float,fpmin,pfmin)
MMX_PKOP2_SPEC(2,float,fpmax,pfmax)
#ifndef CPU_HAS_SSE
MMX_DBL_PKOP2_SPEC(4,float,plus,pfadd)
MMX_DBL_PKOP2_SPEC(4,float,minus,pfsub)
MMX_DBL_PKOP2_SPEC(4,float,multiplies,pfmul)
MMX_DBL_PKOP2_SPEC(4,float,fpmin,pfmin)
MMX_DBL_PKOP2_SPEC(4,float,fpmax,pfmax)
#endif
#endif // CPU_HAS_3DNOW

MMX_IPASSIGN_SPEC(8,uint8_t)
MMX_IPASSIGN_SPEC(4,uint16_t)
MMX_IPASSIGN_SPEC(2,uint32_t)
MMX_IPASSIGN_SPEC(2,float)

#ifndef CPU_HAS_SSE
MMX_DBL_PASSIGN_SPEC(4,float)
MMX_DBL_PASSIGN_SPEC(4,uint32_t)
MMX_DBL_PASSIGN_SPEC(4,int32_t)
MMX_DBL_IPASSIGN_SPEC(4,float)
MMX_DBL_IPASSIGN_SPEC(4,uint32_t)
MMX_DBL_IPASSIGN_SPEC(4,int32_t)
#endif

#undef MMX_IPASSIGN_SPEC
#undef MMX_PASSIGN_SPEC
#undef MMX_PKOP2_SPEC
#undef STD_MMX_ARGS
#endif // CPU_HAS_MMX

#if CPU_HAS_SSE
#define STD_SSE_ARGS	"=m"(oout[0]) : "m"(oin[0]) : "xmm0", "memory"
#define SSE_PKOP2_SPEC(n,type,optype,instruction)	\
SIMD_PKOP2_SPEC(n,type,optype)		\
{ asm ("movups %0, %%xmm0\n\tmovups %1, %%xmm1\n\t" #instruction " %%xmm1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
#define SSE_PASSIGN_SPEC(n,type)			\
SIMD_PASSIGN_SPEC(n,type)		\
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
#define SSE_IPASSIGN_SPEC(n,type)	\
SIMD_IPASSIGN_SPEC(n,type)		\
{ asm ("movups %1, %%xmm0\n\tmovups %%xmm0, %0" : STD_SSE_ARGS);}
SSE_PASSIGN_SPEC(4,float)
SSE_PASSIGN_SPEC(4,int32_t)
SSE_PASSIGN_SPEC(4,uint32_t)
SSE_PKOP2_SPEC(4,float,plus,addps)
SSE_PKOP2_SPEC(4,float,minus,subps)
SSE_PKOP2_SPEC(4,float,multiplies,mulps)
SSE_PKOP2_SPEC(4,float,divides,divps)
SSE_PKOP2_SPEC(4,float,bitwise_and,andps)
SSE_PKOP2_SPEC(4,float,bitwise_or,orps)
SSE_PKOP2_SPEC(4,float,bitwise_xor,xorps)
SSE_PKOP2_SPEC(4,float,fpmax,maxps)
SSE_PKOP2_SPEC(4,float,fpmin,minps)

SIMD_CONVERT_SPEC(4,float,int32_t,fround) {
    asm ("cvtps2pi %2, %%mm0\n\t"
	 "cvtps2pi %3, %%mm1\n\t"
	 "movq %%mm0, %0\n\t"
	 "movq %%mm1, %1"
	 : DBL_MMX_ARGS);
    reset_mmx();
}
SIMD_CONVERT_SPEC(4,int32_t,float,fround) {
    asm ("cvtpi2ps %2, %%xmm0\n\t"
	 "shufps $0x4E,%%xmm0,%%xmm0\n\t"
	 "cvtpi2ps %1, %%xmm0\n\t"
	 "movups %%xmm0, %0"
	 : "=m"(oout[0]) : "m"(oin[0]), "m"(oin[2]) : "xmm0", "memory");
}
template <> inline int32_t fround<float,int32_t>::operator()(const float& a) const {
    register int32_t rv;
    asm ("movss %1, %%xmm0\n\t"
	 "cvtss2si %%xmm0, %0"
	 : "=r"(rv) : "m"(a) : "xmm0" );
    return (rv);
}
template <> inline uint32_t fround<float,uint32_t>::operator()(const float& a) const {
    register uint32_t rv;
    asm ("movss %1, %%xmm0\n\t"
	 "cvtss2si %%xmm0, %0"
	 : "=r"(rv) : "m"(a) : "xmm0" );
    return (rv);
}

SSE_IPASSIGN_SPEC(4,float)
SSE_IPASSIGN_SPEC(4,int32_t)
SSE_IPASSIGN_SPEC(4,uint32_t)

#undef SSE_IPASSIGN_SPEC
#undef SSE_PASSIGN_SPEC
#undef SSE_PKOP2_SPEC
#undef STD_SSE_ARGS
#endif // CPU_HAS_SSE

#undef SIMD_PACKEDOP_SPEC

} // namespace simd
} // namespace ustl

#endif