1// This file is part of the ustl library, an STL implementation. 2// 3// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net> 4// This file is free software, distributed under the MIT License. 5// 6// ulaalgo.h 7// 8 9#ifndef ULAALGO_H_2E403D182E83FB596AFB800E68B255A1 10#define ULAALGO_H_2E403D182E83FB596AFB800E68B255A1 11 12#include "umatrix.h" 13#include "simd.h" 14 15namespace ustl { 16 17/// \brief Creates an identity matrix in \p m 18/// \ingroup NumericAlgorithms 19template <size_t NX, size_t NY, typename T> 20void load_identity (matrix<NX,NY,T>& m) 21{ 22 fill_n (m.begin(), NX * NY, 0); 23 for (typename matrix<NX,NY,T>::iterator i = m.begin(); i < m.end(); i += NX + 1) 24 *i = 1; 25} 26 27/// \brief Multiplies two matrices 28/// \ingroup NumericAlgorithms 29template <size_t NX, size_t NY, typename T> 30matrix<NY,NY,T> operator* (const matrix<NX,NY,T>& m1, const matrix<NY,NX,T>& m2) 31{ 32 matrix<NY,NY,T> mr; 33 for (uoff_t ry = 0; ry < NY; ++ ry) { 34 for (uoff_t rx = 0; rx < NY; ++ rx) { 35 T dpv (0); 36 for (uoff_t x = 0; x < NX; ++ x) 37 dpv += m1[ry][x] * m2[x][rx]; 38 mr[ry][rx] = dpv; 39 } 40 } 41 return (mr); 42} 43 44/// \brief Transforms vector \p t with matrix \p m 45/// \ingroup NumericAlgorithms 46template <size_t NX, size_t NY, typename T> 47tuple<NX,T> operator* (const tuple<NY,T>& t, const matrix<NX,NY,T>& m) 48{ 49 tuple<NX,T> tr; 50 for (uoff_t x = 0; x < NX; ++ x) { 51 T dpv (0); 52 for (uoff_t y = 0; y < NY; ++ y) 53 dpv += t[y] * m[y][x]; 54 tr[x] = dpv; 55 } 56 return (tr); 57} 58 59/// \brief Transposes (exchanges rows and columns) matrix \p m. 60/// \ingroup NumericAlgorithms 61template <size_t N, typename T> 62void transpose (matrix<N,N,T>& m) 63{ 64 for (uoff_t x = 0; x < N; ++ x) 65 for (uoff_t y = x; y < N; ++ y) 66 swap (m[x][y], m[y][x]); 67} 68 69#if WANT_UNROLLED_COPY 70 71#if CPU_HAS_SSE 72 73#if linux // Non-linux gcc versions (BSD, Solaris) can't handle "x" constraint and provide no alternative. 
74template <> 75inline void load_identity (matrix<4,4,float>& m) 76{ 77 asm ( 78 "movaps %4, %%xmm1 \n\t" // 1 0 0 0 79 "movups %4, %0 \n\t" // 1 0 0 0 80 "shufps $0xB1,%%xmm1,%%xmm1 \n\t" // 0 1 0 0 81 "movups %%xmm1, %1 \n\t" // 0 1 0 0 82 "shufps $0x4F,%4,%%xmm1 \n\t" // 0 0 1 0 83 "shufps $0x1B,%4,%4 \n\t" // 0 0 0 1 84 "movups %%xmm1, %2 \n\t" // 0 0 1 0 85 "movups %4, %3" // 0 0 0 1 86 : "=m"(m[0][0]), "=m"(m[1][0]), "=m"(m[2][0]), "=m"(m[3][0]) 87 : "x"(1.0f) 88 : "xmm1" 89 ); 90} 91#endif 92 93inline void _sse_load_matrix (const float* m) 94{ 95 asm ( 96 "movups %0, %%xmm4 \n\t" // xmm4 = m[1 2 3 4] 97 "movups %1, %%xmm5 \n\t" // xmm5 = m[1 2 3 4] 98 "movups %2, %%xmm6 \n\t" // xmm6 = m[1 2 3 4] 99 "movups %3, %%xmm7" // xmm7 = m[1 2 3 4] 100 : : "m"(m[0]), "m"(m[4]), "m"(m[8]), "m"(m[12]) 101 : "xmm4", "xmm5", "xmm6", "xmm7" 102 ); 103} 104 105inline void _sse_transform_to_vector (float* result) 106{ 107 asm ( 108 "movaps %%xmm0, %%xmm1 \n\t" // xmm1 = t[0 1 2 3] 109 "movaps %%xmm0, %%xmm2 \n\t" // xmm1 = t[0 1 2 3] 110 "movaps %%xmm0, %%xmm3 \n\t" // xmm1 = t[0 1 2 3] 111 "shufps $0x00, %%xmm0, %%xmm0 \n\t" // xmm0 = t[0 0 0 0] 112 "shufps $0x66, %%xmm1, %%xmm1 \n\t" // xmm1 = t[1 1 1 1] 113 "shufps $0xAA, %%xmm2, %%xmm2 \n\t" // xmm2 = t[2 2 2 2] 114 "shufps $0xFF, %%xmm3, %%xmm3 \n\t" // xmm3 = t[3 3 3 3] 115 "mulps %%xmm4, %%xmm0 \n\t" // xmm0 = t[0 0 0 0] * m[0 1 2 3] 116 "mulps %%xmm5, %%xmm1 \n\t" // xmm1 = t[1 1 1 1] * m[0 1 2 3] 117 "addps %%xmm1, %%xmm0 \n\t" // xmm0 = xmm0 + xmm1 118 "mulps %%xmm6, %%xmm2 \n\t" // xmm2 = t[2 2 2 2] * m[0 1 2 3] 119 "mulps %%xmm7, %%xmm3 \n\t" // xmm3 = t[3 3 3 3] * m[0 1 2 3] 120 "addps %%xmm3, %%xmm2 \n\t" // xmm2 = xmm2 + xmm3 121 "addps %%xmm2, %%xmm0 \n\t" // xmm0 = result 122 "movups %%xmm0, %0" 123 : "=m"(result[0]) : 124 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 125 ); 126} 127 128template <> 129tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m) 130{ 131 
    tuple<4,float> result;
    _sse_load_matrix (m.begin());			// rows of m -> xmm4..xmm7
    asm ("movups %0, %%xmm0" : : "m"(t[0]) : "xmm0");	// t -> xmm0
    _sse_transform_to_vector (result.begin());		// result = t * m
    return (result);
}

/// Specialization for 4x4 float matrix multiply using SSE.
/// m2 is loaded into xmm4..xmm7 once and reused for all four rows of m1.
template <>
matrix<4,4,float> operator* (const matrix<4,4,float>& m1, const matrix<4,4,float>& m2)
{
    matrix<4,4,float> result;
    _sse_load_matrix (m2.begin());
    for (uoff_t r = 0; r < 4; ++ r) {
	asm ("movups %0, %%xmm0" : : "m"(m1[r][0]) : "xmm0");	// row r of m1 -> xmm0
	_sse_transform_to_vector (result[r]);			// result[r] = m1[r] * m2
    }
    return (result);
}

#elif CPU_HAS_3DNOW

/// Specialization for 4-component vector transform, the slow part of 3D graphics.
template <>
tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
{
    tuple<4,float> result;
    // This is taken from "AMD Athlon Code Optimization Guide" from AMD. 18 cycles!
    // If you are writing a 3D engine, you may want to copy it instead of calling it
    // because of the femms instruction at the end, which takes 2 cycles.
    // MMX registers hold two packed floats (high | low); the comments below
    // track the contents of each register after the instruction executes.
    asm (
	"movq %2, %%mm0			\n\t"	// y | x
	"movq %3, %%mm1			\n\t"	// w | z
	"movq %%mm0, %%mm2		\n\t"	// y | x
	"movq %4, %%mm3			\n\t"	// m[0][1] | m[0][0]
	"punpckldq %%mm0, %%mm0		\n\t"	// x | x
	"movq %6, %%mm4			\n\t"	// m[1][1] | m[1][0]
	"pfmul %%mm0, %%mm3		\n\t"	// x*m[0][1] | x*m[0][0]
	"punpckhdq %%mm2, %%mm2		\n\t"	// y | y
	"pfmul %%mm2, %%mm4		\n\t"	// y*m[1][1] | y*m[1][0]
	"movq %5, %%mm5			\n\t"	// m[0][3] | m[0][2]
	"movq %7, %%mm7			\n\t"	// m[1][3] | m[1][2]
	"movq %%mm1, %%mm6		\n\t"	// w | z
	"pfmul %%mm0, %%mm5		\n\t"	// x*m[0][3] | x*m[0][2]
	"movq %8, %%mm0			\n\t"	// m[2][1] | m[2][0]
	"punpckldq %%mm1, %%mm1		\n\t"	// z | z
	"pfmul %%mm2, %%mm7		\n\t"	// y*m[1][3] | y*m[1][2]
	"movq %9, %%mm2			\n\t"	// m[2][3] | m[2][2]
	"pfmul %%mm1, %%mm0		\n\t"	// z*m[2][1] | z*m[2][0]
	"pfadd %%mm4, %%mm3		\n\t"	// x*m[0][1]+y*m[1][1] | x*m[0][0]+y*m[1][0]
	"movq %10, %%mm4		\n\t"	// m[3][1] | m[3][0]
	"pfmul %%mm1, %%mm2		\n\t"	// z*m[2][3] | z*m[2][2]
	"pfadd %%mm7, %%mm5		\n\t"	// x*m[0][3]+y*m[1][3] | x*m[0][2]+y*m[1][2]
	"movq %11, %%mm1		\n\t"	// m[3][3] | m[3][2]
	"punpckhdq %%mm6, %%mm6		\n\t"	// w | w
	"pfadd %%mm0, %%mm3		\n\t"	// x*m[0][1]+y*m[1][1]+z*m[2][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]
	"pfmul %%mm6, %%mm4		\n\t"	// w*m[3][1] | w*m[3][0]
	"pfmul %%mm6, %%mm1		\n\t"	// w*m[3][3] | w*m[3][2]
	"pfadd %%mm2, %%mm5		\n\t"	// x*m[0][3]+y*m[1][3]+z*m[2][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]
	"pfadd %%mm4, %%mm3		\n\t"	// x*m[0][1]+y*m[1][1]+z*m[2][1]+w*m[3][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]+w*m[3][0]
	"movq %%mm3, %0			\n\t"	// store result->y | result->x
	"pfadd %%mm1, %%mm5		\n\t"	// x*m[0][3]+y*m[1][3]+z*m[2][3]+w*m[3][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]+w*m[3][2]
	"movq %%mm5, %1"			// store result->w | result->z
	: "=m"(result[0]), "=m"(result[2])
	: "m"(t[0]), "m"(t[2]),
	  "m"(m[0][0]), "m"(m[0][2]),
	  "m"(m[1][0]), "m"(m[1][2]),
	  "m"(m[2][0]), "m"(m[2][2]),
"m"(m[3][0]), "m"(m[3][2]) 199 : "mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7" 200 ); 201 simd::reset_mmx(); 202 return (result); 203} 204 205#else // If no processor extensions, just unroll the multiplication 206 207/// Specialization for 4-component vector transform, the slow part of 3D graphics. 208template <> 209tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m) 210{ 211 tuple<4,float> tr; 212 for (uoff_t i = 0; i < 4; ++ i) 213 tr[i] = t[0] * m[0][i] + t[1] * m[1][i] + t[2] * m[2][i] + t[3] * m[3][i]; 214 return (tr); 215} 216 217#endif // CPU_HAS_3DNOW 218#endif // WANT_UNROLLED_COPY 219 220} // namespace ustl 221 222#endif 223 224