1// This file is part of the ustl library, an STL implementation.
2//
3// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
4// This file is free software, distributed under the MIT License.
5//
6// ulaalgo.h
7//
8
9#ifndef ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
10#define ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
11
12#include "umatrix.h"
13#include "simd.h"
14
15namespace ustl {
16
17/// \brief Creates an identity matrix in \p m
18/// \ingroup NumericAlgorithms
19template <size_t NX, size_t NY, typename T>
20void load_identity (matrix<NX,NY,T>& m)
21{
22    fill_n (m.begin(), NX * NY, 0);
23    for (typename matrix<NX,NY,T>::iterator i = m.begin(); i < m.end(); i += NX + 1)
24	*i = 1;
25}
26
27/// \brief Multiplies two matrices
28/// \ingroup NumericAlgorithms
29template <size_t NX, size_t NY, typename T>
30matrix<NY,NY,T> operator* (const matrix<NX,NY,T>& m1, const matrix<NY,NX,T>& m2)
31{
32    matrix<NY,NY,T> mr;
33    for (uoff_t ry = 0; ry < NY; ++ ry) {
34	for (uoff_t rx = 0; rx < NY; ++ rx) {
35	    T dpv (0);
36	    for (uoff_t x = 0; x < NX; ++ x)
37		dpv += m1[ry][x] * m2[x][rx];
38	    mr[ry][rx] = dpv;
39	}
40    }
41    return (mr);
42}
43
44/// \brief Transforms vector \p t with matrix \p m
45/// \ingroup NumericAlgorithms
46template <size_t NX, size_t NY, typename T>
47tuple<NX,T> operator* (const tuple<NY,T>& t, const matrix<NX,NY,T>& m)
48{
49    tuple<NX,T> tr;
50    for (uoff_t x = 0; x < NX; ++ x) {
51	T dpv (0);
52	for (uoff_t y = 0; y < NY; ++ y)
53	    dpv += t[y] * m[y][x];
54	tr[x] = dpv;
55    }
56    return (tr);
57}
58
59/// \brief Transposes (exchanges rows and columns) matrix \p m.
60/// \ingroup NumericAlgorithms
61template <size_t N, typename T>
62void transpose (matrix<N,N,T>& m)
63{
64    for (uoff_t x = 0; x < N; ++ x)
65	for (uoff_t y = x; y < N; ++ y)
66	    swap (m[x][y], m[y][x]);
67}
68
69#if WANT_UNROLLED_COPY
70
71#if CPU_HAS_SSE
72
73#if linux // Non-linux gcc versions (BSD, Solaris) can't handle "x" constraint and provide no alternative.
/// \brief SSE specialization: fills a 4x4 float matrix with identity using
/// four unaligned stores, deriving each row by shuffling a register that
/// holds (1,0,0,0).
template <>
inline void load_identity (matrix<4,4,float>& m)
{
    asm (
	"movaps %4, %%xmm1		\n\t"	// xmm1 = 1 0 0 0 (%4 is the 1.0f input operand)
	"movups %4, %0			\n\t"	// row 0 = 1 0 0 0
	"shufps $0xB1,%%xmm1,%%xmm1	\n\t"	// xmm1 = 0 1 0 0
	"movups %%xmm1, %1		\n\t"	// row 1 = 0 1 0 0
	"shufps $0x4F,%4,%%xmm1		\n\t"	// xmm1 = 0 0 1 0
	"shufps $0x1B,%4,%4		\n\t"	// %4 = 0 0 0 1 -- NOTE(review): shuffles the input operand's register in place; relies on nothing else reading %4 afterwards
	"movups %%xmm1, %2		\n\t"	// row 2 = 0 0 1 0
	"movups %4, %3"				// row 3 = 0 0 0 1
	: "=m"(m[0][0]), "=m"(m[1][0]), "=m"(m[2][0]), "=m"(m[3][0])
	: "x"(1.0f)				// "x" constraint = any SSE register (see #if linux note above)
	: "xmm1"
    );
}
91#endif
92
/// \brief Loads the four rows of the 4x4 float matrix at \p m into xmm4-xmm7.
/// The loaded registers are consumed by a subsequent _sse_transform_to_vector()
/// call; this relies on no intervening code touching xmm4-xmm7.
inline void _sse_load_matrix (const float* m)
{
    asm (
	"movups %0, %%xmm4	\n\t"	// xmm4 = m[0..3]   (row 0)
	"movups %1, %%xmm5	\n\t"	// xmm5 = m[4..7]   (row 1)
	"movups %2, %%xmm6	\n\t"	// xmm6 = m[8..11]  (row 2)
	"movups %3, %%xmm7"		// xmm7 = m[12..15] (row 3)
	: : "m"(m[0]), "m"(m[4]), "m"(m[8]), "m"(m[12])
	: "xmm4", "xmm5", "xmm6", "xmm7"
    );
}
104
/// \brief Multiplies the vector in xmm0 by the matrix rows held in xmm4-xmm7
/// (as loaded by _sse_load_matrix) and stores the product to \p result.
///
/// Computes result = t * M by broadcasting each component of t across a
/// register and accumulating the component-wise products with the rows.
/// Fix: the broadcast of t[1] used shufps imm 0x66, which selects lanes
/// (2,1,2,1) and therefore produced t[2 1 2 1], corrupting result components
/// 0 and 2. Broadcasting lane 1 requires imm 0x55 (binary 01010101), matching
/// the 0x00/0xAA/0xFF pattern used for the other three lanes.
inline void _sse_transform_to_vector (float* result)
{
    asm (
	"movaps %%xmm0, %%xmm1		\n\t" // xmm1 = t[0 1 2 3]
	"movaps %%xmm0, %%xmm2		\n\t" // xmm2 = t[0 1 2 3]
	"movaps %%xmm0, %%xmm3		\n\t" // xmm3 = t[0 1 2 3]
	"shufps $0x00, %%xmm0, %%xmm0	\n\t" // xmm0 = t[0 0 0 0]
	"shufps $0x55, %%xmm1, %%xmm1	\n\t" // xmm1 = t[1 1 1 1] (0x55 broadcasts lane 1; 0x66 gave t[2 1 2 1])
	"shufps $0xAA, %%xmm2, %%xmm2	\n\t" // xmm2 = t[2 2 2 2]
	"shufps $0xFF, %%xmm3, %%xmm3	\n\t" // xmm3 = t[3 3 3 3]
	"mulps  %%xmm4, %%xmm0		\n\t" // xmm0 = t[0 0 0 0] * row 0
	"mulps  %%xmm5, %%xmm1		\n\t" // xmm1 = t[1 1 1 1] * row 1
	"addps  %%xmm1, %%xmm0		\n\t" // xmm0 = xmm0 + xmm1
	"mulps  %%xmm6, %%xmm2		\n\t" // xmm2 = t[2 2 2 2] * row 2
	"mulps  %%xmm7, %%xmm3		\n\t" // xmm3 = t[3 3 3 3] * row 3
	"addps  %%xmm3, %%xmm2		\n\t" // xmm2 = xmm2 + xmm3
	"addps  %%xmm2, %%xmm0		\n\t" // xmm0 = result
	"movups %%xmm0, %0"
	: "=m"(result[0]) :
	: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
    );
}
127
128template <>
129tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
130{
131    tuple<4,float> result;
132    _sse_load_matrix (m.begin());
133    asm ("movups %0, %%xmm0" : : "m"(t[0]) : "xmm0");
134    _sse_transform_to_vector (result.begin());
135    return (result);
136}
137
138template <>
139matrix<4,4,float> operator* (const matrix<4,4,float>& m1, const matrix<4,4,float>& m2)
140{
141    matrix<4,4,float> result;
142    _sse_load_matrix (m2.begin());
143    for (uoff_t r = 0; r < 4; ++ r) {
144	asm ("movups %0, %%xmm0" : : "m"(m1[r][0]) : "xmm0");
145	_sse_transform_to_vector (result[r]);
146    }
147    return (result);
148}
149
150#elif CPU_HAS_3DNOW
151
152/// Specialization for 4-component vector transform, the slow part of 3D graphics.
153template <>
154tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
155{
156    tuple<4,float> result;
157    // This is taken from "AMD Athlon Code Optimization Guide" from AMD. 18 cycles!
158    // If you are writing a 3D engine, you may want to copy it instead of calling it
159    // because of the femms instruction at the end, which takes 2 cycles.
160    asm (
161	"movq	   %2, %%mm0		\n\t"	//            y | x
162	"movq	   %3, %%mm1		\n\t"	//            w | z
163	"movq	   %%mm0, %%mm2		\n\t"	//            y | x
164	"movq	   %4, %%mm3		\n\t"	//      m[0][1] | m[0][0]
165	"punpckldq  %%mm0, %%mm0	\n\t"	//            x | x
166	"movq	   %6, %%mm4		\n\t"	//      m[1][1] | m[1][0]
167	"pfmul	   %%mm0, %%mm3		\n\t"	//    x*m[0][1] | x*m[0][0]
168	"punpckhdq  %%mm2, %%mm2	\n\t"	//            y | y
169	"pfmul	   %%mm2, %%mm4		\n\t"	//    y*m[1][1] | y*m[1][0]
170	"movq	   %5, %%mm5		\n\t"	//      m[0][3] | m[0][2]
171	"movq	   %7, %%mm7		\n\t"	//      m[1][3] | m[1][2]
172	"movq	   %%mm1, %%mm6		\n\t"	//            w | z
173	"pfmul	   %%mm0, %%mm5		\n\t"	//    x*m[0][3] | v0>x*m[0][2]
174	"movq	   %8, %%mm0		\n\t"	//      m[2][1] | m[2][0]
175	"punpckldq  %%mm1, %%mm1	\n\t"	//            z | z
176	"pfmul	   %%mm2, %%mm7		\n\t"	//    y*m[1][3] | y*m[1][2]
177	"movq	   %9, %%mm2		\n\t"	//      m[2][3] | m[2][2]
178	"pfmul	   %%mm1, %%mm0		\n\t"	//    z*m[2][1] | z*m[2][0]
179	"pfadd	   %%mm4, %%mm3		\n\t"	// x*m[0][1]+y*m[1][1] | x*m[0][0]+y*m[1][0]
180	"movq	   %10, %%mm4		\n\t"	//      m[3][1] | m[3][0]
181	"pfmul	   %%mm1, %%mm2		\n\t"	//    z*m[2][3] | z*m[2][2]
182	"pfadd	   %%mm7, %%mm5		\n\t"	// x*m[0][3]+y*m[1][3] | x*m[0][2]+y*m[1][2]
183	"movq	   %11, %%mm1		\n\t"	//      m[3][3] | m[3][2]
184	"punpckhdq  %%mm6, %%mm6	\n\t"	//            w | w
185	"pfadd	   %%mm0, %%mm3		\n\t"	// x*m[0][1]+y*m[1][1]+z*m[2][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]
186	"pfmul	   %%mm6, %%mm4		\n\t"	//    w*m[3][1] | w*m[3][0]
187	"pfmul	   %%mm6, %%mm1		\n\t"	//    w*m[3][3] | w*m[3][2]
188	"pfadd	   %%mm2, %%mm5		\n\t"	// x*m[0][3]+y*m[1][3]+z*m[2][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]
189	"pfadd	   %%mm4, %%mm3		\n\t"	// x*m[0][1]+y*m[1][1]+z*m[2][1]+w*m[3][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]+w*m[3][0]
190	"movq	   %%mm3, %0		\n\t"	// store result->y | result->x
191	"pfadd	   %%mm1, %%mm5		\n\t"	// x*m[0][3]+y*m[1][3]+z*m[2][3]+w*m[3][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]+w*m[3][2]
192	"movq	   %%mm5, %1"			// store result->w | result->z
193	: "=m"(result[0]), "=m"(result[2])
194	: "m"(t[0]), "m"(t[2]),
195	  "m"(m[0][0]), "m"(m[0][2]),
196	  "m"(m[1][0]), "m"(m[1][2]),
197	  "m"(m[2][0]), "m"(m[2][2]),
198	  "m"(m[3][0]), "m"(m[3][2])
199	: "mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7"
200    );
201    simd::reset_mmx();
202    return (result);
203}
204
205#else	// If no processor extensions, just unroll the multiplication
206
207/// Specialization for 4-component vector transform, the slow part of 3D graphics.
208template <>
209tuple<4,float> operator* (const tuple<4,float>& t, const matrix<4,4,float>& m)
210{
211    tuple<4,float> tr;
212    for (uoff_t i = 0; i < 4; ++ i)
213	tr[i] = t[0] * m[0][i] + t[1] * m[1][i] + t[2] * m[2][i] + t[3] * m[3][i];
214    return (tr);
215}
216
217#endif	// CPU_HAS_3DNOW
218#endif	// WANT_UNROLLED_COPY
219
220} // namespace ustl
221
222#endif
223
224