1/*
2 * Mesa 3-D graphics library
3 * Version:  7.1
4 *
5 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25#ifdef USE_X86_64_ASM
26
27#include "matypes.h"
28
29.text
30
31.align 16
32.globl _mesa_x86_64_cpuid
33.hidden _mesa_x86_64_cpuid
34_mesa_x86_64_cpuid:
35	pushq	%rbx
36	movl	(%rdi), %eax
37	movl	8(%rdi), %ecx
38
39	cpuid
40
41	movl	%ebx, 4(%rdi)
42	movl	%eax, (%rdi)
43	movl	%ecx, 8(%rdi)
44	movl	%edx, 12(%rdi)
45	popq	%rbx
46	ret
47
48.align 16
49.globl _mesa_x86_64_transform_points4_general
50.hidden _mesa_x86_64_transform_points4_general
51_mesa_x86_64_transform_points4_general:
52/*
53 *	rdi = dest
54 *	rsi = matrix
55 *	rdx = source
56 */
57	movl V4F_COUNT(%rdx), %ecx	/* count */
58	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
59
60	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
61	movl $4, V4F_SIZE(%rdi)		/* set dest size */
62	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 3 */
63	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
64
65	testl %ecx, %ecx		/* verify non-zero count */
66	prefetchnta 64(%rsi)
67	jz p4_general_done
68
69	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
70	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
71
72	prefetch 16(%rdx)
73
74	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
75	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
76	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
77	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
78        movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
79
80p4_general_loop:
81
82	movups (%rdx), %xmm8		/* ox | oy | oz | ow */
83	prefetchw 16(%rdi)
84
85	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
86	addq %rax, %rdx
87	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
88	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
89	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | ox */
90	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
91	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
92	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
93	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
94	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
95	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
96	prefetch 16(%rdx)
97	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
98
99	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
100	addq $16, %rdi
101
102	decl %ecx
103	jnz p4_general_loop
104
105p4_general_done:
106	.byte 0xf3
107	ret
108
109.section .rodata
110
111.align 16
112p4_constants:
113.byte  0xff, 0xff, 0xff, 0xff
114.byte  0xff, 0xff, 0xff, 0xff
115.byte  0xff, 0xff, 0xff, 0xff
116.byte  0x00, 0x00, 0x00, 0x00
117
118.byte  0x00, 0x00, 0x00, 0x00
119.byte  0x00, 0x00, 0x00, 0x00
120.byte  0x00, 0x00, 0x00, 0x00
121.float 1.0
122
123.text
124.align 16
125.globl _mesa_x86_64_transform_points4_3d
126.hidden _mesa_x86_64_transform_points4_3d
127/*
128 * this is slower than _mesa_x86_64_transform_points4_general
129 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
130 */
131_mesa_x86_64_transform_points4_3d:
132
133	leaq p4_constants(%rip), %rax
134
135	prefetchnta 64(%rsi)
136
137	movaps (%rax), %xmm9
138	movaps 16(%rax), %xmm10
139
140	movl V4F_COUNT(%rdx), %ecx	/* count */
141	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
142
143	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
144	movl $4, V4F_SIZE(%rdi)		/* set dest size */
145	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
146
147	testl %ecx, %ecx		/* verify non-zero count */
148	jz p4_3d_done
149
150	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
151	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
152
153	prefetch 16(%rdx)
154
155	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
156	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
157	andps  %xmm9, %xmm4             /* 0.0 | m2  | m1  | m0  */
158	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
159	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
160        movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
161	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
162	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12  */
163	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
164	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12  */
165
166p4_3d_loop:
167
168	movups (%rdx), %xmm8		/* ox | oy | oz | ow */
169	prefetchw 16(%rdi)
170
171	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
172	addq %rax, %rdx
173	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
174	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
175	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | ox */
176	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
177	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
178	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
179	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
180	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
181	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
182	prefetch 16(%rdx)
183	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
184
185	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
186	addq $16, %rdi
187
188	dec %ecx
189	jnz p4_3d_loop
190
191p4_3d_done:
192	.byte 0xf3
193	ret
194
195
196.align 16
197.globl _mesa_x86_64_transform_points4_identity
198.hidden _mesa_x86_64_transform_points4_identity
199_mesa_x86_64_transform_points4_identity:
200
201	movl V4F_COUNT(%rdx), %ecx	/* count */
202	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
203
204	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
205	movl $4, V4F_SIZE(%rdi)		/* set dest size */
206	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
207
208	test %ecx, %ecx
209	jz p4_identity_done
210
211	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
212	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
213	prefetch 64(%rsi)
214	prefetchw 64(%rdi)
215
216	add %ecx, %ecx
217
218	rep movsq
219
220p4_identity_done:
221	.byte 0xf3
222	ret
223
224
225.align 16
226.globl _mesa_3dnow_transform_points4_3d_no_rot
227.hidden _mesa_3dnow_transform_points4_3d_no_rot
228_mesa_3dnow_transform_points4_3d_no_rot:
229
230	movl V4F_COUNT(%rdx), %ecx	/* count */
231	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
232
233	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
234	movl $4, V4F_SIZE(%rdi)		/* set dest size */
235	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
236	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
237
238	test %ecx, %ecx
239	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
240	jz p4_3d_no_rot_done
241
242	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
243	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
244
245	prefetch (%rdx)
246
247	movd (%rsi), %mm0		/*                 | m00             */
248	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
249	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
250
251	movd 40(%rsi), %mm2		/*                 | m22             */
252	movq 48(%rsi), %mm1		/* m31             | m30             */
253
254	punpckldq 56(%rsi), %mm2	/* m11             | m00             */
255
256p4_3d_no_rot_loop:
257
258	prefetchw 32(%rdi)
259
260	movq  (%rdx), %mm4		/* x1              | x0              */
261	movq  8(%rdx), %mm5		/* x3              | x2              */
262	movd  12(%rdx), %mm7		/*                 | x3              */
263
264	movq  %mm5, %mm6		/* x3              | x2              */
265	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
266
267	punpckhdq %mm6, %mm6		/* x3              | x3              */
268	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */
269
270	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
271	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */
272
273        pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
274
275	addq %rax, %rdx
276	movq %mm4, (%rdi)		/* write r0, r1                      */
277	movq %mm5, 8(%rdi)		/* write r2, r3                      */
278
279	addq $16, %rdi
280
281	decl %ecx
282	prefetch 32(%rdx)
283	jnz p4_3d_no_rot_loop
284
285p4_3d_no_rot_done:
286	femms
287	ret
288
289
290.align 16
291.globl _mesa_3dnow_transform_points4_perspective
292.hidden _mesa_3dnow_transform_points4_perspective
293_mesa_3dnow_transform_points4_perspective:
294
295	movl V4F_COUNT(%rdx), %ecx	/* count */
296	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
297
298	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
299	movl $4, V4F_SIZE(%rdi)		/* set dest size */
300	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
301
302	test %ecx, %ecx
303	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
304	jz p4_perspective_done
305
306	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
307	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
308
309	movd (%rsi), %mm0		/*                 | m00             */
310        pxor %mm7, %mm7			/* 0               | 0               */
311	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
312
313	movq 32(%rsi), %mm2		/* m21             | m20             */
314	prefetch (%rdx)
315
316	movd 40(%rsi), %mm1		/*                 | m22             */
317
318	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
319	punpckldq 56(%rsi), %mm1	/* m32             | m22             */
320
321
322p4_perspective_loop:
323
324	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
325
326	movq (%rdx), %mm4		/* x1              | x0              */
327	movq 8(%rdx), %mm5		/* x3              | x2              */
328	movd 8(%rdx), %mm3		/*                 | x2              */
329
330	movq %mm5, %mm6			/* x3              | x2              */
331	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
332
333	punpckldq %mm5, %mm5		/* x2              | x2              */
334
335	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
336	pfsubr %mm7, %mm3		/*                 | -x2             */
337
338	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
339	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */
340
341	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */
342
343	movq %mm5, (%rdi)		/* write r0, r1                      */
344	addq %rax, %rdx
345	movq %mm6, 8(%rdi)		/* write r2, r3                      */
346
347	addq $16, %rdi
348
349	decl %ecx
350	prefetch 32(%rdx)		/* hopefully stride is zero          */
351	jnz p4_perspective_loop
352
353p4_perspective_done:
354	femms
355	ret
356
357.align 16
358.globl _mesa_3dnow_transform_points4_2d_no_rot
359.hidden _mesa_3dnow_transform_points4_2d_no_rot
360_mesa_3dnow_transform_points4_2d_no_rot:
361
362	movl V4F_COUNT(%rdx), %ecx	/* count */
363	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
364
365	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
366	movl $4, V4F_SIZE(%rdi)		/* set dest size */
367	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
368
369	test %ecx, %ecx
370	.byte 0x90			/* manual align += 1 */
371	jz p4_2d_no_rot_done
372
373	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
374	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
375
376	movd (%rsi), %mm0		/*                 | m00             */
377	prefetch (%rdx)
378	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
379
380	movq 48(%rsi), %mm1		/* m31             | m30             */
381
382p4_2d_no_rot_loop:
383
384	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
385
386	movq (%rdx), %mm4		/* x1              | x0              */
387	movq 8(%rdx), %mm5		/* x3              | x2              */
388
389	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
390	movq %mm5, %mm6			/* x3              | x2              */
391
392	punpckhdq %mm6, %mm6		/* x3              | x3              */
393
394	addq %rax, %rdx
395	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
396
397	prefetch 32(%rdx)		/* hopefully stride is zero          */
398	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
399
400	movq %mm6, (%rdi)		/* write r0, r1                      */
401	movq %mm5, 8(%rdi)		/* write r2, r3                      */
402
403	addq $16, %rdi
404
405	decl %ecx
406	jnz p4_2d_no_rot_loop
407
408p4_2d_no_rot_done:
409	femms
410	ret
411
412
413.align 16
414.globl _mesa_3dnow_transform_points4_2d
415.hidden _mesa_3dnow_transform_points4_2d
416_mesa_3dnow_transform_points4_2d:
417
418	movl V4F_COUNT(%rdx), %ecx	/* count */
419	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
420
421	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
422	movl $4, V4F_SIZE(%rdi)		/* set dest size */
423	.byte 0x66, 0x66, 0x90		/* manual align += 4 */
424	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
425
426	test %ecx, %ecx
427	.byte 0x66, 0x66, 0x90		/* manual align += 4 */
428	jz p4_2d_done
429
430	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
431	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
432
433	movd (%rsi), %mm0		/*                 | m00             */
434	movd 4(%rsi), %mm1		/*                 | m01             */
435
436	prefetch (%rdx)
437
438	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
439	.byte 0x66, 0x66, 0x90		/* manual align += 4 */
440	punpckldq 20(%rsi), %mm1	/* m11             | m01             */
441
442	movq 48(%rsi), %mm2		/* m31             | m30             */
443
444p4_2d_loop:
445
446	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
447
448	movq (%rdx), %mm3		/* x1              | x0              */
449	movq 8(%rdx), %mm5		/* x3              | x2              */
450
451	movq %mm3, %mm4			/* x1              | x0              */
452	movq %mm5, %mm6			/* x3              | x2              */
453
454	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
455	punpckhdq %mm6, %mm6		/* x3              | x3              */
456
457	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */
458
459	addq %rax, %rdx
460	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
461
462	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
463	prefetch 32(%rdx)		/* hopefully stride is zero          */
464
465	pfadd %mm6, %mm3		/* r1              | r0              */
466
467	movq %mm3, (%rdi)		/* write r0, r1                      */
468	movq %mm5, 8(%rdi)		/* write r2, r3                      */
469
470	addq $16, %rdi
471
472	decl %ecx
473	jnz p4_2d_loop
474
475p4_2d_done:
476	femms
477	ret
478
479#endif
480
481#if defined (__ELF__) && defined (__linux__)
482	.section .note.GNU-stack,"",%progbits
483#endif
484