/*
 * Mesa 3-D graphics library
 * Version:  7.1
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#ifdef USE_X86_64_ASM

/*
 * The routines below use V4F_COUNT, V4F_STRIDE, V4F_SIZE, V4F_FLAGS,
 * V4F_START and VEC_SIZE_4 without defining them; this header is expected
 * to supply them (NOTE(review): confirm against matypes.h).
 *
 * Syntax: GNU as, AT&T operand order, preprocessed by cpp.
 * Arguments arrive in rdi, rsi, rdx (System V AMD64 register order).
 */
#include "matypes.h"

.text

/*
 * _mesa_x86_64_cpuid
 *
 * Run CPUID on a four-dword buffer, in place.
 *
 * In:    rdi = pointer to four consecutive 32-bit words
 *              word at offset 0 is loaded into eax before CPUID
 *              word at offset 8 is loaded into ecx before CPUID
 * Out:   offsets 0, 4, 8, 12 receive eax, ebx, ecx, edx respectively
 * Clobb: rax, rcx, rdx (rbx is saved and restored)
 */
.align 16
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	pushq	%rbx			/* cpuid overwrites ebx; rbx is callee-saved */
	movl	(%rdi), %eax
	movl	8(%rdi), %ecx

	cpuid

	movl	%ebx, 4(%rdi)
	movl	%eax, (%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx
	ret

/*
 * _mesa_x86_64_transform_points4_general
 *
 * Transform every 4-float source vertex by the full 4x4 matrix:
 *   out = ox*(m0..m3) + oy*(m4..m7) + oz*(m8..m11) + ow*(m12..m15)
 *
 * In:    rdi = dest vector descriptor
 *        rsi = matrix, 16 floats; loaded with movaps, so 16-byte aligned
 *        rdx = source vector descriptor
 * Out:   dest count, size (4) and flags (|= VEC_SIZE_4) are updated even
 *        when count is zero; one 16-byte result per vertex is stored with
 *        movaps starting at the dest start pointer (16-byte aligned)
 * Clobb: rax, rcx, rdx, rdi, xmm0-xmm8, flags
 *
 * Lane diagrams in the comments list the highest lane first.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated -- confirm the field width.
 */
.align 16
.globl _mesa_x86_64_transform_points4_general
.hidden _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride in bytes, zero-extended into rax */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 4 (prefixed nop) */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* leaves the flags from testl intact */
	jz p4_general_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox (source may be unaligned) */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance to next src vertex */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi			/* dest vertices are packed 16 bytes apart */

	decl %ecx
	jnz p4_general_loop

p4_general_done:
	.byte 0xf3			/* rep prefix: encodes the two-byte "rep ret" */
	ret

.section .rodata

/*
 * p4_constants: two 16-byte vectors, loaded with movaps by
 * _mesa_x86_64_transform_points4_3d.
 *
 *   offset  0: AND mask -- all ones in lanes 0-2, zero in lane 3
 *   offset 16: OR value -- 0.0 in lanes 0-2, 1.0f in lane 3
 */
.align 16
p4_constants:
.long  0xffffffff, 0xffffffff, 0xffffffff, 0x00000000

.long  0x00000000, 0x00000000, 0x00000000
.float 1.0

.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
.hidden _mesa_x86_64_transform_points4_3d
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same arithmetic as _mesa_x86_64_transform_points4_general, except that
 * before the loop matrix elements m3, m7 and m11 are forced to 0.0 and
 * m15 to 1.0 (the 16 floats at rsi are not modified, only the register
 * copies).  That extra masking is why this is slower than the general
 * routine.  Those four elements form the last row if the matrix is stored
 * column-major (NOTE(review): confirm the storage order on the C side).
 *
 * In:    rdi = dest vector descriptor
 *        rsi = matrix, 16 floats, 16-byte aligned (movaps)
 *        rdx = source vector descriptor
 * Out:   dest count, size (4) and flags (|= VEC_SIZE_4) updated even when
 *        count is zero; one 16-byte result per vertex stored with movaps
 * Clobb: rax, rcx, rdx, rdi, xmm0-xmm10, flags
 *
 * Lane diagrams in the comments list the highest lane first.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated -- confirm the field width.
 */
_mesa_x86_64_transform_points4_3d:

	leaq p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps (%rax), %xmm9		/* 0 | ~0 | ~0 | ~0   (AND mask) */
	movaps 16(%rax), %xmm10		/* 1.0 | 0.0 | 0.0 | 0.0 */

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride in bytes; rax is free again here */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_3d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps  %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox (source may be unaligned) */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance to next src vertex */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*0.0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*0.0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*0.0 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* sum of the ox and oy terms */
	mulps %xmm7, %xmm3		/* ow*1.0 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ... plus the oz terms */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ... plus the ow terms */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi			/* dest vertices are packed 16 bytes apart */

	decl %ecx
	jnz p4_3d_loop

p4_3d_done:
	.byte 0xf3			/* rep prefix: encodes the two-byte "rep ret" */
	ret


/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Copy every 4-float source vertex to the destination unchanged.
 *
 * In:    rdi = dest vector descriptor
 *        rsi = matrix (ignored; the register is reused as the copy source)
 *        rdx = source vector descriptor
 * Out:   dest count, size (4) and flags (|= VEC_SIZE_4) updated even when
 *        count is zero; count 16-byte vertices written contiguously from
 *        the dest start pointer
 * Clobb: rax, rcx, rsi, rdi, xmm0, flags
 *
 * A source stride of exactly 16 bytes takes the bulk "rep movsq" path.
 * Any other stride is copied one vertex at a time, stepping the source by
 * the stride as the other transform routines in this file do.  (The bulk
 * copy alone would read count*16 contiguous bytes and ignore the stride.)
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated -- confirm the field width.
 */
.align 16
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride in bytes, zero-extended into rax */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_identity_done

	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl $16, %eax			/* source tightly packed? */
	jne .Lp4_identity_strided

	addl %ecx, %ecx			/* two quadwords per vertex */

	rep movsq			/* forward copy; DF is clear on entry per the ABI */

p4_identity_done:
	.byte 0xf3			/* rep prefix: encodes the two-byte "rep ret" */
	ret

.Lp4_identity_strided:
	movups (%rsi), %xmm0		/* one vertex; no alignment assumed */
	addq %rax, %rsi			/* advance src by its stride */
	movups %xmm0, (%rdi)
	addq $16, %rdi			/* dest vertices are packed 16 bytes apart */
	decl %ecx
	jnz .Lp4_identity_strided
	ret


/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * Transform 4-float vertices using only matrix elements m00, m11, m22,
 * m30, m31 and m32 (mRC here is the float at byte offset 4*(4*R+C)):
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 *
 * In:    rdi = dest vector descriptor
 *        rsi = matrix (16 floats)
 *        rdx = source vector descriptor
 * Out:   dest count, size (4) and flags (|= VEC_SIZE_4) updated even when
 *        count is zero; one 16-byte result per vertex
 * Clobb: rax, rcx, rdx, rdi, mm0-mm7 (MMX state is reset by femms), flags
 *
 * Uses 3DNow! instructions (pfmul, pfadd, pfacc, femms); the CPU must
 * support them.  Register diagrams list the high dword first.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated -- confirm the field width.
 */
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
.hidden _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride in bytes, zero-extended into rax */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 (nop keeps the flags) */
	jz p4_3d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch (%rdx)

	movd (%rsi), %mm0		/*                 | m00             */
	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movd 40(%rsi), %mm2		/*                 | m22             */
	movq 48(%rsi), %mm1		/* m31             | m30             */

	punpckldq 56(%rsi), %mm2	/* m32             | m22             */

p4_3d_no_rot_loop:

	prefetchw 32(%rdi)

	movq  (%rdx), %mm4		/* x1              | x0              */
	movq  8(%rdx), %mm5		/* x3              | x2              */
	movd  12(%rdx), %mm7		/* 0               | x3              */

	movq  %mm5, %mm6		/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckhdq %mm6, %mm6		/* x3              | x3              */
	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */

	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */

	pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	addq %rax, %rdx			/* advance to next src vertex */
	movq %mm4, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetch 32(%rdx)		/* does not disturb the flags from decl */
	jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms				/* leave MMX/3DNow! state before returning */
	ret


/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * Transform 4-float vertices using only matrix elements m00, m11, m20,
 * m21, m22 and m32 (mRC here is the float at byte offset 4*(4*R+C)):
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 *
 * In:    rdi = dest vector descriptor
 *        rsi = matrix (16 floats)
 *        rdx = source vector descriptor
 * Out:   dest count, size (4) and flags (|= VEC_SIZE_4) updated even when
 *        count is zero; one 16-byte result per vertex
 * Clobb: rax, rcx, rdx, rdi, mm0-mm7 (MMX state is reset by femms), flags
 *
 * Uses 3DNow! instructions (pfmul, pfadd, pfacc, pfsubr, femms); the CPU
 * must support them.  Register diagrams list the high dword first.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated -- confirm the field width.
 */
.align 16
.globl _mesa_3dnow_transform_points4_perspective
.hidden _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride in bytes, zero-extended into rax */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 (nop keeps the flags) */
	jz p4_perspective_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	pxor %mm7, %mm7			/* 0               | 0               */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 32(%rsi), %mm2		/* m21             | m20             */
	prefetch (%rdx)

	movd 40(%rsi), %mm1		/*                 | m22             */

	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32             | m22             */


p4_perspective_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 8(%rdx), %mm3		/* 0               | x2              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckldq %mm5, %mm5		/* x2              | x2              */

	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
	pfsubr %mm7, %mm3		/* 0               | 0 - x2          */

	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */

	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */

	movq %mm5, (%rdi)		/* write r0, r1                      */
	addq %rax, %rdx			/* advance to next src vertex        */
	movq %mm6, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetch 32(%rdx)		/* does not disturb the flags from decl */
	jnz p4_perspective_loop

p4_perspective_done:
	femms				/* leave MMX/3DNow! state before returning */
	ret

/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * Transform 4-float vertices using only matrix elements m00, m11, m30
 * and m31 (mRC here is the float at byte offset 4*(4*R+C)):
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 *
 * In:    rdi = dest vector descriptor
 *        rsi = matrix (16 floats)
 *        rdx = source vector descriptor
 * Out:   dest count, size (4) and flags (|= VEC_SIZE_4) updated even when
 *        count is zero; one 16-byte result per vertex
 * Clobb: rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6 (MMX state is reset by
 *        femms), flags
 *
 * Uses 3DNow! instructions (pfmul, pfadd, femms); the CPU must support
 * them.  Register diagrams list the high dword first.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, so a
 * stride above 255 would be truncated -- confirm the field width.
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
.hidden _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride in bytes, zero-extended into rax */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x90			/* manual align += 1 (nop keeps the flags) */
	jz p4_2d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 48(%rsi), %mm1		/* m31             | m30             */

p4_2d_no_rot_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
	movq %mm5, %mm6			/* x3              | x2              */

	punpckhdq %mm6, %mm6		/* x3              | x3              */

	addq %rax, %rdx			/* advance to next src vertex        */
	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */

	prefetch 32(%rdx)
	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	movq %mm6, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (x2, x3 unchanged)   */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms				/* leave MMX/3DNow! state before returning */
	ret


/*
 * _mesa_3dnow_transform_points4_2d
 *
 * Transforms an array of 4-component float vertices with the 2D part of a
 * matrix, using 3DNow! packed single-precision arithmetic.  For every
 * source vertex (x0, x1, x2, x3) the routine stores
 *
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2		(copied unchanged)
 *	r3 = x3		(copied unchanged)
 *
 * Registers on entry (presumably dest, matrix, source in SysV AMD64
 * argument order -- confirm against the C prototype):
 *	%rdi	destination vector header; its COUNT, SIZE and FLAGS fields
 *		are updated, then %rdi is reloaded with its START pointer
 *	%rsi	matrix; floats are read at byte offsets 0, 4, 16, 20 and the
 *		8 bytes at offset 48
 *	%rdx	source vector header; COUNT, STRIDE and START are read, then
 *		%rdx is reloaded with its START pointer
 *
 * Inside the loop:
 *	%ecx	vertices still to process
 *	%rax	source stride in bytes
 *	%mm0	m10 | m00	%mm1	m11 | m01	%mm2	m31 | m30
 *
 * The destination header is updated even when the count is zero; in that
 * case no vertex data is touched.  Destination vertices are written 16
 * bytes apart regardless of the source stride.
 *
 * Modifies %rax, %rcx, %rdx, %rdi, %mm0-%mm6 and the flags.  femms is
 * executed on every exit path to leave MMX/3DNow! state.
 */
41342fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul.align 16
4142b8d8989fb6f9c36baf166fc715182a1407ebadbIan Romanick.globl _mesa_3dnow_transform_points4_2d
415124ef032233d7afc2725e8ded0939838e7b2a76bAdam Jackson.hidden _mesa_3dnow_transform_points4_2d
4162b8d8989fb6f9c36baf166fc715182a1407ebadbIan Romanick_mesa_3dnow_transform_points4_2d:
41742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
41842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movl V4F_COUNT(%rdx), %ecx	/* count */
/* NOTE(review): only one byte of the stride field is loaded here; confirm
 * that source strides never exceed 255 bytes. */
4193fda80246f0c41edebdfb4b1ce35bb4726a8c521Dimitry Andric	movzbl V4F_STRIDE(%rdx), %eax	/* stride (one byte, zero-extended into rax) */
42042fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
42142fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
42242fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movl $4, V4F_SIZE(%rdi)		/* set dest size */
42342fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	.byte 0x66, 0x66, 0x90		/* manual align += 3 (3-byte nop) */
42442fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
42542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
42642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	test %ecx, %ecx
42742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	.byte 0x66, 0x66, 0x90		/* manual align += 3 (3-byte nop) */
42842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	jz p4_2d_done
42942fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
43042fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
43142fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
43242fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
43342fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movd (%rsi), %mm0		/*                 | m00             */
43442fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movd 4(%rsi), %mm1		/*                 | m01             */
43542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
43642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	prefetch (%rdx)
43742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
43842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
43942fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	.byte 0x66, 0x66, 0x90		/* manual align += 3 (3-byte nop) */
44042fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	punpckldq 20(%rsi), %mm1	/* m11             | m01             */
44142fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
44242fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq 48(%rsi), %mm2		/* m31             | m30             */
44342fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
/* Per-vertex loop: mm0, mm1 and mm2 hold the matrix terms and are not
 * modified below. */
44442fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paulp4_2d_loop:
44542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
44642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
44742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
44842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq (%rdx), %mm3		/* x1              | x0              */
44942fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq 8(%rdx), %mm5		/* x3              | x2              */
45042fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
45142fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq %mm3, %mm4			/* x1              | x0              */
45242fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq %mm5, %mm6			/* x3              | x2              */
45342fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
45442fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
45542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	punpckhdq %mm6, %mm6		/* x3              | x3              */
45642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
45742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */
45842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
45942fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	addq %rax, %rdx			/* advance src ptr by the stride     */
46042fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
46142fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
46242fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
46342fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	prefetch 32(%rdx)		/* hopefully stride is zero          */
46442fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
46542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	pfadd %mm6, %mm3		/* r1              | r0              */
46642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
46742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq %mm3, (%rdi)		/* write r0, r1                      */
46842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	movq %mm5, 8(%rdi)		/* write r2, r3                      */
46942fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
47042fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	addq $16, %rdi			/* next dest vertex (4 floats)       */
47142fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
47242fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	decl %ecx			/* one vertex done                   */
47342fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	jnz p4_2d_loop
47442fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
47542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paulp4_2d_done:
47642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	femms				/* leave MMX/3DNow! state            */
47742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul	ret
47842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
47942fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul#endif
480fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg
/*
 * On ELF/Linux builds, emit an empty .note.GNU-stack section (no flags) so
 * that the GNU linker does not treat this object as requiring an
 * executable stack.
 */
481fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg#if defined (__ELF__) && defined (__linux__)
482fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg	.section .note.GNU-stack,"",%progbits
483fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg#endif
484