/* xform4.S revision 3b7df51eb5f5e2f980248f5e8c547869ea93ac78 */
/*
 * Mesa 3-D graphics library
 * Version:  7.1
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
2442fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
2542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul#ifdef USE_X86_64_ASM
2642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
2742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul#include "matypes.h"
2842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
.text

.align 16

.globl _mesa_x86_64_transform_points4_general
/*
 * _mesa_x86_64_transform_points4_general
 *
 * Multiplies each 4-float source vertex by the full 4x4 matrix and writes
 * the results as tightly packed 16-byte vertices.  With the matrix floats
 * numbered m0..m15 in memory order, result lane i is
 *
 *	ox*m[i] + oy*m[4+i] + oz*m[8+i] + ow*m[12+i]
 *
 * On entry:
 *	rdi = dest vector   (count, size and flags fields are written)
 *	rsi = matrix        (read with aligned 16-byte loads)
 *	rdx = source vector (count, stride and start pointer are read)
 *
 * Registers written: rax, rcx, rdx, rdi, xmm0-xmm8, flags.
 *
 * The instruction order and the padding bytes are hand-tuned for code
 * alignment; keep them as they are when editing.
 */
_mesa_x86_64_transform_points4_general:
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	/* NOTE(review): movzx has no size suffix on a memory source; confirm
	 * the width the assembler picks matches the V4F_STRIDE field. */
	movzx	V4F_STRIDE(%rdx), %eax		/* rax = byte step between source vertices */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 components */
	.byte	0x66, 0x66, 0x66, 0x90		/* padding: 4-byte prefixed nop */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	prefetchnta 64(%rsi)			/* does not disturb the flags from testl */
	jz	p4_general_done

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	prefetch 16(%rdx)

	/* Keep the whole matrix in registers for the duration of the loop. */
	movaps	(%rsi), %xmm4			/* xmm4 = m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* xmm5 = m7  | m6  | m5  | m4  */
	.byte	0x66, 0x66, 0x90		/* padding: 3-byte prefixed nop */
	movaps	32(%rsi), %xmm6			/* xmm6 = m11 | m10 | m9  | m8  */
	movaps	48(%rsi), %xmm7			/* xmm7 = m15 | m14 | m13 | m12 */

p4_general_loop:

	/* Unaligned load: the source pointer advances by an arbitrary stride. */
	movups	(%rdx), %xmm8			/* xmm8 = ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* xmm0 = ox in every lane */
	addq	%rax, %rdx			/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* xmm1 = oy in every lane */
	mulps	%xmm4, %xmm0			/* xmm0 = ox * (m3..m0) */
	pshufd	$0xAA, %xmm8, %xmm2		/* xmm2 = oz in every lane */
	mulps	%xmm5, %xmm1			/* xmm1 = oy * (m7..m4) */
	pshufd	$0xFF, %xmm8, %xmm3		/* xmm3 = ow in every lane */
	mulps	%xmm6, %xmm2			/* xmm2 = oz * (m11..m8) */
	addps	%xmm1, %xmm0			/* accumulate the oy terms */
	mulps	%xmm7, %xmm3			/* xmm3 = ow * (m15..m12) */
	addps	%xmm2, %xmm0			/* accumulate the oz terms */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* accumulate the ow terms: result complete */

	movaps	%xmm0, (%rdi)			/* aligned store of the transformed vertex */
	addq	$16, %rdi			/* dest vertices are packed, 16 bytes each */

	decl	%ecx
	jnz	p4_general_loop

p4_general_done:
	.byte	0xf3				/* rep prefix, making the return "rep ret" */
	ret
9142fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
.section .rodata

.align 16
/*
 * Two 16-byte constants used by _mesa_x86_64_transform_points4_3d:
 *
 *	bytes  0..15  AND mask that keeps lanes 0-2 and clears lane 3
 *	bytes 16..31  0.0 in lanes 0-2 and 1.0f in lane 3, OR-ed into the
 *	              last matrix row after masking
 */
p4_constants:
	.long	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000

	.long	0x00000000, 0x00000000, 0x00000000
	.float	1.0
10542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same multiply-accumulate as _mesa_x86_64_transform_points4_general, but
 * the matrix is first patched in registers: lane 3 of the first three
 * 16-byte rows is cleared and lane 3 of the last row is set to 1.0
 * (i.e. m3 = m7 = m11 = 0, m15 = 1).  The matrix in memory is not modified.
 * This extra setup makes the routine slower than the general one.
 *
 * On entry:
 *	rdi = dest vector   (count, size and flags fields are written)
 *	rsi = matrix        (read with aligned 16-byte loads)
 *	rdx = source vector (count, stride and start pointer are read)
 *
 * Registers written: rax, rcx, rdx, rdi, xmm0-xmm10, flags.
 */
_mesa_x86_64_transform_points4_3d:

	leaq	p4_constants(%rip), %rax	/* rax -> lane mask / 1.0 constants */

	prefetchnta 64(%rsi)

	movaps	(%rax), %xmm9			/* xmm9  = 0 | ~0 | ~0 | ~0 (AND mask) */
	movaps	16(%rax), %xmm10		/* xmm10 = 1.0 | 0 | 0 | 0 */

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	/* NOTE(review): movzx has no size suffix on a memory source; confirm
	 * the width the assembler picks matches the V4F_STRIDE field. */
	movzx	V4F_STRIDE(%rdx), %eax		/* rax = byte step between source vertices */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 components */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	jz	p4_3d_done

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	prefetch 16(%rdx)

	movaps	0(%rsi), %xmm4			/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* m7  | m6  | m5  | m4  */
	andps	%xmm9, %xmm4			/* 0.0 | m2  | m1  | m0  */
	movaps	32(%rsi), %xmm6			/* m11 | m10 | m9  | m8  */
	andps	%xmm9, %xmm5			/* 0.0 | m6  | m5  | m4  */
	movaps	48(%rsi), %xmm7			/* m15 | m14 | m13 | m12 */
	andps	%xmm9, %xmm6			/* 0.0 | m10 | m9  | m8  */
	andps	%xmm9, %xmm7			/* 0.0 | m14 | m13 | m12 */
	.byte	0x66, 0x66, 0x90		/* padding: 3-byte prefixed nop */
	orps	%xmm10, %xmm7			/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	/*
	 * Unaligned load.  The source pointer advances by an arbitrary
	 * stride, so it is not guaranteed to stay 16-byte aligned; movaps
	 * would fault on such input.  This matches the load used by
	 * _mesa_x86_64_transform_points4_general.
	 */
	movups	(%rdx), %xmm8			/* xmm8 = ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* ox | ox | ox | ox */
	addq	%rax, %rdx			/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0			/* ox*0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2		/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1			/* oy*0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3		/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2			/* oz*0 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0			/* accumulate the oy terms */
	mulps	%xmm7, %xmm3			/* ow*1 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0			/* accumulate the oz terms */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* accumulate the ow terms: result complete */

	movaps	%xmm0, (%rdi)			/* aligned store of the transformed vertex */
	addq	$16, %rdi			/* dest vertices are packed, 16 bytes each */

	dec	%ecx
	jnz	p4_3d_loop

p4_3d_done:
	.byte	0xf3				/* rep prefix, making the return "rep ret" */
	ret
17642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
17742fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
.align 16
.globl _mesa_x86_64_transform_points4_identity
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Copies every 4-float source vertex unchanged into the destination, which
 * is written as tightly packed 16-byte vertices.
 *
 * On entry:
 *	rdi = dest vector   (count, size and flags fields are written)
 *	rsi = matrix        (never read; rsi is reused as the source pointer)
 *	rdx = source vector (count, stride and start pointer are read)
 *
 * Registers written: rax, rcx, rsi, rdi, xmm0, flags.
 *
 * The source stride is honoured.  A plain "rep movsq" reads the source as
 * one contiguous run and is therefore only correct when consecutive source
 * vertices are exactly 16 bytes apart; any other stride takes the
 * per-vertex copy loop below.
 */
_mesa_x86_64_transform_points4_identity:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to copy */
	/* NOTE(review): movzx has no size suffix on a memory source; confirm
	 * the width the assembler picks matches the V4F_STRIDE field. */
	movzx	V4F_STRIDE(%rdx), %eax		/* rax = byte step between source vertices */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 components */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	test	%ecx, %ecx			/* nothing to do for an empty source */
	jz	p4_identity_done

	movq	V4F_START(%rdx), %rsi		/* rsi -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl	$16, %eax			/* packed source? */
	jne	p4_identity_strided_loop

	/* Packed source: one block copy of count * 2 quadwords. */
	add	%ecx, %ecx			/* 32-bit add also clears the top half of rcx */

	rep movsq

p4_identity_done:
	.byte	0xf3				/* rep prefix, making the return "rep ret" */
	ret

	/* Strided source: copy one 16-byte vertex per iteration.  Unaligned
	 * moves are used so no alignment requirement is added. */
p4_identity_strided_loop:
	movups	(%rsi), %xmm0			/* load one source vertex */
	addq	%rax, %rsi			/* step by the source stride */
	movups	%xmm0, (%rdi)			/* store it unchanged */
	addq	$16, %rdi			/* dest vertices are packed */

	decl	%ecx
	jnz	p4_identity_strided_loop

	.byte	0xf3				/* rep prefix, making the return "rep ret" */
	ret
20442fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
20542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
.align 16
.globl _mesa_x86_64_transform_points4_3d_no_rot
/*
 * _mesa_x86_64_transform_points4_3d_no_rot
 *
 * 3DNow! transform that reads only matrix floats 0, 5, 10, 12, 13 and 14
 * (numbered in memory order).  For a source vertex (x0, x1, x2, x3) it
 * stores
 *
 *	r0 = x0*m0  + x3*m12
 *	r1 = x1*m5  + x3*m13
 *	r2 = x2*m10 + x3*m14
 *	r3 = x3
 *
 * On entry:
 *	rdi = dest vector   (count, size and flags fields are written)
 *	rsi = matrix
 *	rdx = source vector (count, stride and start pointer are read)
 *
 * Registers written: rax, rcx, rdx, rdi, mm0-mm2, mm4-mm7, flags.
 * femms is executed before returning, including on the empty-input path.
 *
 * The instruction order and the padding bytes are hand-tuned for code
 * alignment; keep them as they are when editing.
 */
_mesa_x86_64_transform_points4_3d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	/* NOTE(review): movzx has no size suffix on a memory source; confirm
	 * the width the assembler picks matches the V4F_STRIDE field. */
	movzx	V4F_STRIDE(%rdx), %eax		/* rax = byte step between source vertices */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 components */
	.byte	0x66, 0x66, 0x90		/* padding: 3-byte prefixed nop */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	.byte	0x66, 0x66, 0x90		/* padding: 3-byte prefixed nop */
	jz	p4_3d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	prefetch (%rdx)

	/* Gather the six matrix elements that are used into mm0-mm2. */
	movd	(%rsi), %mm0			/* mm0 =  0  | m0  */
	.byte	0x66, 0x66, 0x90		/* padding: 3-byte prefixed nop */
	punpckldq 20(%rsi), %mm0		/* mm0 = m5  | m0  */

	movd	40(%rsi), %mm2			/* mm2 =  0  | m10 */
	movq	48(%rsi), %mm1			/* mm1 = m13 | m12 */

	punpckldq 56(%rsi), %mm2		/* mm2 = m14 | m10 */

p4_3d_no_rot_loop:

	prefetchw 32(%rdi)

	movq	(%rdx), %mm4			/* mm4 = x1 | x0 */
	movq	8(%rdx), %mm5			/* mm5 = x3 | x2 */
	movd	12(%rdx), %mm7			/* mm7 =  0 | x3 */

	movq	%mm5, %mm6			/* mm6 = x3 | x2 */
	pfmul	%mm0, %mm4			/* mm4 = x1*m5 | x0*m0 */

	punpckhdq %mm6, %mm6			/* mm6 = x3 | x3 */
	pfmul	%mm2, %mm5			/* mm5 = x3*m14 | x2*m10 */

	pfmul	%mm1, %mm6			/* mm6 = x3*m13 | x3*m12 */
	pfacc	%mm7, %mm5			/* mm5 = x3 | x2*m10 + x3*m14 */

	pfadd	%mm6, %mm4			/* mm4 = x1*m5 + x3*m13 | x0*m0 + x3*m12 */

	addq	%rax, %rdx			/* step to the next source vertex */
	movq	%mm4, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi			/* dest vertices are packed, 16 bytes each */

	decl	%ecx
	prefetch 32(%rdx)			/* leaves the flags from decl intact */
	jnz	p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms					/* leave MMX/3DNow! state before returning */
	ret
26842fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
26942fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
.align 16
.globl _mesa_x86_64_transform_points4_perspective
/*
 * _mesa_x86_64_transform_points4_perspective
 *
 * 3DNow! transform that reads only matrix floats 0, 5, 8, 9, 10 and 14
 * (numbered in memory order).  For a source vertex (x0, x1, x2, x3) it
 * stores
 *
 *	r0 = x0*m0  + x2*m8
 *	r1 = x1*m5  + x2*m9
 *	r2 = x2*m10 + x3*m14
 *	r3 = -x2
 *
 * On entry:
 *	rdi = dest vector   (count, size and flags fields are written)
 *	rsi = matrix
 *	rdx = source vector (count, stride and start pointer are read)
 *
 * Registers written: rax, rcx, rdx, rdi, mm0-mm7, flags.
 * femms is executed before returning, including on the empty-input path.
 *
 * The instruction order and the padding bytes are hand-tuned for code
 * alignment; keep them as they are when editing.
 */
_mesa_x86_64_transform_points4_perspective:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	/* NOTE(review): movzx has no size suffix on a memory source; confirm
	 * the width the assembler picks matches the V4F_STRIDE field. */
	movzx	V4F_STRIDE(%rdx), %eax		/* rax = byte step between source vertices */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 components */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	.byte	0x66, 0x66, 0x90		/* padding: 3-byte prefixed nop */
	jz	p4_perspective_done

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	/* Gather the six matrix elements that are used, plus a zero register. */
	movd	(%rsi), %mm0			/* mm0 =  0  | m0  */
	pxor	%mm7, %mm7			/* mm7 =  0  | 0   */
	punpckldq 20(%rsi), %mm0		/* mm0 = m5  | m0  */

	movq	32(%rsi), %mm2			/* mm2 = m9  | m8  */
	prefetch (%rdx)

	movd	40(%rsi), %mm1			/* mm1 =  0  | m10 */

	.byte	0x66, 0x66, 0x90		/* padding: 3-byte prefixed nop */
	punpckldq 56(%rsi), %mm1		/* mm1 = m14 | m10 */


p4_perspective_loop:

	prefetchw 32(%rdi)			/* touch the dest 32 bytes ahead */

	movq	(%rdx), %mm4			/* mm4 = x1 | x0 */
	movq	8(%rdx), %mm5			/* mm5 = x3 | x2 */
	movd	8(%rdx), %mm3			/* mm3 =  0 | x2 */

	movq	%mm5, %mm6			/* mm6 = x3 | x2 */
	pfmul	%mm0, %mm4			/* mm4 = x1*m5 | x0*m0 */

	punpckldq %mm5, %mm5			/* mm5 = x2 | x2 */

	pfmul	%mm2, %mm5			/* mm5 = x2*m9 | x2*m8 */
	pfsubr	%mm7, %mm3			/* mm3 = 0 - mm3 =  0 | -x2 */

	pfmul	%mm1, %mm6			/* mm6 = x3*m14 | x2*m10 */
	pfadd	%mm4, %mm5			/* mm5 = x1*m5 + x2*m9 | x0*m0 + x2*m8 */

	pfacc	%mm3, %mm6			/* mm6 = -x2 | x2*m10 + x3*m14 */

	movq	%mm5, (%rdi)			/* store r0, r1 */
	addq	%rax, %rdx			/* step to the next source vertex */
	movq	%mm6, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi			/* dest vertices are packed, 16 bytes each */

	decl	%ecx
	prefetch 32(%rdx)			/* leaves the flags from decl intact */
	jnz	p4_perspective_loop

p4_perspective_done:
	femms					/* leave MMX/3DNow! state before returning */
	ret
33542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
.align 16
.globl _mesa_x86_64_transform_points4_2d_no_rot
/*
 * _mesa_x86_64_transform_points4_2d_no_rot
 *
 * 3DNow! transform that reads only matrix floats 0, 5, 12 and 13 (numbered
 * in memory order).  For a source vertex (x0, x1, x2, x3) it stores
 *
 *	r0 = x0*m0 + x3*m12
 *	r1 = x1*m5 + x3*m13
 *	r2 = x2
 *	r3 = x3
 *
 * On entry:
 *	rdi = dest vector   (count, size and flags fields are written)
 *	rsi = matrix
 *	rdx = source vector (count, stride and start pointer are read)
 *
 * Registers written: rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6, flags.
 * femms is executed before returning, including on the empty-input path.
 *
 * The instruction order and the padding byte are hand-tuned for code
 * alignment; keep them as they are when editing.
 */
_mesa_x86_64_transform_points4_2d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	/* NOTE(review): movzx has no size suffix on a memory source; confirm
	 * the width the assembler picks matches the V4F_STRIDE field. */
	movzx	V4F_STRIDE(%rdx), %eax		/* rax = byte step between source vertices */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 components */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	.byte	0x90				/* padding: 1-byte nop */
	jz	p4_2d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* rdx -> first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	/* Gather the four matrix elements that are used into mm0 and mm1. */
	movd	(%rsi), %mm0			/* mm0 =  0  | m0  */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0		/* mm0 = m5  | m0  */

	movq	48(%rsi), %mm1			/* mm1 = m13 | m12 */

p4_2d_no_rot_loop:

	prefetchw 32(%rdi)			/* touch the dest 32 bytes ahead */

	movq	(%rdx), %mm4			/* mm4 = x1 | x0 */
	movq	8(%rdx), %mm5			/* mm5 = x3 | x2 (stored unchanged below) */

	pfmul	%mm0, %mm4			/* mm4 = x1*m5 | x0*m0 */
	movq	%mm5, %mm6			/* mm6 = x3 | x2 */

	punpckhdq %mm6, %mm6			/* mm6 = x3 | x3 */

	addq	%rax, %rdx			/* step to the next source vertex */
	pfmul	%mm1, %mm6			/* mm6 = x3*m13 | x3*m12 */

	prefetch 32(%rdx)			/* touch the source ahead of the next vertex */
	pfadd	%mm4, %mm6			/* mm6 = x1*m5 + x3*m13 | x0*m0 + x3*m12 */

	movq	%mm6, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi			/* dest vertices are packed, 16 bytes each */

	decl	%ecx
	jnz	p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms					/* leave MMX/3DNow! state before returning */
	ret
38942fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
39042fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
/*
 * _mesa_x86_64_transform_points4_2d
 *
 * Transform an array of 4-component float vertices by the 2D part of a
 * matrix, using 3DNow! instructions.  For every source vertex (x0..x3):
 *
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 *
 * The matrix elements are read from byte offsets 0 (m00), 4 (m01),
 * 16 (m10), 20 (m11), 48 (m30) and 52 (m31) of the matrix.
 *
 * In:
 *	rdi = dest vector   (its count, size and flags fields are updated
 *	                     even when the source count is zero)
 *	rsi = matrix
 *	rdx = source vector
 *
 * Source vertices are read V4F_STRIDE bytes apart; destination vertices
 * are written 16 bytes apart.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm6, flags.  The MMX state is
 * released with femms before returning.
 */
.align 16
.globl _mesa_x86_64_transform_points4_2d
_mesa_x86_64_transform_points4_2d:

	movl V4F_COUNT(%rdx), %ecx	/* ecx = number of vertices          */
	/*
	 * Explicit byte-sized load: only the low byte of the stride field
	 * is read and zero-extended into rax.
	 * NOTE(review): a stride above 255 would be truncated if the field
	 * is wider than one byte - confirm against the vector definition.
	 */
	movzbl V4F_STRIDE(%rdx), %eax	/* rax = source stride in bytes      */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* 3-byte nop, hand-placed padding   */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* 3-byte nop, hand-placed padding   */
	jz p4_2d_done			/* nothing to transform              */

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	/* Load the six matrix elements used by the 2D transform. */
	movd (%rsi), %mm0		/*                 | m00             */
	movd 4(%rsi), %mm1		/*                 | m01             */

	prefetch (%rdx)			/* start fetching the first vertex   */

	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
	.byte 0x66, 0x66, 0x90		/* 3-byte nop, hand-placed padding   */
	punpckldq 20(%rsi), %mm1	/* m11             | m01             */

	movq 48(%rsi), %mm2		/* m31             | m30             */

/* Invariant: ecx = vertices remaining (> 0), rdx = src, rdi = dest. */
p4_2d_loop:

	prefetchw 32(%rdi)		/* prefetch 2 dest vertices ahead    */

	movq (%rdx), %mm3		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	movq %mm3, %mm4			/* x1              | x0              */
	movq %mm5, %mm6			/* x3              | x2              */

	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
	punpckhdq %mm6, %mm6		/* x3              | x3              */

	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */

	addq %rax, %rdx			/* advance src by one stride         */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */

	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
	/*
	 * Prefetch 32 bytes past the next source vertex; that is two
	 * vertices ahead when the source is tightly packed (stride 16).
	 */
	prefetch 32(%rdx)

	pfadd %mm6, %mm3		/* r1              | r0              */

	movq %mm3, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (copied unchanged)   */

	addq $16, %rdi			/* dest vertices are 16 bytes apart  */

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms				/* leave MMX/3DNow! state            */
	ret
45542fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul
45642fa81275c67d7d1ad8d255120af0ffeeb46b963Brian Paul#endif
457fcdc6a7d2488defd66bc7e8398c6d8c9a6190a1aKristian Høgsberg
/*
 * On ELF/Linux, emit an empty .note.GNU-stack section: the GNU toolchain
 * convention by which an object declares it needs no executable stack.
 */
#if defined(__ELF__) && defined(__linux__)
	.section .note.GNU-stack, "", %progbits
#endif
461