xform4.S revision 42fa81275c67d7d1ad8d255120af0ffeeb46b963
/* $Id: xform4.S,v 1.1 2005/05/07 16:59:59 brianp Exp $ */

/*
 * Mesa 3-D graphics library
 * Version:  3.5
 *
 * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * Syntax: GNU as, AT&T operand order (op src, dst), run through the C
 * preprocessor.  Field offsets (V4F_*) and VEC_SIZE_4 come from matypes.h.
 *
 * Every routine in this file uses the same three register arguments:
 *	rdi = dest   vector descriptor
 *	rsi = matrix (16 floats, referred to below as m0..m15)
 *	rdx = source vector descriptor
 *
 * Every routine copies the source count to the destination, sets the
 * destination size to 4 and ORs VEC_SIZE_4 into the destination flags,
 * then writes one 16-byte vertex per source vertex.  The destination is
 * always advanced by 16 bytes; the source is advanced by the stride.
 *
 * NOTE(review): only the low byte of the stride field is loaded (movzbl).
 * This is the byte-sized form of the original unsuffixed "movzx" -- confirm
 * that strides never exceed 255.
 *
 * Register comments are written high element first: "e3 | e2 | e1 | e0".
 * The ".byte 0xf3 / ret" pairs encode "rep ret"; the ".byte 0x66 ... 0x90"
 * sequences are padding NOPs inserted by hand for code alignment.
 */

#ifdef USE_X86_64_ASM

#include "matypes.h"

.text

.align 16

/*
 * _mesa_x86_64_transform_points4_general
 *
 * Full 4x4 transform using SSE: dest = ox*(m0..m3) + oy*(m4..m7) +
 * oz*(m8..m11) + ow*(m12..m15) for each vertex.
 * Source vertices, destination vertices and the matrix are accessed with
 * movaps, so all three must be 16-byte aligned.
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm8, flags.
 */
.globl _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
/*
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte only) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* does not modify flags */
	jz p4_general_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movaps (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance src by stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi

	decl %ecx
	jnz p4_general_loop

p4_general_done:
	.byte 0xf3
	ret

.section .rodata

.align 16
/*
 * Two 16-byte constants used by _mesa_x86_64_transform_points4_3d:
 *   +0:  AND mask that keeps elements 0..2 and clears element 3
 *   +16: 0.0 | 0.0 | 0.0 in elements 0..2 and 1.0 in element 3
 */
p4_constants:
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0x00, 0x00, 0x00, 0x00

.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.float 0f+1.0

.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same inner loop as _mesa_x86_64_transform_points4_general, but the matrix
 * is first patched in registers so that m3, m7 and m11 are 0.0 and m15 is
 * 1.0 (one element of each 16-byte group, i.e. the last row when the
 * matrix is stored column by column).  This extra setup makes it slower
 * than the general routine.
 * Same alignment requirements as the general routine (movaps).
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm10, flags.
 */
_mesa_x86_64_transform_points4_3d:

	leaq p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps (%rax), %xmm9		/* 0 | ~0 | ~0 | ~0  (AND mask) */
	movaps 16(%rax), %xmm10		/* 1.0 | 0.0 | 0.0 | 0.0 */

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte only) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_3d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orps %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movaps (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetchw 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance src by stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*0 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* running sum of the x and y terms */
	mulps %xmm7, %xmm3		/* ow*1 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ... plus the z terms */
	prefetch 16(%rdx)
	addps %xmm3, %xmm0		/* ... plus the w terms */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi

	dec %ecx
	jnz p4_3d_loop

p4_3d_done:
	.byte 0xf3
	ret


.align 16
.globl _mesa_x86_64_transform_points4_identity
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Copies every source vertex (16 bytes) to the destination unchanged; the
 * matrix contents are not read.
 *   - source start == dest start: nothing to copy, return.
 *   - stride == 16: source is contiguous, one "rep movsq" of 2*count
 *     quadwords does the whole copy.
 *   - any other stride: copy one vertex at a time, advancing the source
 *     by the stride.
 * Clobbers: rax, rcx, rsi, rdi, r8, r9, flags.
 */
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte only) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	jz p4_identity_done

	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	cmpq %rsi, %rdi			/* copying a buffer onto itself? */
	je p4_identity_done

	prefetch 64(%rsi)
	prefetchw 64(%rdi)

	cmpl $16, %eax			/* tightly packed source? */
	jne p4_identity_strided_loop

	add %ecx, %ecx			/* 2 quadwords per vertex */

	rep movsq

p4_identity_done:
	.byte 0xf3
	ret

p4_identity_strided_loop:

	movq (%rsi), %r8		/* x1 | x0 */
	movq 8(%rsi), %r9		/* x3 | x2 */
	addq %rax, %rsi			/* advance src by stride */

	movq %r8, (%rdi)
	movq %r9, 8(%rdi)
	addq $16, %rdi

	decl %ecx
	jnz p4_identity_strided_loop

	.byte 0xf3
	ret


.align 16
.globl _mesa_x86_64_transform_points4_3d_no_rot
/*
 * _mesa_x86_64_transform_points4_3d_no_rot
 *
 * 3DNow! version.  Reads only the matrix floats at byte offsets 0, 20, 40,
 * 48, 52 and 56 (named m00, m11, m22, m30, m31, m32 in the comments) and
 * computes per vertex:
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2*m22 + x3*m32
 *	r3 = x3
 * Ends with femms to leave MMX state.
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm2, mm4-mm7, flags.
 */
_mesa_x86_64_transform_points4_3d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte only) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_3d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetch (%rdx)

	movd (%rsi), %mm0		/*                 | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movd 40(%rsi), %mm2		/*                 | m22             */
	movq 48(%rsi), %mm1		/* m31             | m30             */

	punpckldq 56(%rsi), %mm2	/* m32             | m22             */

p4_3d_no_rot_loop:

	prefetchw 32(%rdi)

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 12(%rdx), %mm7		/*                 | x3              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckhdq %mm6, %mm6		/* x3              | x3              */
	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */

	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */

	pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	addq %rax, %rdx			/* advance src by stride */
	movq %mm4, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetch 32(%rdx)		/* does not modify flags */
	jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms
	ret


.align 16
.globl _mesa_x86_64_transform_points4_perspective
/*
 * _mesa_x86_64_transform_points4_perspective
 *
 * 3DNow! version.  Reads only the matrix floats at byte offsets 0, 20, 32,
 * 36, 40 and 56 (named m00, m11, m20, m21, m22, m32 in the comments) and
 * computes per vertex:
 *	r0 = x0*m00 + x2*m20
 *	r1 = x1*m11 + x2*m21
 *	r2 = x2*m22 + x3*m32
 *	r3 = -x2
 * Ends with femms to leave MMX state.
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm7, flags.
 */
_mesa_x86_64_transform_points4_perspective:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte only) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_perspective_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	pxor %mm7, %mm7			/* 0               | 0               */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 32(%rsi), %mm2		/* m21             | m20             */
	prefetch (%rdx)

	movd 40(%rsi), %mm1		/*                 | m22             */

	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32             | m22             */


p4_perspective_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 8(%rdx), %mm3		/*                 | x2              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckldq %mm5, %mm5		/* x2              | x2              */

	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
	pfsubr %mm7, %mm3		/* 0               | 0 - x2 = -x2    */

	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */

	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */

	movq %mm5, (%rdi)		/* write r0, r1                      */
	addq %rax, %rdx			/* advance src by stride */
	movq %mm6, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetch 32(%rdx)		/* hopefully stride is zero          */
	jnz p4_perspective_loop

p4_perspective_done:
	femms
	ret

.align 16
.globl _mesa_x86_64_transform_points4_2d_no_rot
/*
 * _mesa_x86_64_transform_points4_2d_no_rot
 *
 * 3DNow! version.  Reads only the matrix floats at byte offsets 0, 20, 48
 * and 52 (named m00, m11, m30, m31 in the comments) and computes per
 * vertex:
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * Ends with femms to leave MMX state.
 * Clobbers: rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6, flags.
 */
_mesa_x86_64_transform_points4_2d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte only) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x90			/* manual align += 1 */
	jz p4_2d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 48(%rsi), %mm1		/* m31             | m30             */

p4_2d_no_rot_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
	movq %mm5, %mm6			/* x3              | x2              */

	punpckhdq %mm6, %mm6		/* x3              | x3              */

	addq %rax, %rdx			/* advance src by stride */
	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */

	prefetch 32(%rdx)		/* hopefully stride is zero          */
	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	movq %mm6, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (x2, x3 unchanged)   */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms
	ret


.align 16
.globl _mesa_x86_64_transform_points4_2d
/*
 * _mesa_x86_64_transform_points4_2d
 *
 * 3DNow! version.  Reads only the matrix floats at byte offsets 0, 4, 16,
 * 20, 48 and 52 (named m00, m01, m10, m11, m30, m31 in the comments) and
 * computes per vertex:
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * Ends with femms to leave MMX state.
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm6, flags.
 */
_mesa_x86_64_transform_points4_2d:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride (low byte only) */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_2d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	movd 4(%rsi), %mm1		/*                 | m01             */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11             | m01             */

	movq 48(%rsi), %mm2		/* m31             | m30             */

p4_2d_loop:

	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm3		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	movq %mm3, %mm4			/* x1              | x0              */
	movq %mm5, %mm6			/* x3              | x2              */

	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
	punpckhdq %mm6, %mm6		/* x3              | x3              */

	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */

	addq %rax, %rdx			/* advance src by stride */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */

	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
	prefetch 32(%rdx)		/* hopefully stride is zero          */

	pfadd %mm6, %mm3		/* r1              | r0              */

	movq %mm3, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3 (x2, x3 unchanged)   */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms
	ret

#endif