/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifdef USE_X86_64_ASM

#include "matypes.h"

/*
 * Conventions used throughout this file (AT&T syntax, run through cpp):
 *
 *   - Lane comments are written high lane first:  lane3 | lane2 | lane1 | lane0.
 *   - The transform routines all take
 *         rdi = dest vector, rsi = matrix (16 floats), rdx = source vector
 *     and access the vectors through the V4F_* offsets from matypes.h.
 *   - NOTE(review): the stride is fetched with movzbl, i.e. only the low
 *     byte of the V4F_STRIDE field is read.  Strides above 255 would be
 *     truncated -- confirm the field width/range against matypes.h.
 *   - ".byte 0xf3 ; ret" encodes "rep ret".
 *   - ".byte 0x66, ..., 0x90" sequences are prefixed NOPs used as manual
 *     padding; they do not modify registers or flags.
 */

.text

/*
 * _mesa_x86_64_cpuid
 *
 * In:   rdi = pointer to four consecutive 32-bit words.
 *             word 0 supplies EAX (leaf), word 2 supplies ECX (sub-leaf).
 * Out:  the four words are overwritten with EAX, EBX, ECX, EDX as returned
 *       by CPUID (in that order).
 * Saves/restores rbx around CPUID because the instruction overwrites ebx.
 */
.align 16
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	pushq	%rbx
	movl	(%rdi), %eax			/* leaf */
	movl	8(%rdi), %ecx			/* sub-leaf */

	cpuid

	movl	%ebx, 4(%rdi)
	movl	%eax, (%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx
	ret

/*
 * _mesa_x86_64_transform_points4_general
 *
 * rdi = dest
 * rsi = matrix
 * rdx = source
 *
 * For every source vertex (ox, oy, oz, ow) stores, for lane j = 0..3,
 *     ox*m[j] + oy*m[4+j] + oz*m[8+j] + ow*m[12+j]
 * Sets dest count (copied from source), dest size = 4 and ORs VEC_SIZE_4
 * into the dest flags, even when count is zero.
 *
 * The matrix is loaded with movaps and results are stored with movaps, so
 * the matrix and the dest data must be 16-byte aligned; the source is read
 * with movups.
 * Uses: rax, rcx, rdx, rdi, xmm0-xmm8, flags.
 */
.align 16
.globl _mesa_x86_64_transform_points4_general
.hidden _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
	movl	V4F_COUNT(%rdx), %ecx		/* count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	.byte	0x66, 0x66, 0x66, 0x90		/* manual align += 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	prefetchnta 64(%rsi)			/* does not touch flags */
	jz	p4_general_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps	0(%rsi), %xmm4			/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* m7  | m6  | m5  | m4  */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	movaps	32(%rsi), %xmm6			/* m11 | m10 | m9  | m8  */
	movaps	48(%rsi), %xmm7			/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movups	(%rdx), %xmm8			/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* ox | ox | ox | ox */
	addq	%rax, %rdx			/* advance src by stride */
	pshufd	$0x55, %xmm8, %xmm1		/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2		/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1			/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3		/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2			/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0			/* ox*m3+oy*m7 | ... */
	mulps	%xmm7, %xmm3			/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0			/* ox*m3+oy*m7+oz*m11 | ... */
	prefetcht1 16(%rdx)
	addps	%xmm3, %xmm0			/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps	%xmm0, (%rdi)			/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq	$16, %rdi

	decl	%ecx
	jnz	p4_general_loop

p4_general_done:
	.byte	0xf3
	ret

.section .rodata

/*
 * Two 16-byte constants used by _mesa_x86_64_transform_points4_3d:
 *   bytes  0..15 : AND mask, all ones in lanes 0-2 and zero in lane 3
 *   bytes 16..31 : OR value, zero in lanes 0-2 and 1.0f in lane 3
 */
.align 16
p4_constants:
.byte	0xff, 0xff, 0xff, 0xff
.byte	0xff, 0xff, 0xff, 0xff
.byte	0xff, 0xff, 0xff, 0xff
.byte	0x00, 0x00, 0x00, 0x00

.byte	0x00, 0x00, 0x00, 0x00
.byte	0x00, 0x00, 0x00, 0x00
.byte	0x00, 0x00, 0x00, 0x00
.float	1.0

.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
.hidden _mesa_x86_64_transform_points4_3d
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * rdi = dest, rsi = matrix, rdx = source
 *
 * Same computation as _mesa_x86_64_transform_points4_general, except that
 * the copy of the matrix held in registers has elements m3, m7 and m11
 * forced to 0.0 and m15 forced to 1.0 (via p4_constants) before the loop.
 * The matrix in memory is not modified.
 *
 * This is slower than _mesa_x86_64_transform_points4_general because of
 * that extra masking.
 * Uses: rax, rcx, rdx, rdi, xmm0-xmm10, flags.
 */
_mesa_x86_64_transform_points4_3d:

	leaq	p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps	(%rax), %xmm9			/* AND mask: clears lane 3 */
	movaps	16(%rax), %xmm10		/* OR value: 1.0 in lane 3 */

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (low byte only); rax is free again */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx			/* verify non-zero count */
	jz	p4_3d_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps	0(%rsi), %xmm4			/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5			/* m7  | m6  | m5  | m4  */
	andps	%xmm9, %xmm4			/* 0.0 | m2  | m1  | m0  */
	movaps	32(%rsi), %xmm6			/* m11 | m10 | m9  | m8  */
	andps	%xmm9, %xmm5			/* 0.0 | m6  | m5  | m4  */
	movaps	48(%rsi), %xmm7			/* m15 | m14 | m13 | m12 */
	andps	%xmm9, %xmm6			/* 0.0 | m10 | m9  | m8  */
	andps	%xmm9, %xmm7			/* 0.0 | m14 | m13 | m12 */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	orps	%xmm10, %xmm7			/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movups	(%rdx), %xmm8			/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* ox | ox | ox | ox */
	addq	%rax, %rdx			/* advance src by stride */
	pshufd	$0x55, %xmm8, %xmm1		/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0			/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2		/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1			/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3		/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2			/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0			/* ox*m3+oy*m7 | ... */
	mulps	%xmm7, %xmm3			/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0			/* ox*m3+oy*m7+oz*m11 | ... */
	prefetcht1 16(%rdx)
	addps	%xmm3, %xmm0			/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps	%xmm0, (%rdi)			/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq	$16, %rdi

	dec	%ecx
	jnz	p4_3d_loop

p4_3d_done:
	.byte	0xf3
	ret


/*
 * _mesa_x86_64_transform_points4_identity
 *
 * rdi = dest, rsi = matrix (not read), rdx = source
 *
 * Copies count 4-float vertices from source to dest unchanged.
 * Sets dest count (copied from source), dest size = 4 and ORs VEC_SIZE_4
 * into the dest flags, even when count is zero.
 *
 * Dest vertices are written 16 bytes apart.  When the source stride is
 * exactly 16 the data is contiguous and is copied with a single
 * "rep movsq" (two quadwords per vertex).  For any other stride each
 * vertex is copied individually so that the source pointer advances by
 * the stride, as in the other transform routines in this file.
 *
 * Uses: rax, rcx, rsi, rdi, xmm0, flags.
 */
.align 16
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	test	%ecx, %ecx
	jz	p4_identity_done

	movq	V4F_START(%rdx), %rsi		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */
	prefetcht1 64(%rsi)
	prefetcht1 64(%rdi)

	cmpl	$16, %eax			/* tightly packed source? */
	jne	p4_identity_strided

	add	%ecx, %ecx			/* 2 quadwords per vertex */

	rep movsq

	.byte	0xf3
	ret

p4_identity_strided:
	movups	(%rsi), %xmm0			/* one whole vertex; no alignment assumed */
	addq	%rax, %rsi			/* advance src by stride */
	movups	%xmm0, (%rdi)
	addq	$16, %rdi

	decl	%ecx
	jnz	p4_identity_strided

p4_identity_done:
	.byte	0xf3
	ret


/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * rdi = dest, rsi = matrix, rdx = source   (3DNow! / MMX registers)
 *
 * With mRC denoting the float at byte offset (R*4 + C)*4 of the matrix and
 * (x0, x1, x2, x3) the source vertex, each dest vertex is
 *     r0 = x0*m00 + x3*m30
 *     r1 = x1*m11 + x3*m31
 *     r2 = x2*m22 + x3*m32
 *     r3 = x3
 * Sets dest count (copied from source), dest size = 4 and ORs VEC_SIZE_4
 * into the dest flags, even when count is zero.
 * Executes femms before returning, including on the zero-count path.
 * Uses: rax, rcx, rdx, rdi, mm0-mm2, mm4-mm7, flags.
 */
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
.hidden _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* stride (low byte only) */

	movl	%ecx, V4F_COUNT(%rdi)		/* set dest count */
	movl	$4, V4F_SIZE(%rdi)		/* set dest size */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	test	%ecx, %ecx
	.byte	0x66, 0x66, 0x90		/* manual align += 3 (flags preserved) */
	jz	p4_3d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi		/* ptr to first dest vertex */

	prefetcht1 (%rdx)

	movd	(%rsi), %mm0			/*     | m00 */
	.byte	0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm0		/* m11 | m00 */

	movd	40(%rsi), %mm2			/*     | m22 */
	movq	48(%rsi), %mm1			/* m31 | m30 */

	punpckldq 56(%rsi), %mm2		/* m32 | m22 */

p4_3d_no_rot_loop:

	prefetcht1 32(%rdi)

	movq	(%rdx), %mm4			/* x1 | x0 */
	movq	8(%rdx), %mm5			/* x3 | x2 */
	movd	12(%rdx), %mm7			/*    | x3 */

	movq	%mm5, %mm6			/* x3 | x2 */
	pfmul	%mm0, %mm4			/* x1*m11 | x0*m00 */

	punpckhdq %mm6, %mm6			/* x3 | x3 */
	pfmul	%mm2, %mm5			/* x3*m32 | x2*m22 */

	pfmul	%mm1, %mm6			/* x3*m31 | x3*m30 */
	pfacc	%mm7, %mm5			/* x3 | x2*m22+x3*m32 */

	pfadd	%mm6, %mm4			/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	addq	%rax, %rdx			/* advance src by stride */
	movq	%mm4, (%rdi)			/* write r0, r1 */
	movq	%mm5, 8(%rdi)			/* write r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetcht1 32(%rdx)			/* does not touch flags */
	jnz	p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms
	ret


/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * Arguments:  rdi = dest vector, rsi = matrix, rdx = source vector.
 *
 * Writing mRC for the float stored at byte offset (R*4 + C)*4 of the
 * matrix and (x0, x1, x2, x3) for a source vertex, every dest vertex is
 *     r0 = x0*m00 + x2*m20
 *     r1 = x1*m11 + x2*m21
 *     r2 = x2*m22 + x3*m32
 *     r3 = -x2
 * The dest count is copied from the source, the dest size is set to 4 and
 * VEC_SIZE_4 is OR-ed into the dest flags before the count is tested.
 * femms is executed on every return path.
 * Registers written: rax, rcx, rdx, rdi, mm0-mm7, flags.
 */
.align 16
.globl _mesa_3dnow_transform_points4_perspective
.hidden _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:

	movl		V4F_COUNT(%rdx), %ecx		/* ecx = number of vertices */
	movzbl		V4F_STRIDE(%rdx), %eax		/* rax = src stride (low byte of the field) */

	movl		%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl		$4, V4F_SIZE(%rdi)		/* dest->size  = 4 */
	orl		$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl		%ecx, %ecx
	.byte		0x66, 0x66, 0x90		/* 3-byte padding NOP, flags untouched */
	jz		p4_perspective_done		/* nothing to transform */

	movq		V4F_START(%rdx), %rdx		/* rdx -> first src vertex */
	movq		V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	movd		(%rsi), %mm0			/* mm0 =   0 | m00 */
	pxor		%mm7, %mm7			/* mm7 =   0 | 0   (used to negate x2) */
	punpckldq	20(%rsi), %mm0			/* mm0 = m11 | m00 */

	movq		32(%rsi), %mm2			/* mm2 = m21 | m20 */
	prefetcht1	(%rdx)

	movd		40(%rsi), %mm1			/* mm1 =   0 | m22 */

	.byte		0x66, 0x66, 0x90		/* 3-byte padding NOP */
	punpckldq	56(%rsi), %mm1			/* mm1 = m32 | m22 */


p4_perspective_loop:

	prefetcht1	32(%rdi)			/* two dest vertices ahead */

	movq		(%rdx), %mm4			/* mm4 = x1 | x0 */
	movq		8(%rdx), %mm5			/* mm5 = x3 | x2 */
	movd		8(%rdx), %mm3			/* mm3 =  0 | x2 */

	movq		%mm5, %mm6			/* mm6 = x3 | x2 */
	pfmul		%mm0, %mm4			/* mm4 = x1*m11 | x0*m00 */

	punpckldq	%mm5, %mm5			/* mm5 = x2 | x2 */

	pfmul		%mm2, %mm5			/* mm5 = x2*m21 | x2*m20 */
	pfsubr		%mm7, %mm3			/* mm3 = 0 | 0 - x2 */

	pfmul		%mm1, %mm6			/* mm6 = x3*m32 | x2*m22 */
	pfadd		%mm4, %mm5			/* mm5 = x1*m11+x2*m21 | x0*m00+x2*m20 */

	pfacc		%mm3, %mm6			/* mm6 = -x2 | x2*m22+x3*m32 */

	movq		%mm5, (%rdi)			/* store r0, r1 */
	addq		%rax, %rdx			/* src += stride */
	movq		%mm6, 8(%rdi)			/* store r2, r3 */

	addq		$16, %rdi			/* dest += one vertex */

	decl		%ecx
	prefetcht1	32(%rdx)			/* 32 bytes past the next src vertex; flags untouched */
	jnz		p4_perspective_loop

p4_perspective_done:
	femms
	ret

/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * Arguments:  rdi = dest vector, rsi = matrix, rdx = source vector.
 *
 * With the same mRC / (x0..x3) notation, every dest vertex is
 *     r0 = x0*m00 + x3*m30
 *     r1 = x1*m11 + x3*m31
 *     r2 = x2
 *     r3 = x3
 * The dest count is copied from the source, the dest size is set to 4 and
 * VEC_SIZE_4 is OR-ed into the dest flags before the count is tested.
 * femms is executed on every return path.
 * Registers written: rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6, flags.
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
.hidden _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:

	movl		V4F_COUNT(%rdx), %ecx		/* ecx = number of vertices */
	movzbl		V4F_STRIDE(%rdx), %eax		/* rax = src stride (low byte of the field) */

	movl		%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl		$4, V4F_SIZE(%rdi)		/* dest->size  = 4 */
	orl		$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl		%ecx, %ecx
	.byte		0x90				/* 1-byte padding NOP, flags untouched */
	jz		p4_2d_no_rot_done		/* nothing to transform */

	movq		V4F_START(%rdx), %rdx		/* rdx -> first src vertex */
	movq		V4F_START(%rdi), %rdi		/* rdi -> first dest vertex */

	movd		(%rsi), %mm0			/* mm0 =   0 | m00 */
	prefetcht1	(%rdx)
	punpckldq	20(%rsi), %mm0			/* mm0 = m11 | m00 */

	movq		48(%rsi), %mm1			/* mm1 = m31 | m30 */

p4_2d_no_rot_loop:

	prefetcht1	32(%rdi)			/* two dest vertices ahead */

	movq		(%rdx), %mm4			/* mm4 = x1 | x0 */
	movq		8(%rdx), %mm5			/* mm5 = x3 | x2 */

	pfmul		%mm0, %mm4			/* mm4 = x1*m11 | x0*m00 */
	movq		%mm5, %mm6			/* mm6 = x3 | x2 */

	punpckhdq	%mm6, %mm6			/* mm6 = x3 | x3 */

	addq		%rax, %rdx			/* src += stride */
	pfmul		%mm1, %mm6			/* mm6 = x3*m31 | x3*m30 */

	prefetcht1	32(%rdx)			/* 32 bytes past the next src vertex */
	pfadd		%mm4, %mm6			/* mm6 = x1*m11+x3*m31 | x0*m00+x3*m30 */

	movq		%mm6, (%rdi)			/* store r0, r1 */
	movq		%mm5, 8(%rdi)			/* store r2, r3 (copied through) */

	addq		$16, %rdi			/* dest += one vertex */

	decl		%ecx
	jnz		p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms
	ret


.align 16
.globl _mesa_3dnow_transform_points4_2d
.hidden _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:
/*
 * Body of _mesa_3dnow_transform_points4_2d (3DNow!).
 *
 * Register roles on entry, as used by the code below:
 *   %rdi  destination vector descriptor (count/size/flags are written,
 *         V4F_START is read to find the output array)
 *   %rsi  matrix of 32-bit floats; only the elements at byte offsets
 *         0 (m00), 4 (m01), 16 (m10), 20 (m11) and 48..55 (m30, m31)
 *         are read
 *   %rdx  source vector descriptor (count, stride and V4F_START are read)
 *
 * For every source vertex (x0, x1, x2, x3) the loop stores
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 * into consecutive 16-byte destination slots.
 *
 * Clobbers: %rax, %rcx, %rdx, %rdi, %mm0-%mm6, flags.
 * femms is executed on every exit path, including count == 0.
 */

	movl V4F_COUNT(%rdx), %ecx	/* count */

	/*
	 * Stride in bytes (it is added to the source pointer below).
	 * Load all 32 bits: a byte-wide movzbl would silently truncate any
	 * stride >= 256.  Writing %eax also zeroes the upper half of %rax.
	 * NOTE(review): assumes the field behind V4F_STRIDE is 32 bits wide
	 * (matypes.h is not visible from here) -- confirm.
	 */
	movl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4 (movl above is 1 byte shorter than movzbl) */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	test %ecx, %ecx
	.byte 0x66, 0x66, 0x90		/* manual align += 3 (nop, flags untouched) */
	jz p4_2d_done			/* nothing to transform */

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	movd 4(%rsi), %mm1		/*                 | m01             */

	prefetcht1 (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11             | m01             */

	movq 48(%rsi), %mm2		/* m31             | m30             */

p4_2d_loop:

	prefetcht1 32(%rdi)		/* dest + 32 bytes = 2 output vertices ahead */

	movq (%rdx), %mm3		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	movq %mm3, %mm4			/* x1              | x0              */
	movq %mm5, %mm6			/* x3              | x2              */

	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
	punpckhdq %mm6, %mm6		/* x3              | x3              */

	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */

	addq %rax, %rdx			/* advance src by stride */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */

	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
	prefetcht1 32(%rdx)		/* next src + 32 bytes (2 vertices ahead only if stride is 16) */

	pfadd %mm6, %mm3		/* r1              | r0              */

	movq %mm3, (%rdi)		/* write r0, r1 */
	movq %mm5, 8(%rdi)		/* write r2, r3 (x2, x3 copied unchanged) */

	addq $16, %rdi			/* dest vertices are packed, 16 bytes each */

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms				/* leave MMX/3DNow! state clean for the caller */
	ret

#endif

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif