;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 transform of 16-bit samples using SSE2.
;
;   input  - first of four rows; each row supplies four 16-bit words (8 bytes
;            are loaded per row).
;   output - receives 16 words (32 bytes). Both stores use movdqa, so this
;            pointer must be 16-byte aligned.
;   pitch  - distance between consecutive input rows. It is added to the byte
;            address unscaled, i.e. it is a byte count, and is sign-extended
;            from 32 bits.
;
; First pass (16-bit wrapping arithmetic, one word lane per input row):
;     a1 = (ip[0] + ip[2]) << 2        d1 = (ip[1] + ip[3]) << 2
;     b1 = (ip[0] - ip[2]) << 2        c1 = (ip[1] - ip[3]) << 2
;     results: a1 + d1 + (a1 != 0),  b1 + c1,  b1 - c1,  a1 - d1
;
; Second pass (32-bit arithmetic): pmaddwd against the +1/+1 and +1/-1
; constants forms sums and differences of adjacent word pairs, which are then
; combined with paddd/psubd. Every 32-bit result x is reduced as
;     (x + (x < 0) + 3) >> 3          (arithmetic shift)
; and packed back to 16 bits with signed saturation before being stored.
;
; xmm0-xmm7 are used as scratch. SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and
; their counterparts in the epilog are macros from the included
; x86_abi_support.asm and are not visible here; presumably they handle the
; ABI-specific argument, XMM and GOT bookkeeping - confirm against that file.
global sym(vp8_short_walsh4x4_sse2) PRIVATE
sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; input
    mov         rdi, arg(1)             ; output
    movsxd      rdx, dword ptr arg(2)   ; pitch

    ; first for loop
    movq        xmm0, MMWORD PTR [rsi]  ; load input
    movq        xmm1, MMWORD PTR [rsi + rdx]
    lea         rsi,  [rsi + rdx*2]
    movq        xmm2, MMWORD PTR [rsi]
    movq        xmm3, MMWORD PTR [rsi + rdx]

    ; Interleave the four rows so that each 64-bit half holds one column
    ; position (ip[k]) for all four rows.
    punpcklwd   xmm0, xmm1
    punpcklwd   xmm2, xmm3

    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2              ; ip[1] ip[0]
    punpckhdq   xmm1, xmm2              ; ip[3] ip[2]

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm2, xmm1

    psllw       xmm0, 2                 ; d1  a1
    psllw       xmm2, 2                 ; c1  b1

    movdqa      xmm1, xmm0
    punpcklqdq  xmm0, xmm2              ; b1  a1
    punpckhqdq  xmm1, xmm2              ; c1  d1

    ; Build (a1 != 0) per word: pcmpeqw gives 0xffff (-1) where a1 == 0, and
    ; adding the all-ones-words constant turns that into 0, leaving 1 for
    ; non-zero a1. The high four words of xmm6 are zero (movq zero-extends),
    ; so they compare equal and contribute 0 to the b1+c1 half.
    pxor        xmm6, xmm6
    movq        xmm6, xmm0
    pxor        xmm7, xmm7
    pcmpeqw     xmm7, xmm6
    paddw       xmm7, [GLOBAL(c1)]

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1              ; b1+c1  a1+d1
    psubw       xmm2, xmm1              ; b1-c1  a1-d1
    paddw       xmm0, xmm7              ; b1+c1  a1+d1+(a1!=0)

    ; second for loop
    ; input: 13  9  5  1 12  8  4  0 (xmm0)
    ;        14 10  6  2 15 11  7  3 (xmm2)
    ; after shuffle:
    ;        13  5  9  1 12  4  8  0 (xmm0)
    ;        14  6 10  2 15  7 11  3 (xmm1)
    pshuflw     xmm3, xmm0, 0xd8
    pshufhw     xmm0, xmm3, 0xd8
    pshuflw     xmm3, xmm2, 0xd8
    pshufhw     xmm1, xmm3, 0xd8

    ; pmaddwd with c1 sums each adjacent word pair into a dword; with cn1 it
    ; yields (low word - high word) of each pair.
    movdqa      xmm2, xmm0
    pmaddwd     xmm0, [GLOBAL(c1)]      ; d11 a11 d10 a10
    pmaddwd     xmm2, [GLOBAL(cn1)]     ; c11 b11 c10 b10
    movdqa      xmm3, xmm1
    pmaddwd     xmm1, [GLOBAL(c1)]      ; d12 a12 d13 a13
    pmaddwd     xmm3, [GLOBAL(cn1)]     ; c12 b12 c13 b13

    pshufd      xmm4, xmm0, 0xd8        ; d11 d10 a11 a10
    pshufd      xmm5, xmm2, 0xd8        ; c11 c10 b11 b10
    pshufd      xmm6, xmm1, 0x72        ; d13 d12 a13 a12
    pshufd      xmm7, xmm3, 0x72        ; c13 c12 b13 b12

    movdqa      xmm0, xmm4
    punpcklqdq  xmm0, xmm5              ; b11 b10 a11 a10
    punpckhqdq  xmm4, xmm5              ; c11 c10 d11 d10
    movdqa      xmm1, xmm6
    punpcklqdq  xmm1, xmm7              ; b13 b12 a13 a12
    punpckhqdq  xmm6, xmm7              ; c13 c12 d13 d12

    movdqa      xmm2, xmm0
    paddd       xmm0, xmm4              ; b21 b20 a21 a20
    psubd       xmm2, xmm4              ; c21 c20 d21 d20
    movdqa      xmm3, xmm1
    paddd       xmm1, xmm6              ; b23 b22 a23 a22
    psubd       xmm3, xmm6              ; c23 c22 d23 d22

    ; x += (x < 0): 0 > x produces an all-ones mask for negative lanes, which
    ; is reduced to the value 1 by ANDing with cd1.
    pxor        xmm4, xmm4
    movdqa      xmm5, xmm4
    pcmpgtd     xmm4, xmm0
    pcmpgtd     xmm5, xmm2
    pand        xmm4, [GLOBAL(cd1)]
    pand        xmm5, [GLOBAL(cd1)]

    pxor        xmm6, xmm6
    movdqa      xmm7, xmm6
    pcmpgtd     xmm6, xmm1
    pcmpgtd     xmm7, xmm3
    pand        xmm6, [GLOBAL(cd1)]
    pand        xmm7, [GLOBAL(cd1)]

    ; Add the sign correction and the bias of 3, then shift right by 3.
    paddd       xmm0, xmm4
    paddd       xmm2, xmm5
    paddd       xmm0, [GLOBAL(cd3)]
    paddd       xmm2, [GLOBAL(cd3)]
    paddd       xmm1, xmm6
    paddd       xmm3, xmm7
    paddd       xmm1, [GLOBAL(cd3)]
    paddd       xmm3, [GLOBAL(cd3)]

    psrad       xmm0, 3
    psrad       xmm1, 3
    psrad       xmm2, 3
    psrad       xmm3, 3
    movdqa      xmm4, xmm0
    punpcklqdq  xmm0, xmm1              ; a23 a22 a21 a20
    punpckhqdq  xmm4, xmm1              ; b23 b22 b21 b20
    movdqa      xmm5, xmm2
    punpckhqdq  xmm2, xmm3              ; c23 c22 c21 c20
    punpcklqdq  xmm5, xmm3              ; d23 d22 d21 d20

    ; Narrow to 16 bits with signed saturation.
    packssdw    xmm0, xmm4              ; b23 b22 b21 b20 a23 a22 a21 a20
    packssdw    xmm2, xmm5              ; d23 d22 d21 d20 c23 c22 c21 c20

    ; Aligned stores: output must be 16-byte aligned.
    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm2

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Read-only constants referenced through GLOBAL() by vp8_short_walsh4x4_sse2.
; Each table is 16 bytes wide and 16-byte aligned because it is used directly
; as a 128-bit memory operand.
SECTION_RODATA
align 16
c1:
    ; Eight words of +1: pmaddwd multiplier that sums adjacent word pairs, and
    ; the paddw addend that turns the pcmpeqw mask into (a1 != 0).
    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
align 16
cn1:
    ; Alternating +1 / -1 words (0xffff is -1 as a signed 16-bit value):
    ; pmaddwd multiplier giving (low word - high word) of each pair.
    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
align 16
cd1:
    ; Four dwords of 1: ANDed with the pcmpgtd sign mask to get (x < 0).
    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
align 16
cd3:
    ; Four dwords of 3: bias added before the arithmetic shift right by 3.
    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003