;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 butterfly transform on 16-bit samples.
;
;   input  - four rows of four 16-bit values; row r is read with an 8-byte
;            load from (char *)input + r * pitch.
;   output - receives 16 16-bit results (32 bytes). They are written with two
;            movdqa stores, so the pointer must be 16-byte aligned.
;   pitch  - distance between consecutive input rows, used here as a byte
;            offset (it is added to the pointer unscaled).
;
; Pass 1 works on 16-bit lanes: sums/differences scaled by 4, plus a +1 bias
; on the a1+d1 term whenever a1 is non-zero.
; Pass 2 widens to 32-bit lanes via pmaddwd, then for each result x stores
; (x + (x < 0) + 3) >> 3, packed back to 16 bits with signed saturation.
;
; Uses xmm0-xmm7 (xmm6/xmm7 are preserved through SAVE_XMM/RESTORE_XMM),
; rsi, rdi, rdx, and rbx via GET_GOT.
global sym(vp8_short_walsh4x4_sse2) PRIVATE
sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output
    movsxd      rdx, dword ptr arg(2)       ; pitch

    ; first for loop
    movq        xmm0, MMWORD PTR [rsi]      ; load input
    movq        xmm1, MMWORD PTR [rsi + rdx]
    lea         rsi,  [rsi + rdx*2]
    movq        xmm2, MMWORD PTR [rsi]
    movq        xmm3, MMWORD PTR [rsi + rdx]

    ; interleave the four rows so each 64-bit half holds one column
    punpcklwd   xmm0, xmm1
    punpcklwd   xmm2, xmm3

    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                  ; ip[1] ip[0]
    punpckhdq   xmm1, xmm2                  ; ip[3] ip[2]

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1
    psubw       xmm2, xmm1

    psllw       xmm0, 2                     ; d1 a1
    psllw       xmm2, 2                     ; c1 b1

    movdqa      xmm1, xmm0
    punpcklqdq  xmm0, xmm2                  ; b1 a1
    punpckhqdq  xmm1, xmm2                  ; c1 d1

    ; build the (a1 != 0) bias: pcmpeqw gives -1 where a lane is zero,
    ; adding the all-ones-words constant c1 turns that into 0, and a
    ; non-zero a1 lane (compare result 0) becomes 1. The upper half of
    ; xmm6 is zero, so the b1 lanes get no bias.
    pxor        xmm6, xmm6
    movq        xmm6, xmm0
    pxor        xmm7, xmm7
    pcmpeqw     xmm7, xmm6
    paddw       xmm7, [GLOBAL(c1)]

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1                  ; b1+c1 a1+d1
    psubw       xmm2, xmm1                  ; b1-c1 a1-d1
    paddw       xmm0, xmm7                  ; b1+c1 a1+d1+(a1!=0)

    ; second for loop
    ; input: 13  9  5  1 12  8  4  0 (xmm0)
    ;        14 10  6  2 15 11  7  3 (xmm2)
    ; after shuffle:
    ;        13  5  9  1 12  4  8  0 (xmm0)
    ;        14  6 10  2 15  7 11  3 (xmm1)
    pshuflw     xmm3, xmm0, 0xd8
    pshufhw     xmm0, xmm3, 0xd8
    pshuflw     xmm3, xmm2, 0xd8
    pshufhw     xmm1, xmm3, 0xd8

    ; pmaddwd with (+1,+1) sums adjacent word pairs, with (+1,-1) it
    ; subtracts them; results are 32-bit lanes
    movdqa      xmm2, xmm0
    pmaddwd     xmm0, [GLOBAL(c1)]          ; d11 a11 d10 a10
    pmaddwd     xmm2, [GLOBAL(cn1)]         ; c11 b11 c10 b10
    movdqa      xmm3, xmm1
    pmaddwd     xmm1, [GLOBAL(c1)]          ; d12 a12 d13 a13
    pmaddwd     xmm3, [GLOBAL(cn1)]         ; c12 b12 c13 b13

    pshufd      xmm4, xmm0, 0xd8            ; d11 d10 a11 a10
    pshufd      xmm5, xmm2, 0xd8            ; c11 c10 b11 b10
    pshufd      xmm6, xmm1, 0x72            ; d13 d12 a13 a12
    pshufd      xmm7, xmm3, 0x72            ; c13 c12 b13 b12

    movdqa      xmm0, xmm4
    punpcklqdq  xmm0, xmm5                  ; b11 b10 a11 a10
    punpckhqdq  xmm4, xmm5                  ; c11 c10 d11 d10
    movdqa      xmm1, xmm6
    punpcklqdq  xmm1, xmm7                  ; b13 b12 a13 a12
    punpckhqdq  xmm6, xmm7                  ; c13 c12 d13 d12

    movdqa      xmm2, xmm0
    paddd       xmm0, xmm4                  ; b21 b20 a21 a20
    psubd       xmm2, xmm4                  ; c21 c20 d21 d20
    movdqa      xmm3, xmm1
    paddd       xmm1, xmm6                  ; b23 b22 a23 a22
    psubd       xmm3, xmm6                  ; c23 c22 d23 d22

    ; x += (x < 0): (0 > x) yields an all-ones mask, and-ed down to 1
    pxor        xmm4, xmm4
    movdqa      xmm5, xmm4
    pcmpgtd     xmm4, xmm0
    pcmpgtd     xmm5, xmm2
    pand        xmm4, [GLOBAL(cd1)]
    pand        xmm5, [GLOBAL(cd1)]

    pxor        xmm6, xmm6
    movdqa      xmm7, xmm6
    pcmpgtd     xmm6, xmm1
    pcmpgtd     xmm7, xmm3
    pand        xmm6, [GLOBAL(cd1)]
    pand        xmm7, [GLOBAL(cd1)]

    ; add the sign correction, then the rounding constant 3
    paddd       xmm0, xmm4
    paddd       xmm2, xmm5
    paddd       xmm0, [GLOBAL(cd3)]
    paddd       xmm2, [GLOBAL(cd3)]
    paddd       xmm1, xmm6
    paddd       xmm3, xmm7
    paddd       xmm1, [GLOBAL(cd3)]
    paddd       xmm3, [GLOBAL(cd3)]

    ; arithmetic shift: (x + 3) >> 3
    psrad       xmm0, 3
    psrad       xmm1, 3
    psrad       xmm2, 3
    psrad       xmm3, 3
    movdqa      xmm4, xmm0
    punpcklqdq  xmm0, xmm1                  ; a23 a22 a21 a20
    punpckhqdq  xmm4, xmm1                  ; b23 b22 b21 b20
    movdqa      xmm5, xmm2
    punpckhqdq  xmm2, xmm3                  ; c23 c22 c21 c20
    punpcklqdq  xmm5, xmm3                  ; d23 d22 d21 d20

    ; narrow 32-bit lanes to 16-bit with signed saturation
    packssdw    xmm0, xmm4                  ; b23 b22 b21 b20 a23 a22 a21 a20
    packssdw    xmm2, xmm5                  ; d23 d22 d21 d20 c23 c22 c21 c20

    ; aligned 16-byte stores: output[0..7], then output[8..15]
    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm2

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; eight words of +1: pmaddwd pair-sum multiplier and the (a1 != 0) bias base
align 16
c1:
    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
; alternating +1, -1 words: pmaddwd pair-difference multiplier
align 16
cn1:
    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
; four dwords of 1: mask that reduces a compare result to 0/1
align 16
cd1:
    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
; four dwords of 3: rounding term added before the >> 3
align 16
cd3:
    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003