;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;-----------------------------------------------------------------------
;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 transform on 16-bit samples, built from add/subtract
; butterflies.
;
; In:    arg(0) = input   4 rows of 4 words; rows are `pitch` BYTES apart
;                         (pitch is added directly to the byte pointer)
;        arg(1) = output  16 words (32 bytes); written with movdqa, so it
;                         must be 16-byte aligned
;        arg(2) = pitch   32-bit signed, sign-extended before use
; Out:   output[0..3]   = "a" results, output[4..7]   = "b" results,
;        output[8..11]  = "c" results, output[12..15] = "d" results
; Regs:  rsi = input, rdi = output, rdx = pitch, xmm0-xmm7 = scratch
;
; Pass 1 works in wrapping 16-bit arithmetic (paddw/psubw/psllw).
; Pass 2 widens to 32 bits via pmaddwd, rounds each value as
; (x + (x < 0) + 3) >> 3, and narrows back with signed saturation.
;
; NOTE(review): the prologue/epilogue macros (SHADOW_ARGS_TO_STACK,
; SAVE_XMM, GET_GOT, arg(), GLOBAL(), ...) come from x86_abi_support.asm;
; they presumably hide the differences between the supported ABIs and
; PIC modes -- confirm against that file before changing their order.
;-----------------------------------------------------------------------
global sym(vp8_short_walsh4x4_sse2) PRIVATE
sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov     rsi, arg(0)           ; input
    mov     rdi, arg(1)           ; output
    movsxd  rdx, dword ptr arg(2) ; pitch

    ; first for loop
    ; Load one 4-word row into the low half of each of xmm0..xmm3.
    movq    xmm0, MMWORD PTR [rsi]           ; load input
    movq    xmm1, MMWORD PTR [rsi + rdx]
    lea     rsi,  [rsi + rdx*2]
    movq    xmm2, MMWORD PTR [rsi]
    movq    xmm3, MMWORD PTR [rsi + rdx]

    ; Transpose: interleave rows so that each 64-bit half holds one input
    ; column (the same ip[k] taken from all four rows).
    punpcklwd xmm0,  xmm1
    punpcklwd xmm2,  xmm3

    movdqa    xmm1, xmm0
    punpckldq xmm0, xmm2           ; ip[1] ip[0]
    punpckhdq xmm1, xmm2           ; ip[3] ip[2]

    ; Butterfly, then scale by 4:
    ;   a1 = (ip[0] + ip[2]) << 2    d1 = (ip[1] + ip[3]) << 2
    ;   b1 = (ip[0] - ip[2]) << 2    c1 = (ip[1] - ip[3]) << 2
    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1
    psubw     xmm2, xmm1

    psllw     xmm0, 2              ; d1  a1
    psllw     xmm2, 2              ; c1  b1

    movdqa    xmm1, xmm0
    punpcklqdq xmm0, xmm2          ; b1  a1
    punpckhqdq xmm1, xmm2          ; c1  d1

    ; Build the (a1 != 0) correction term: movq copies a1 into the low half
    ; of xmm6 and leaves the high half zero; pcmpeqw against zero gives -1
    ; where the word is 0, and adding 1 turns that into 0 / 1.  The high
    ; half is all zero words, so it always ends up as 0 (no correction for
    ; the b1+c1 lanes).
    pxor      xmm6, xmm6
    movq      xmm6, xmm0
    pxor      xmm7, xmm7
    pcmpeqw   xmm7, xmm6
    paddw     xmm7, [GLOBAL(c1)]

    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1           ; b1+c1  a1+d1
    psubw     xmm2, xmm1           ; b1-c1  a1-d1
    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)

    ; second for loop
    ; input: 13  9  5  1 12  8  4  0 (xmm0)
    ;        14 10  6  2 15 11  7  3 (xmm2)
    ; after shuffle:
    ;        13  5  9  1 12  4  8  0 (xmm0)
    ;        14  6 10  2 15  7 11  3 (xmm1)
    ; (numbers are first-pass output indices, listed high word -> low word)
    ; The shuffle makes the operands of each butterfly adjacent so that one
    ; pmaddwd per register produces the sums / differences as 32-bit values.
    pshuflw   xmm3, xmm0, 0xd8
    pshufhw   xmm0, xmm3, 0xd8
    pshuflw   xmm3, xmm2, 0xd8
    pshufhw   xmm1, xmm3, 0xd8

    ; c1  = {+1,+1,...} -> pairwise sums        (a1, d1)
    ; cn1 = {+1,-1,...} -> pairwise differences (b1, c1)
    movdqa    xmm2, xmm0
    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
    movdqa    xmm3, xmm1
    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13

    ; Group like terms; 0x72 additionally swaps the column order of the
    ; second register pair so both pairs end up in ascending column order.
    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12

    movdqa    xmm0, xmm4
    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
    movdqa    xmm1, xmm6
    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12

    ; Second butterfly: a2 = a1 + d1, b2 = b1 + c1, c2 = b1 - c1, d2 = a1 - d1
    movdqa    xmm2, xmm0
    paddd     xmm0, xmm4            ; b21 b20 a21 a20
    psubd     xmm2, xmm4            ; c21 c20 d21 d20
    movdqa    xmm3, xmm1
    paddd     xmm1, xmm6            ; b23 b22 a23 a22
    psubd     xmm3, xmm6            ; c23 c22 d23 d22

    ; xmm4..xmm7 = (x < 0) ? 1 : 0 for each dword of xmm0, xmm2, xmm1, xmm3
    ; (pcmpgtd computes 0 > x as an all-ones mask; pand keeps bit 0 only).
    pxor      xmm4, xmm4
    movdqa    xmm5, xmm4
    pcmpgtd   xmm4, xmm0
    pcmpgtd   xmm5, xmm2
    pand      xmm4, [GLOBAL(cd1)]
    pand      xmm5, [GLOBAL(cd1)]

    pxor      xmm6, xmm6
    movdqa    xmm7, xmm6
    pcmpgtd   xmm6, xmm1
    pcmpgtd   xmm7, xmm3
    pand      xmm6, [GLOBAL(cd1)]
    pand      xmm7, [GLOBAL(cd1)]

    ; x = (x + (x < 0) + 3) >> 3   (arithmetic shift)
    paddd     xmm0, xmm4
    paddd     xmm2, xmm5
    paddd     xmm0, [GLOBAL(cd3)]
    paddd     xmm2, [GLOBAL(cd3)]
    paddd     xmm1, xmm6
    paddd     xmm3, xmm7
    paddd     xmm1, [GLOBAL(cd3)]
    paddd     xmm3, [GLOBAL(cd3)]

    psrad     xmm0, 3
    psrad     xmm1, 3
    psrad     xmm2, 3
    psrad     xmm3, 3

    ; Gather the four a/b/c/d results into one register each, then narrow
    ; to 16 bits with signed saturation.
    movdqa    xmm4, xmm0
    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
    movdqa    xmm5, xmm2
    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20

    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20

    ; Aligned stores: output must be 16-byte aligned.
    movdqa  XMMWORD PTR [rdi], xmm0
    movdqa  XMMWORD PTR [rdi + 16], xmm2

    ; begin epilog
    ; Undo the prologue in reverse order.
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Constants are used directly as SSE memory operands, so each one must stay
; 16-byte aligned.
SECTION_RODATA
align 16
c1:                                 ; eight words of +1
    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
align 16
cn1:                                ; alternating words +1, -1
    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
align 16
cd1:                                ; four dwords of 1
    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
align 16
cd3:                                ; four dwords of 3
    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
167