;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;------------------------------------------------------------------------------
; void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 Walsh transform on 16-bit coefficients.
;
;   input   four rows of four shorts; consecutive rows are `pitch` BYTES
;           apart (pitch is added unscaled to the row pointer).
;   output  16 shorts written with two movdqa stores, so the buffer must be
;           16-byte aligned.
;   pitch   32-bit signed value, sign-extended before use.
;
; Pass 1 - within each row (ip = that row), 16-bit wrapping arithmetic:
;     a1 = (ip[0] + ip[2]) << 2        b1 = (ip[0] - ip[2]) << 2
;     d1 = (ip[1] + ip[3]) << 2        c1 = (ip[1] - ip[3]) << 2
;     t[0] = a1 + d1 + (a1 != 0)       t[1] = b1 + c1
;     t[2] = b1 - c1                   t[3] = a1 - d1
;
; Pass 2 - down each column of t (t index = 4*row + column, shown for
; column 0), 32-bit arithmetic:
;     a1 = t[0] + t[8]                 b1 = t[0] - t[8]
;     d1 = t[4] + t[12]                c1 = t[4] - t[12]
;     a2 = a1 + d1    b2 = b1 + c1    c2 = b1 - c1    d2 = a1 - d1
;     x2 = (x2 + (x2 < 0) + 3) >> 3    for each of a2, b2, c2, d2
;   The results are packed back to shorts with signed saturation and stored
;   as output[0..3] = a2, output[4..7] = b2, output[8..11] = c2,
;   output[12..15] = d2 (one value per column).
;
; Lane comments below list elements from most- to least-significant.
; Registers written: xmm0-xmm7, rsi, rdi, rdx (rbx is handed to GET_GOT).
; sym(), arg(), GLOBAL() and the prolog/epilog macros are not defined in this
; file; presumably they come from vpx_ports/x86_abi_support.asm.
;------------------------------------------------------------------------------
global sym(vp8_short_walsh4x4_sse2) PRIVATE
sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov     rsi, arg(0)           ; input
    mov     rdi, arg(1)           ; output
    movsxd  rdx, dword ptr arg(2) ; pitch (byte stride between input rows)

    ; first for loop
    movq    xmm0, MMWORD PTR [rsi]           ; row 0 (4 shorts)
    movq    xmm1, MMWORD PTR [rsi + rdx]     ; row 1
    lea     rsi,  [rsi + rdx*2]              ; advance to row 2
    movq    xmm2, MMWORD PTR [rsi]           ; row 2
    movq    xmm3, MMWORD PTR [rsi + rdx]     ; row 3

    ; Transpose so that each 64-bit half holds one input column
    ; (word lane index == row index).
    punpcklwd xmm0,  xmm1          ; rows 0/1 interleaved
    punpcklwd xmm2,  xmm3          ; rows 2/3 interleaved

    movdqa    xmm1, xmm0
    punpckldq xmm0, xmm2           ; ip[1] ip[0]
    punpckhdq xmm1, xmm2           ; ip[3] ip[2]

    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1           ; ip[1]+ip[3]  ip[0]+ip[2]
    psubw     xmm2, xmm1           ; ip[1]-ip[3]  ip[0]-ip[2]

    psllw     xmm0, 2              ; d1  a1
    psllw     xmm2, 2              ; c1  b1

    movdqa    xmm1, xmm0
    punpcklqdq xmm0, xmm2          ; b1  a1
    punpckhqdq xmm1, xmm2          ; c1  d1

    ; Build the (a1 != 0) correction, applied to the low four words only.
    ; movq copies a1 and clears the upper 64 bits of xmm6, so xmm6 needs no
    ; separate zeroing beforehand.
    movq      xmm6, xmm0           ;  0  a1
    pxor      xmm7, xmm7
    pcmpeqw   xmm7, xmm6           ; -1 where word == 0 (upper half: always -1)
    paddw     xmm7, [GLOBAL(c1)]   ;  0  (a1 != 0)

    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1           ; b1+c1  a1+d1
    psubw     xmm2, xmm1           ; b1-c1  a1-d1
    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)

    ; second for loop
    ; input: 13  9  5  1 12  8  4  0 (xmm0)
    ;        14 10  6  2 15 11  7  3 (xmm2)
    ; after shuffle:
    ;        13  5  9  1 12  4  8  0 (xmm0)
    ;        14  6 10  2 15  7 11  3 (xmm1)
    ; 0xd8 selects source elements 0,2,1,3, i.e. swaps the two middle words
    ; of each half so that the operands of each pmaddwd pair are adjacent.
    pshuflw   xmm3, xmm0, 0xd8
    pshufhw   xmm0, xmm3, 0xd8
    pshuflw   xmm3, xmm2, 0xd8
    pshufhw   xmm1, xmm3, 0xd8

    ; pmaddwd with (1, 1) gives the sum of each word pair, with (1, -1) the
    ; difference (low word minus high word); both widen to 32 bits.
    movdqa    xmm2, xmm0
    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
    movdqa    xmm3, xmm1
    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13

    ; Group equal terms together. 0xd8 picks dwords 0,2,1,3; 0x72 picks
    ; dwords 2,0,3,1, which also puts column 2 below column 3.
    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12

    movdqa    xmm0, xmm4
    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
    movdqa    xmm1, xmm6
    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12

    movdqa    xmm2, xmm0
    paddd     xmm0, xmm4            ; b21 b20 a21 a20
    psubd     xmm2, xmm4            ; c21 c20 d21 d20
    movdqa    xmm3, xmm1
    paddd     xmm1, xmm6            ; b23 b22 a23 a22
    psubd     xmm3, xmm6            ; c23 c22 d23 d22

    ; x += (x < 0): compare 0 > x to get an all-ones mask for negative
    ; lanes, then reduce that mask to the value 1.
    pxor      xmm4, xmm4
    movdqa    xmm5, xmm4
    pcmpgtd   xmm4, xmm0
    pcmpgtd   xmm5, xmm2
    pand      xmm4, [GLOBAL(cd1)]
    pand      xmm5, [GLOBAL(cd1)]

    pxor      xmm6, xmm6
    movdqa    xmm7, xmm6
    pcmpgtd   xmm6, xmm1
    pcmpgtd   xmm7, xmm3
    pand      xmm6, [GLOBAL(cd1)]
    pand      xmm7, [GLOBAL(cd1)]

    ; x = x + (x < 0) + 3
    paddd     xmm0, xmm4
    paddd     xmm2, xmm5
    paddd     xmm0, [GLOBAL(cd3)]
    paddd     xmm2, [GLOBAL(cd3)]
    paddd     xmm1, xmm6
    paddd     xmm3, xmm7
    paddd     xmm1, [GLOBAL(cd3)]
    paddd     xmm3, [GLOBAL(cd3)]

    ; x >>= 3 (arithmetic)
    psrad     xmm0, 3
    psrad     xmm1, 3
    psrad     xmm2, 3
    psrad     xmm3, 3

    ; Collect the four columns of each term into one register.
    movdqa    xmm4, xmm0
    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
    movdqa    xmm5, xmm2
    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20

    ; Narrow to 16 bits with signed saturation.
    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20

    ; Aligned stores: output must be 16-byte aligned.
    movdqa  XMMWORD PTR [rdi], xmm0
    movdqa  XMMWORD PTR [rdi + 16], xmm2

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Read-only constant vectors. Each table is 16 bytes and 16-byte aligned
; because it is used directly as an SSE2 memory operand.
SECTION_RODATA
align 16
c1:                             ; eight words of +1: paddw increment and
                                ; pmaddwd multiplier that sums each word pair
    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
align 16
cn1:                            ; words (+1, -1) repeated: pmaddwd multiplier
                                ; giving low word minus high word of each pair
    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
align 16
cd1:                            ; four dwords of 1: reduces a compare mask to
                                ; the value 1
    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
align 16
cd3:                            ; four dwords of 3: bias added before the
                                ; arithmetic shift right by 3
    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
165