;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros used below (sym, arg, SHADOW_ARGS_TO_STACK, GET_GOT,
; GLOBAL, SECTION_RODATA, ...) come from this include.
%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Forward 4x4 transform of one block of 16-bit samples, done as two 1-D
; passes with a transpose in between.
;
; In:    arg(0) input  - first row of the block; each row is 4 shorts
;                        (one 8-byte movq load).
;        arg(1) output - receives 16 shorts (32 bytes). Written with
;                        movdqa, so it must be 16-byte aligned.
;        arg(2) pitch  - distance between consecutive input rows, in
;                        BYTES (it is added unscaled to the row pointer).
; Out:   output[0..15]
; Uses:  rax, rsi, rdi, xmm0-xmm5 (rsi/rdi are saved and restored here).
;
; Lane comments list 16-bit elements from the most significant word
; (left) to the least significant word (right).
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2)
sym(vp8_short_fdct4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
;;    SAVE_XMM                                  ; left disabled: only xmm0-xmm5
                                                ; are touched in this routine
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; rsi = input
    movsxd      rax, DWORD PTR arg(2)           ; rax = pitch (sign-extended)
    lea         rdi, [rsi + rax*2]              ; rdi = input + 2*pitch (row 2)

    ; Load the four input rows, 4 shorts each.
    movq        xmm0, MMWORD PTR[rsi   ]        ;03 02 01 00
    movq        xmm2, MMWORD PTR[rsi + rax]     ;13 12 11 10
    movq        xmm1, MMWORD PTR[rsi + rax*2]   ;23 22 21 20
    movq        xmm3, MMWORD PTR[rdi + rax]     ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    mov         rdi, arg(1)                     ; rdi = output from here on

    ; Rearrange so that one register holds columns {1,0} of every row and
    ; the other holds columns {2,3}; a single add/sub then yields the
    ; a1/b1 and c1/d1 pairs for all four rows.
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03

    ; ---- first pass (per row) ----
    ;   a1 = ip[0] + ip[3]    b1 = ip[1] + ip[2]
    ;   d1 = ip[0] - ip[3]    c1 = ip[1] - ip[2]      (all scaled by 8)
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    ; Narrow the 32-bit results back to 16 bits (signed saturation).
    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; xmm0: 23 22 21 20 03 02 01 00
    ;
    ; xmm3: 33 32 31 30 13 12 11 10
    ;
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    ; Transpose the first-pass result for the second pass.
    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    ; ---- second pass (per column) ----
    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]   ; rounding term for >>4
    pshufd      xmm2, xmm2, 04eh                ; swap 64-bit halves:
                                                ;23 22 21 20 33 32 31 30
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    ; Interleave so each dword holds one (b1,a1) / (c1,d1) pair for pmaddwd.
    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;0xffff in lanes where d1 == 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
                                                     ;and keep bit 0 of lower
                                                     ;=> low 4 words = (d1 != 0)

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    ; Aligned stores: output must be 16-byte aligned.
    movdqa      XMMWORD PTR[rdi + 0], xmm0      ; output[0..7]
    movdqa      XMMWORD PTR[rdi + 16], xmm1     ; output[8..15]

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
;;    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Constant tables for vp8_short_fdct4x4_sse2. Each table is 16 bytes and
; 16-byte aligned because it is used as a full XMMWORD memory operand.
SECTION_RODATA

; pmaddwd multiplier pairs (low word, high word of each dword lane).
align 16
_5352_2217:                     ; lane = lo*5352 + hi*2217
    times 4 dw 5352, 2217
align 16
_2217_neg5352:                  ; lane = lo*2217 - hi*5352
    times 4 dw 2217, -5352
align 16
_mult_add:                      ; lane = lo + hi
    times 8 dw 1
align 16
_cmp_mask:                      ; 1 in the four low words, 0 in the four high
    times 4 dw 1
    times 4 dw 0

align 16
_mult_sub:                      ; lane = lo - hi
    times 4 dw 1, -1

; 32-bit rounding terms added before the arithmetic right shifts.
align 16
_7:                             ; second pass, before >>4
    times 4 dd 7
align 16
_14500:                         ; first pass, before >>12
    times 4 dd 14500
align 16
_7500:                          ; first pass, before >>12
    times 4 dd 7500
align 16
_12000:                         ; second pass, before >>16
    times 4 dd 12000
align 16
_51000:                         ; second pass, before >>16
    times 4 dd 51000