;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
;
; Applies a two-pass 4x4 add/subtract butterfly to sixteen 16-bit values:
;   pass 1 (vertical), per column, with ip[] indexing the 4x4 input:
;       a1 = ip[0] + ip[12]     b1 = ip[4] + ip[8]
;       d1 = ip[0] - ip[12]     c1 = ip[4] - ip[8]
;       row0 = a1 + b1   row1 = c1 + d1   row2 = a1 - b1   row3 = d1 - c1
;   pass 2 (horizontal): the same butterfly across each row, done by
;       transposing, repeating the arithmetic, and transposing back.
;   Every result is then rounded as (x + 3) >> 3 (arithmetic shift).
; All arithmetic uses packed 16-bit adds/subtracts, which wrap on overflow.
;
; In:    arg(0) = input   16 shorts, read with movdqa  -> must be 16-byte aligned
;        arg(1) = output  16 shorts, written with movdqa -> must be 16-byte aligned
; Out:   nothing returned; output[0..15] written
; Regs:  rsi = input, rdi = output (both saved/restored here)
;        xmm0-xmm5 = working rows/columns, xmm7 = rounding constant (8 x 3)
;        eax is overwritten with the packed rounding constant
; Note:  sym(), arg(), SHADOW_ARGS_TO_STACK/UNSHADOW_ARGS and
;        SAVE_XMM/RESTORE_XMM come from x86_abi_support.asm; presumably they
;        hide the per-ABI argument access and XMM preservation (xmm7 is
;        modified below) - confirm against that file.
;
; Register comments below list the HIGH qword first, then the low qword,
; e.g. "b1 | a1" means bits 127..64 hold b1 and bits 63..0 hold a1.
;-----------------------------------------------------------------------
global sym(vp8_short_inv_walsh4x4_sse2)
sym(vp8_short_inv_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    SAVE_XMM
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; rsi = input
    mov         rdi, arg(1)             ; rdi = output

    ; Rounding bias: two packed words of 3, broadcast to all eight words.
    mov         eax, 0x00030003
    movd        xmm7, eax
    pshufd      xmm7, xmm7, 0           ; xmm7 = 3 3 3 3 3 3 3 3

    movdqa      xmm0, [rsi + 0]         ; ip[4..7]   | ip[0..3]
    movdqa      xmm1, [rsi + 16]        ; ip[12..15] | ip[8..11]

    ;~~~~~~~~~~~~~~ pass 1: vertical butterfly ~~~~~~~~~~~~~~
    pshufd      xmm2, xmm1, 4eh         ; ip[8..11]  | ip[12..15] (qwords swapped)
    movdqa      xmm3, xmm0              ; ip[4..7]   | ip[0..3]

    paddw       xmm0, xmm2              ; b1 | a1 = ip[4]+ip[8] | ip[0]+ip[12]
    psubw       xmm3, xmm2              ; c1 | d1 = ip[4]-ip[8] | ip[0]-ip[12]

    movdqa      xmm4, xmm0              ; b1 | a1
    punpcklqdq  xmm0, xmm3              ; d1 | a1
    punpckhqdq  xmm4, xmm3              ; c1 | b1

    movdqa      xmm1, xmm4              ; c1 | b1
    paddw       xmm4, xmm0              ; d1+c1 | a1+b1 = row 1 | row 0
    psubw       xmm0, xmm1              ; d1-c1 | a1-b1 = row 3 | row 2

    ;~~~~~~~~~~~~~~ transpose 4x4 words ~~~~~~~~~~~~~~
    ; Word layout, highest word first (digits are row,column):
    ;   xmm4 = 13 12 11 10 03 02 01 00
    ;   xmm0 = 33 32 31 30 23 22 21 20
    movdqa      xmm3, xmm4              ; 13 12 11 10 03 02 01 00
    punpcklwd   xmm4, xmm0              ; 23 03 22 02 21 01 20 00
    punpckhwd   xmm3, xmm0              ; 33 13 32 12 31 11 30 10
    movdqa      xmm1, xmm4              ; 23 03 22 02 21 01 20 00
    punpcklwd   xmm4, xmm3              ; 31 21 11 01 30 20 10 00 = col 1 | col 0
    punpckhwd   xmm1, xmm3              ; 33 23 13 03 32 22 12 02 = col 3 | col 2

    ;~~~~~~~~~~~~~~ pass 2: horizontal butterfly (on columns) ~~~~~~~~~~~~~~
    pshufd      xmm2, xmm1, 4eh         ; col 2 | col 3 (qwords swapped)
    movdqa      xmm3, xmm4              ; col 1 | col 0

    paddw       xmm4, xmm2              ; b1 | a1 = col1+col2 | col0+col3
    psubw       xmm3, xmm2              ; c1 | d1 = col1-col2 | col0-col3

    movdqa      xmm5, xmm4              ; b1 | a1
    punpcklqdq  xmm4, xmm3              ; d1 | a1
    punpckhqdq  xmm5, xmm3              ; c1 | b1

    movdqa      xmm1, xmm5              ; c1 | b1
    paddw       xmm5, xmm4              ; d1+c1 | a1+b1 = result col 1 | col 0
    psubw       xmm4, xmm1              ; d1-c1 | a1-b1 = result col 3 | col 2

    ;~~~~~~~~~~~~~~ transpose back to row order ~~~~~~~~~~~~~~
    ; Same interleave sequence as above, now with xmm5/xmm4 as the inputs.
    movdqa      xmm0, xmm5
    punpcklwd   xmm5, xmm4
    punpckhwd   xmm0, xmm4
    movdqa      xmm1, xmm5
    punpcklwd   xmm5, xmm0              ; row 1 | row 0
    punpckhwd   xmm1, xmm0              ; row 3 | row 2

    ;~~~~~~~~~~~~~~ round and store: (x + 3) >> 3 ~~~~~~~~~~~~~~
    paddw       xmm5, xmm7
    paddw       xmm1, xmm7

    psraw       xmm5, 3
    psraw       xmm1, 3

    movdqa      [rdi + 0], xmm5         ; output[0..7]
    movdqa      [rdi + 16], xmm1        ; output[8..15]

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Read-only constant tables.
; NOTE(review): none of these labels is referenced by
; vp8_short_inv_walsh4x4_sse2, which builds its rounding constant in a
; register. The names suggest IDCT multipliers and a rounding value of 4 -
; presumably carried over from another transform file; confirm no other
; code in this file uses them before removing.
; Each table holds four words (8 bytes) although it is aligned to 16, so a
; 16-byte load from any of these labels would read past the defined data.
SECTION_RODATA
align 16
x_s1sqr2:
    times 4 dw 0x8A8C
align 16
x_c1sqr2less1:
    times 4 dw 0x4E7B
align 16
fours:
    times 4 dw 0x0004
120