;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------------
; void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
;
; Inverse 4x4 Walsh-Hadamard transform on sixteen 16-bit coefficients.
;
;   input  : 16 contiguous shorts (4 rows of 4), fetched with four 8-byte
;            loads before anything is written.
;   output : 16 results; result k (k = 0..15, row-major) is stored as a
;            single short at byte offset 32*k, i.e. every 16th short.
;            NOTE(review): presumably the first coefficient slot of 16
;            consecutive 4x4 blocks -- confirm against the caller.
;
; Both passes apply the same butterfly to four elements x0..x3:
;       a = x0 + x3         b = x1 + x2
;       d = x0 - x3         c = x1 - x2
;       y0 = a + b    y1 = d + c    y2 = a - b    y3 = d - c
; Pass 1 combines the four rows (x0..x3 = rows 0..3), the intermediate is
; transposed, and pass 2 repeats the butterfly followed by (y + 3) >> 3
; with an arithmetic shift.  All sums are wrapping 16-bit (paddw/psubw).
;
; Writes rax, rcx, rdx and mm0-mm7.  No emms is executed in this routine.
;-----------------------------------------------------------------------------

; IWALSH4X4_MMX_STORE_PAIR  mmA, mmB, first
;   Stores the four words of mmA (low word first) to output slots
;   first+0, +4, +8, +12 and the four words of mmB to slots
;   first+1, +5, +9, +13.  Slot k is the short at [rdx + 32*k].
;   Destroys mmA, mmB, eax and ecx.
%macro IWALSH4X4_MMX_STORE_PAIR 3
    movd        eax, %1                     ; eax = words 1:0 of mmA
    movd        ecx, %2                     ; ecx = words 1:0 of mmB
    psrlq       %2, 32                      ; move words 3:2 down for the
    psrlq       %1, 32                      ;   second half of the stores
    mov         word ptr[rdx+32*(%3+0)], ax
    mov         word ptr[rdx+32*(%3+1)], cx
    shr         eax, 16                     ; expose word 1
    shr         ecx, 16
    mov         word ptr[rdx+32*(%3+4)], ax
    mov         word ptr[rdx+32*(%3+5)], cx
    movd        eax, %1                     ; eax = words 3:2 of mmA
    movd        ecx, %2                     ; ecx = words 3:2 of mmB
    mov         word ptr[rdx+32*(%3+8)], ax
    mov         word ptr[rdx+32*(%3+9)], cx
    shr         eax, 16                     ; expose word 3
    shr         ecx, 16
    mov         word ptr[rdx+32*(%3+12)], ax
    mov         word ptr[rdx+32*(%3+13)], cx
%endmacro

global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
sym(vp8_short_inv_walsh4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    ; end prolog

    ;-- fetch the whole input, one row of four shorts per register ----------
    mov         rdx, arg(0)                 ; rdx = input
    movq        mm0, [rdx + 0]              ; row 0 = ip[0..3]
    movq        mm1, [rdx + 8]              ; row 1 = ip[4..7]
    movq        mm2, [rdx + 16]             ; row 2 = ip[8..11]
    movq        mm3, [rdx + 24]             ; row 3 = ip[12..15]
    mov         rdx, arg(1)                 ; rdx = output from here on

    ;-- mm7 = rounding bias for pass 2: the value 3 in each of the 4 words --
    mov         rax, 30003h
    movq        mm7, rax                    ; 0000000000030003h
    punpcklwd   mm7, mm7                    ; 0003000300030003h

    ;-- pass 1: butterfly across rows (4 columns processed in parallel) -----
    movq        mm4, mm0
    paddw       mm4, mm3                    ; a = row0 + row3
    movq        mm5, mm1
    paddw       mm5, mm2                    ; b = row1 + row2
    psubw       mm0, mm3                    ; d = row0 - row3
    psubw       mm1, mm2                    ; c = row1 - row2

    movq        mm6, mm4                    ; keep a copy of a
    paddw       mm4, mm5                    ; mm4 = a + b  -> new row 0
    psubw       mm6, mm5                    ; mm6 = a - b  -> new row 2
    movq        mm5, mm0                    ; b is dead; keep a copy of d
    paddw       mm0, mm1                    ; mm0 = d + c  -> new row 1
    psubw       mm5, mm1                    ; mm5 = d - c  -> new row 3

    ;-- transpose the 4x4 intermediate --------------------------------------
    ; Notation "rc" = element at row r, column c; lanes listed high -> low.
    ;   mm4 = 03 02 01 00      mm0 = 13 12 11 10
    ;   mm6 = 23 22 21 20      mm5 = 33 32 31 30
    movq        mm3, mm4                    ; 03 02 01 00
    movq        mm1, mm6                    ; 23 22 21 20
    punpcklwd   mm4, mm0                    ; 11 01 10 00
    punpckhwd   mm3, mm0                    ; 13 03 12 02
    punpcklwd   mm6, mm5                    ; 31 21 30 20
    punpckhwd   mm1, mm5                    ; 33 23 32 22

    movq        mm0, mm4                    ; 11 01 10 00
    punpckldq   mm0, mm6                    ; 30 20 10 00 = column 0
    punpckhdq   mm4, mm6                    ; 31 21 11 01 = column 1
    movq        mm2, mm3                    ; 13 03 12 02
    punpckldq   mm2, mm1                    ; 32 22 12 02 = column 2
    punpckhdq   mm3, mm1                    ; 33 23 13 03 = column 3

    ;-- pass 2: same butterfly across columns (4 rows in parallel) ----------
    movq        mm1, mm0
    paddw       mm1, mm3                    ; a = col0 + col3
    movq        mm5, mm4
    paddw       mm5, mm2                    ; b = col1 + col2
    psubw       mm0, mm3                    ; d = col0 - col3
    psubw       mm4, mm2                    ; c = col1 - col2

    movq        mm6, mm1                    ; keep a copy of a
    paddw       mm1, mm5                    ; mm1 = a + b  (y0 of each row)
    psubw       mm6, mm5                    ; mm6 = a - b  (y2 of each row)
    movq        mm5, mm0                    ; b is dead; keep a copy of d
    paddw       mm0, mm4                    ; mm0 = d + c  (y1 of each row)
    psubw       mm5, mm4                    ; mm5 = d - c  (y3 of each row)

    ;-- round: (y + 3) >> 3, arithmetic, per 16-bit lane --------------------
    paddw       mm1, mm7
    paddw       mm0, mm7
    paddw       mm6, mm7
    paddw       mm5, mm7
    psraw       mm1, 3
    psraw       mm0, 3
    psraw       mm6, 3
    psraw       mm5, 3

    ;-- scatter: lane r of yN goes to output slot 4*r + N -------------------
    IWALSH4X4_MMX_STORE_PAIR mm1, mm0, 0    ; y0 -> 0,4,8,12   y1 -> 1,5,9,13
    IWALSH4X4_MMX_STORE_PAIR mm6, mm5, 2    ; y2 -> 2,6,10,14  y3 -> 3,7,11,15

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
140
141