1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14
;-----------------------------------------------------------------------
; void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride,
;                          unsigned char *dst, int dst_stride)
;
; Copies 8 rows of 8 bytes each from src to dst using 64-bit MMX moves.
; Consecutive source rows are src_stride bytes apart, consecutive
; destination rows are dst_stride bytes apart.
;
; Register roles inside the body:
;   rsi = current source row pointer       rax = src_stride (sign-extended)
;   rdi = current destination row pointer  rcx = dst_stride (sign-extended)
; rsi/rdi are saved and restored; rax, rcx and mm0-mm5 are overwritten.
; No emms is executed in this routine.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem8x8_mmx) PRIVATE
sym(vp8_copy_mem8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch all four arguments before touching pixel data.
    mov         rsi, arg(0)                 ; src
    movsxd      rax, dword ptr arg(1)       ; src_stride
    mov         rdi, arg(2)                 ; dst
    movsxd      rcx, dword ptr arg(3)       ; dst_stride

    ; Rows 0..2: load all three, then store all three.
    movq        mm0, [rsi]
    movq        mm1, [rsi+rax]
    movq        mm2, [rsi+rax*2]
    movq        [rdi], mm0
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm2

    ; Step both pointers forward by three rows (2*stride + stride).
    lea         rsi, [rsi+rax*2]
    add         rsi, rax
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx

    ; Rows 3..5.
    movq        mm3, [rsi]
    movq        mm4, [rsi+rax]
    movq        mm5, [rsi+rax*2]
    movq        [rdi], mm3
    movq        [rdi+rcx], mm4
    movq        [rdi+rcx*2], mm5

    ; Step forward by two rows only: rows 6 and 7 are then reachable
    ; as +stride and +2*stride from the row-5 pointers.
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]

    ; Rows 6..7.
    movq        mm0, [rsi+rax]
    movq        mm1, [rsi+rax*2]
    movq        [rdi+rcx], mm0
    movq        [rdi+rcx*2], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
76
77
;-----------------------------------------------------------------------
; void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride,
;                          unsigned char *dst, int dst_stride)
;
; Copies 4 rows of 8 bytes each from src to dst using 64-bit MMX moves.
; Source rows are src_stride bytes apart, destination rows are
; dst_stride bytes apart.
;
; Register roles inside the body:
;   rsi = source row pointer       rax = src_stride (sign-extended)
;   rdi = destination row pointer  rcx = dst_stride (sign-extended)
; rsi/rdi are saved and restored; rax, rcx and mm0-mm3 are overwritten.
; No emms is executed in this routine.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem8x4_mmx) PRIVATE
sym(vp8_copy_mem8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch all four arguments before touching pixel data.
    mov         rsi, arg(0)                 ; src
    movsxd      rax, dword ptr arg(1)       ; src_stride
    mov         rdi, arg(2)                 ; dst
    movsxd      rcx, dword ptr arg(3)       ; dst_stride

    ; Rows 0..2: load all three, then store all three.
    movq        mm0, [rsi]
    movq        mm1, [rsi+rax]
    movq        mm2, [rsi+rax*2]
    movq        [rdi], mm0
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx*2], mm2

    ; Move both pointers to row 2 so that row 3 sits at +stride.
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]

    ; Row 3.
    movq        mm3, [rsi+rax]
    movq        [rdi+rcx], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
120
121
;-----------------------------------------------------------------------
; void vp8_copy_mem16x16_mmx(unsigned char *src, int src_stride,
;                            unsigned char *dst, int dst_stride)
;
; Copies 16 rows of 16 bytes each from src to dst. Every row is moved
; as two 8-byte MMX transfers (offsets +0 and +8). Source rows are
; src_stride bytes apart, destination rows are dst_stride bytes apart.
;
; Register roles inside the body:
;   rsi = current source row pointer       rax = src_stride (sign-extended)
;   rdi = current destination row pointer  rcx = dst_stride (sign-extended)
;   mm0/mm3, mm1/mm4, mm2/mm5 = low/high halves of three consecutive rows
; rsi/rdi are saved and restored; rax, rcx and mm0-mm5 are overwritten.
; No emms is executed in this routine.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_mmx) PRIVATE
sym(vp8_copy_mem16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; src
    movsxd      rax, dword ptr arg(1)       ; src_stride
    mov         rdi, arg(2)                 ; dst
    movsxd      rcx, dword ptr arg(3)       ; dst_stride

    ; Rows 0..14: five straight-line passes of three rows each. The
    ; assembler expands the %rep body in place, so no loop counter or
    ; branch is emitted.
%rep 5
    ; Load three full rows (low half, then high half of each).
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]
    movq        mm1, [rsi+rax]
    movq        mm4, [rsi+rax+8]
    movq        mm2, [rsi+rax*2]
    movq        mm5, [rsi+rax*2+8]

    ; Source pointer += 3 * src_stride.
    lea         rsi, [rsi+rax*2]
    add         rsi, rax

    ; Store the three rows.
    movq        [rdi], mm0
    movq        [rdi+8], mm3
    movq        [rdi+rcx], mm1
    movq        [rdi+rcx+8], mm4
    movq        [rdi+rcx*2], mm2
    movq        [rdi+rcx*2+8], mm5

    ; Destination pointer += 3 * dst_stride.
    lea         rdi, [rdi+rcx*2]
    add         rdi, rcx
%endrep

    ; Row 15: the single row left over after 5 * 3 rows.
    movq        mm0, [rsi]
    movq        mm3, [rsi+8]
    movq        [rdi], mm0
    movq        [rdi+8], mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
275