;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
;                            short *diff, unsigned char *Predictor,
;                            int pitch);
;
; Writes a 4x4 block of 16-bit differences:  diff = z - Predictor.
;
;   z           source pixels; consecutive rows are src_stride bytes apart
;   src_stride  read as a signed 32-bit value
;   diff        output; consecutive rows are pitch shorts (pitch*2 bytes)
;               apart
;   Predictor   prediction pixels; consecutive rows are pitch bytes apart
;   pitch       read as a signed 32-bit value
;
; For every row, 4 bytes from each input are zero-extended to words
; (punpcklbw against the zeroed mm7) and subtracted with psubw; the four
; resulting words are stored with one movq.
;
; Register roles: rsi = z, rax = Predictor, rdi = diff, rdx = src_stride,
;                 rcx = pitch (3*pitch for the last row), mm7 = 0,
;                 mm0/mm1 = scratch.
; rsi and rdi are saved and restored.  No emms is executed in this
; routine, so the MMX state is still live when it returns.
;-----------------------------------------------------------------------
global sym(vp8_subtract_b_mmx_impl)
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(2)              ; diff
    mov         rax,        arg(3)              ; Predictor
    mov         rsi,        arg(0)              ; z
    movsxd      rdx,        dword ptr arg(1)    ; src_stride
    movsxd      rcx,        dword ptr arg(4)    ; pitch
    pxor        mm7,        mm7                 ; zero, used to widen bytes

    ; row 0
    movd        mm0,        [rsi]
    movd        mm1,        [rax]
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        [rdi],      mm0

    ; row 1
    movd        mm0,        [rsi+rdx]
    movd        mm1,        [rax+rcx]
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        [rdi+rcx*2], mm0

    ; row 2
    movd        mm0,        [rsi+rdx*2]
    movd        mm1,        [rax+rcx*2]
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        [rdi+rcx*4], mm0

    ; Row 3 sits at 3*stride / 3*pitch, which does not fit a single
    ; scaled-index operand: step z two rows on and triple rcx instead.
    lea         rsi,        [rsi+rdx*2]         ; rsi = z + 2*src_stride
    lea         rcx,        [rcx+rcx*2]         ; rcx = 3*pitch

    ; row 3
    movd        mm0,        [rsi+rdx]           ; z + 3*src_stride
    movd        mm1,        [rax+rcx]           ; Predictor + 3*pitch
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        [rdi+rcx*2], mm0                ; diff + 3*pitch shorts

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
;
; Writes 16 rows of 16 differences each:  diff = src - pred, as 16-bit
; words.
;
;   diff    output; advanced by 32 bytes (16 shorts) per row, so rows are
;           stored back to back
;   src     source pixels; advanced by stride bytes per row
;   pred    prediction pixels; advanced by 16 bytes per row
;   stride  read as a signed 32-bit value
;
; Each loop iteration handles one row as two 8-pixel halves.  Bytes are
; zero-extended to words by unpacking against the zeroed mm0, then
; subtracted with psubw.
;
; Register roles: rsi = src, rax = pred, rdi = diff, rdx = stride,
;                 rcx = rows remaining, mm0 = 0, mm1-mm4 = scratch.
; rsi and rdi are saved and restored.  No emms is executed in this
; routine, so the MMX state is still live when it returns.
;-----------------------------------------------------------------------
global sym(vp8_subtract_mby_mmx)
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(1)              ; src
    mov         rdi,        arg(0)              ; diff

    mov         rax,        arg(2)              ; pred
    movsxd      rdx,        dword ptr arg(3)    ; stride

    mov         rcx,        16                  ; row counter
    pxor        mm0,        mm0                 ; zero, used to widen bytes

; Local label: scoped to this function, not visible file-wide.
.submby_loop:

    ; pixels 0-7 of the row
    movq        mm1,        [rsi]
    movq        mm3,        [rax]

    movq        mm2,        mm1
    movq        mm4,        mm3

    punpcklbw   mm1,        mm0                 ; src  0-3 -> words
    punpcklbw   mm3,        mm0                 ; pred 0-3 -> words

    punpckhbw   mm2,        mm0                 ; src  4-7 -> words
    punpckhbw   mm4,        mm0                 ; pred 4-7 -> words

    psubw       mm1,        mm3
    psubw       mm2,        mm4

    movq        [rdi],      mm1
    movq        [rdi+8],    mm2

    ; pixels 8-15 of the row
    movq        mm1,        [rsi+8]
    movq        mm3,        [rax+8]

    movq        mm2,        mm1
    movq        mm4,        mm3

    punpcklbw   mm1,        mm0                 ; src  8-11 -> words
    punpcklbw   mm3,        mm0                 ; pred 8-11 -> words

    punpckhbw   mm2,        mm0                 ; src  12-15 -> words
    punpckhbw   mm4,        mm0                 ; pred 12-15 -> words

    psubw       mm1,        mm3
    psubw       mm2,        mm4

    movq        [rdi+16],   mm1
    movq        [rdi+24],   mm2

    ; advance to the next row
    add         rdi,        32                  ; 16 shorts of output
    add         rax,        16                  ; 16 predictor bytes

    lea         rsi,        [rsi+rdx]           ; src += stride

    sub         rcx,        1
    jnz         .submby_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; Helper macros for vp8_subtract_mbuv_mmx.  They expand to straight-line
; code only (no labels, no calls).
;-----------------------------------------------------------------------

; One row of 8 pixels: 8 source bytes minus 8 predictor bytes, stored as
; 8 words.
;   %1 = memory operand addressing the 8 source bytes
;   %2 = row index inside the current group of four rows; selects the
;        predictor bytes at [rax + 8*row] and the output words at
;        [rdi + 16*row]
; Expects mm7 == 0.  Clobbers mm0, mm1, mm3, mm4.
%macro VP8_SUBTRACT_ROW8_MMX 2
    movq        mm0,        %1
    movq        mm1,        [rax + 8*%2]
    movq        mm3,        mm0
    movq        mm4,        mm1
    punpcklbw   mm0,        mm7                 ; src  0-3 -> words
    punpcklbw   mm1,        mm7                 ; pred 0-3 -> words
    punpckhbw   mm3,        mm7                 ; src  4-7 -> words
    punpckhbw   mm4,        mm7                 ; pred 4-7 -> words
    psubw       mm0,        mm1
    psubw       mm3,        mm4
    movq        [rdi + 16*%2],      mm0
    movq        [rdi + 16*%2 + 8],  mm3
%endmacro

; Four consecutive 8-pixel rows starting at rsi (source rows rdx bytes
; apart), predictor rows at rax (8 bytes apart), output at rdi (16 bytes
; per row).  On exit rsi has been advanced by two source rows; rax, rdi
; and rdx are unchanged.
%macro VP8_SUBTRACT_4ROWS8_MMX 0
    VP8_SUBTRACT_ROW8_MMX [rsi],        0
    VP8_SUBTRACT_ROW8_MMX [rsi+rdx],    1
    VP8_SUBTRACT_ROW8_MMX [rsi+rdx*2],  2
    lea         rsi,        [rsi+rdx*2]         ; 3*stride is not encodable
    VP8_SUBTRACT_ROW8_MMX [rsi+rdx],    3       ; original row 3
%endmacro

; One full 8x8 plane.
;   %1 = index of the function argument holding the source pointer
;   %2 = element offset applied to both diff (in shorts) and pred (in
;        bytes)
; Reloads every pointer, the stride and the zero register from the
; arguments, so it does not depend on register state left by earlier code.
%macro VP8_SUBTRACT_PLANE8X8_MMX 2
    mov         rdi,        arg(0)              ; diff
    mov         rax,        arg(3)              ; pred
    mov         rsi,        arg(%1)             ; z = source plane
    add         rdi,        %2*2                ; diff = diff + %2 (shorts)
    add         rax,        %2                  ; Predictor = pred + %2
    movsxd      rdx,        dword ptr arg(4)    ; stride
    pxor        mm7,        mm7                 ; zero, used to widen bytes

    VP8_SUBTRACT_4ROWS8_MMX                     ; rows 0-3

    add         rdi,        64                  ; 4 rows * 8 shorts
    add         rax,        32                  ; 4 rows * 8 predictor bytes
    lea         rsi,        [rsi+rdx*2]         ; rsi is now 4 rows in

    VP8_SUBTRACT_4ROWS8_MMX                     ; rows 4-7
%endmacro

;-----------------------------------------------------------------------
;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
;
; Writes the differences for two 8x8 planes, as 16-bit words:
;
;   short *udiff = diff + 256;   unsigned char *upred = pred + 256;
;   short *vdiff = diff + 320;   unsigned char *vpred = pred + 320;
;
;   udiff = usrc - upred
;   vdiff = vsrc - vpred
;
;   usrc, vsrc  source planes; consecutive rows are stride bytes apart
;   pred        prediction buffer; rows of each plane are 8 bytes apart
;   diff        output buffer; rows of each plane are 8 shorts apart
;   stride      read as a signed 32-bit value
;
; rsi and rdi are saved and restored.  No emms is executed in this
; routine, so the MMX state is still live when it returns.
;-----------------------------------------------------------------------
global sym(vp8_subtract_mbuv_mmx)
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; U plane: z = usrc, diff = udiff, Predictor = upred
    VP8_SUBTRACT_PLANE8X8_MMX 1, 256

    ; V plane: z = vsrc, diff = vdiff, Predictor = vpred
    VP8_SUBTRACT_PLANE8X8_MMX 2, 320

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
