1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;-----------------------------------------------------------------------------
; unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of the squares of the 256 consecutive 16-bit values that
; start at src_ptr (16 iterations, 16 values = 32 bytes per iteration).
;
; Register use:
;   rax  read pointer, advanced by 32 bytes per iteration
;   rcx  iterations remaining
;   mm4  two 32-bit partial sums, added together for the return value
;-----------------------------------------------------------------------------
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1              ; one declared parameter; only arg(0) is read
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 8                  ; 8-byte scratch slot used to spill mm4
    ; end prolog

        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16
        pxor        mm4, mm4            ; clear both partial sums

.next_row:
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]

        ; pmaddwd x, x squares each signed word and adds adjacent pairs,
        ; leaving two 32-bit sums of squares per register.
        pmaddwd     mm0, mm0
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; Branch on ZF only. DEC does not write CF, so a CF-dependent
        ; condition such as JA would be testing the carry left by the
        ; pointer ADD above rather than the counter.
        jnz         .next_row

        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
64
65
;-----------------------------------------------------------------------------
; unsigned int vp8_get8x8var_mmx
; (
;     unsigned char *src_ptr,
;     int  source_stride,
;     unsigned char *ref_ptr,
;     int  recon_stride,
;     unsigned int *SSE,
;     int *Sum
; )
;
; Walks eight rows of eight bytes from src_ptr and ref_ptr. Writes the sum of
; the byte differences (src - ref) to *Sum and the sum of the squared
; differences to *SSE. The return value is always 0.
;
; Register use:
;   rax / rcx  source row pointer / source stride
;   rbx / rdx  reference row pointer / reference stride
;   mm5        four 16-bit lanes accumulating the differences
;   mm6        constant zero, used to widen bytes to words
;   mm7        two 32-bit lanes accumulating the squared differences
;-----------------------------------------------------------------------------
global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                 ; scratch: [rsp] <- mm7, [rsp+8] <- mm5
    ; end prolog

        mov         rax, arg(0) ;[src_ptr]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        pxor        mm5, mm5                    ; difference accumulator
        pxor        mm6, mm6                    ; zero for byte->word unpack
        pxor        mm7, mm7                    ; squared-difference accumulator

        ; One expansion per row; the assembled code stays straight-line.
%rep 8
        movq        mm0, [rax]                  ; eight source bytes
        movq        mm1, [rbx]                  ; eight reference bytes
        movq        mm2, mm0
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; source bytes 0-3 as words
        punpckhbw   mm2, mm6                    ; source bytes 4-7 as words
        punpcklbw   mm1, mm6                    ; reference bytes 0-3 as words
        punpckhbw   mm3, mm6                    ; reference bytes 4-7 as words

        psubsw      mm0, mm1                    ; src - ref, low half
        psubsw      mm2, mm3                    ; src - ref, high half

        paddw       mm5, mm0
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, pairwise add to dwords
        pmaddwd     mm2, mm2
        paddd       mm7, mm0
        paddd       mm7, mm2

        add         rax, rcx                    ; next source row
        add         rbx, rdx                    ; next reference row
%endrep

        ; Spill both accumulators and fold their lanes with integer adds.
        movq        QWORD PTR [rsp], mm7
        movq        QWORD PTR [rsp+8], mm5

        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum

        movsx       rdx, WORD PTR [rsp+8]       ; sign-extend each 16-bit lane
        movsx       rcx, WORD PTR [rsp+10]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+12]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+14]
        add         rdx, rcx                    ; rdx = sum of differences

        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx                    ; rax = sum of squared differences

        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         eax, eax                    ; return 0


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
310
311
312
;-----------------------------------------------------------------------------
; unsigned int
; vp8_get4x4var_mmx
; (
;     unsigned char *src_ptr,
;     int  source_stride,
;     unsigned char *ref_ptr,
;     int  recon_stride,
;     unsigned int *SSE,
;     int *Sum
; )
;
; Walks four rows of four bytes from src_ptr and ref_ptr. Writes the sum of
; the byte differences (src - ref) to *Sum and the sum of the squared
; differences to *SSE. The return value is always 0.
;
; Each row is fetched with a 4-byte movd. Only the low four bytes of a row
; take part in the arithmetic (punpcklbw widens just those), so an 8-byte
; load would touch four bytes beyond the row for no benefit.
;
; Register use:
;   rax / rcx  source row pointer / source stride
;   rbx / rdx  reference row pointer / reference stride
;   mm5        four 16-bit lanes accumulating the differences
;   mm6        constant zero, used to widen bytes to words
;   mm7        two 32-bit lanes accumulating the squared differences
;-----------------------------------------------------------------------------
global sym(vp8_get4x4var_mmx) PRIVATE
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                 ; scratch: [rsp] <- mm7, [rsp+8] <- mm5
    ; end prolog

        mov         rax, arg(0) ;[src_ptr]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        pxor        mm5, mm5                    ; difference accumulator
        pxor        mm6, mm6                    ; zero for byte->word unpack
        pxor        mm7, mm7                    ; squared-difference accumulator

        ; One expansion per row; the assembled code stays straight-line.
%rep 4
        movd        mm0, [rax]                  ; four source bytes
        movd        mm1, [rbx]                  ; four reference bytes
        punpcklbw   mm0, mm6                    ; widen to four words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        paddw       mm5, mm0
        pmaddwd     mm0, mm0                    ; square, pairwise add to dwords
        paddd       mm7, mm0
        add         rax, rcx                    ; next source row
        add         rbx, rdx                    ; next reference row
%endrep

        ; Spill both accumulators and fold their lanes with integer adds.
        movq        QWORD PTR [rsp+8], mm5
        movq        QWORD PTR [rsp], mm7
        movsx       rdx, WORD PTR [rsp+8]       ; sign-extend each 16-bit lane
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx                    ; rdx = sum of differences
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx                    ; rax = sum of squared differences
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
425
426
427
;-----------------------------------------------------------------------------
; unsigned int
; vp8_get4x4sse_cs_mmx
; (
;     unsigned char *src_ptr,
;     int  source_stride,
;     unsigned char *ref_ptr,
;     int  recon_stride
; )
;
; Returns the sum of squared byte differences (src - ref) over four rows of
; four bytes.
;
; Register use:
;   rax / rcx  source row pointer / source stride
;   rbx / rdx  reference row pointer / reference stride
;   mm6        constant zero, used to widen bytes to words
;   mm7        two 32-bit lanes accumulating the squared differences
;-----------------------------------------------------------------------------
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi
    push rdi
    push rbx
    ; end prolog

        mov         rax, arg(0) ;[src_ptr]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        pxor        mm6, mm6                    ; zero for byte->word unpack
        pxor        mm7, mm7                    ; squared-difference accumulator

        ; One expansion per row; the assembled code stays straight-line.
%rep 4
        movd        mm0, [rax]                  ; four source bytes
        movd        mm1, [rbx]                  ; four reference bytes
        punpcklbw   mm0, mm6                    ; widen to four words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square, pairwise add to dwords
        paddd       mm7, mm0
        add         rax, rcx                    ; next source row
        add         rbx, rdx                    ; next reference row
%endrep

        ; Fold the two 32-bit lanes into the low lane.
        movq        mm0,    mm7
        psrlq       mm7,    32
        paddd       mm0,    mm7

        ; Return only the low 32 bits. The high lane of mm0 still holds the
        ; second partial sum, so a full 64-bit move would leave it in the
        ; upper half of rax; movd to eax avoids that and has an encoding in
        ; both 32-bit and 64-bit modes.
        movd        eax,    mm0


    ; begin epilog
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
511
; Arithmetic right shift applied after each bilinear filter pass, once the
; mmx_bi_rd rounding term has been added.
%define mmx_filter_shift            7

;-----------------------------------------------------------------------------
;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Filters a 4-wide block from ref_ptr horizontally (taps at [HFilter] and
; [HFilter+8]) and then vertically (taps at [VFilter] and [VFilter+8]),
; subtracts four rows of src_ptr from the result, and stores the sum of the
; differences to *sum and the sum of their squares to *sumsquared.
; Five reference rows are read: one before the loop plus one per iteration.
;
; Register use:
;   rsi / rdi  reference row pointer / source row pointer
;   rax / rdx  HFilter / VFilter
;   rcx        output rows remaining
;   mm0        constant zero, used to widen bytes to words
;   mm5        horizontally filtered previous row
;   mm6        four 16-bit lanes accumulating the differences
;   mm7        two 32-bit lanes accumulating the squared differences
;-----------------------------------------------------------------------------
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        mov             rax,            arg(4) ;HFilter
        mov             rdx,            arg(5) ;VFilter
        mov             rcx,            4

        pxor            mm0,            mm0
        pxor            mm6,            mm6
        pxor            mm7,            mm7

        ; Horizontal pass over the first reference row; kept in mm5 as the
        ; "previous row" for the first vertical pass.
        movd            mm1,            [rsi]
        movd            mm3,            [rsi+1]
        punpcklbw       mm1,            mm0
        punpcklbw       mm3,            mm0
        pmullw          mm1,            [rax]
        pmullw          mm3,            [rax+8]
        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift
        movq            mm5,            mm1

%if ABI_IS_32BIT
        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line
%else
        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line
        add             rsi, r8
%endif

.next_row:
        ; Horizontal pass over the current reference row -> mm1.
        movd            mm1,            [rsi]
        movd            mm3,            [rsi+1]
        punpcklbw       mm1,            mm0
        punpcklbw       mm3,            mm0
        pmullw          mm1,            [rax]
        pmullw          mm3,            [rax+8]
        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift

        ; Vertical pass: previous row (mm5) with the first tap, current row
        ; with the second. The current row replaces mm5 for the next pass.
        movq            mm3,            mm5
        movq            mm5,            mm1
        pmullw          mm3,            [rdx]
        pmullw          mm1,            [rdx+8]
        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift

        ; Difference against the source row, then accumulate.
        movd            mm3,            [rdi]
        punpcklbw       mm3,            mm0
        psubw           mm1,            mm3
        paddw           mm6,            mm1
        pmaddwd         mm1,            mm1
        paddd           mm7,            mm1

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        add             rsi,            r8
        add             rdi,            r9
%endif
        dec             rcx
        jnz             .next_row

        ; Squared differences: fold the two 32-bit lanes of mm7 into mm4.
        movq            mm4,            mm7
        psrlq           mm4,            32
        paddd           mm4,            mm7

        ; Differences: place each 16-bit lane of mm6 in the upper half of a
        ; dword, add the dwords, then shift right arithmetically by 16 so the
        ; total comes out sign-extended.
        pxor            mm2,            mm2
        pxor            mm3,            mm3
        punpcklwd       mm2,            mm6
        punpckhwd       mm3,            mm6
        paddd           mm2,            mm3
        movq            mm6,            mm2
        psrlq           mm6,            32
        paddd           mm2,            mm6
        psrad           mm2,            16

        mov             rdi,            arg(6) ;sum
        mov             rsi,            arg(7) ;sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4



    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
654
655
656
657
;-----------------------------------------------------------------------------
;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Filters an 8-wide block from ref_ptr horizontally (taps at [HFilter] and
; [HFilter+8]) and then vertically (taps at [VFilter] and [VFilter+8]),
; subtracts Height rows of src_ptr from the result, and stores the sum of
; the differences to *sum and the sum of their squares to *sumsquared.
; Height + 1 reference rows are read: one before the loop plus one per
; iteration.
;
; Register use:
;   rsi / rdi  reference row pointer / source row pointer
;   rax / rdx  HFilter / VFilter
;   rcx        output rows remaining
;   mm0        constant zero, used to widen bytes to words
;   mm5        horizontally filtered previous row, packed back to bytes
;   mm6        four 16-bit lanes accumulating the differences
;   mm7        two 32-bit lanes accumulating the squared differences
;-----------------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        mov             rax,            arg(5) ;HFilter
        mov             rdx,            arg(6) ;VFilter

        pxor            mm0,            mm0
        pxor            mm6,            mm6
        pxor            mm7,            mm7

        ; Horizontal pass over the first reference row. mm1/mm2 hold the
        ; low/high four pixels at offset 0, mm3/mm4 those at offset 1.
        movq            mm1,            [rsi]
        movq            mm3,            [rsi+1]
        movq            mm2,            mm1
        movq            mm4,            mm3

        punpcklbw       mm1,            mm0
        punpckhbw       mm2,            mm0
        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        pmullw          mm1,            [rax]
        pmullw          mm2,            [rax]
        pmullw          mm3,            [rax+8]
        pmullw          mm4,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        psraw           mm2,            mmx_filter_shift

        ; Keep the filtered row as eight packed bytes for the vertical pass.
        movq            mm5,            mm1
        packuswb        mm5,            mm2

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        add             rsi,            r8
%endif

.next_row:
        ; Horizontal pass over the current reference row -> mm1 (low half),
        ; mm2 (high half).
        movq            mm1,            [rsi]
        movq            mm3,            [rsi+1]
        movq            mm2,            mm1
        movq            mm4,            mm3

        punpcklbw       mm1,            mm0
        punpckhbw       mm2,            mm0
        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        pmullw          mm1,            [rax]
        pmullw          mm2,            [rax]
        pmullw          mm3,            [rax+8]
        pmullw          mm4,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        psraw           mm2,            mmx_filter_shift

        ; Widen the previous row out of mm5, then store the current row there
        ; for the next iteration.
        movq            mm3,            mm5
        movq            mm4,            mm5
        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        movq            mm5,            mm1
        packuswb        mm5,            mm2

        ; Vertical pass: previous row with the first tap, current row with
        ; the second.
        pmullw          mm3,            [rdx]
        pmullw          mm1,            [rdx+8]
        paddw           mm1,            mm3

        pmullw          mm4,            [rdx]
        pmullw          mm2,            [rdx+8]
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        psraw           mm2,            mmx_filter_shift

        ; Difference against the source row, then accumulate.
        movq            mm3,            [rdi]
        movq            mm4,            mm3
        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        psubw           mm1,            mm3
        psubw           mm2,            mm4

        paddw           mm6,            mm1
        paddw           mm6,            mm2

        pmaddwd         mm1,            mm1
        pmaddwd         mm2,            mm2

        paddd           mm7,            mm1
        paddd           mm7,            mm2

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        add             rsi,            r8
        add             rdi,            r9
%endif
        dec             rcx
        jnz             .next_row

        ; Squared differences: fold the two 32-bit lanes of mm7 into mm4.
        movq            mm4,            mm7
        psrlq           mm4,            32
        paddd           mm4,            mm7

        ; Differences: place each 16-bit lane of mm6 in the upper half of a
        ; dword, add the dwords, then shift right arithmetically by 16 so the
        ; total comes out sign-extended.
        pxor            mm2,            mm2
        pxor            mm3,            mm3
        punpcklwd       mm2,            mm6
        punpckhwd       mm3,            mm6
        paddd           mm2,            mm3
        movq            mm6,            mm2
        psrlq           mm6,            32
        paddd           mm2,            mm6
        psrad           mm2,            16

        mov             rdi,            arg(7) ;sum
        mov             rsi,            arg(8) ;sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
845
846
SECTION_RODATA
; short mmx_bi_rd[4] = { 64, 64, 64, 64 };
; Per-lane term the bilinear filter routines add to each 16-bit product sum
; just before shifting right by mmx_filter_shift.
align 16
mmx_bi_rd:
    dw 64, 64, 64, 64
852