1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; Right-shift applied after each bilinear filter pass (mmx_bi_rd is added first).
%define mmx_filter_shift 7
15
;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
;
; Sums the squares of the 256 signed 16-bit values stored contiguously at
; src_ptr (16 passes of 16 values each) and returns the total.
;
; Register roles:
;   rax   read pointer into src_ptr, later the return value
;   rcx   passes remaining
;   mm4   two 32-bit partial sums of squares
;
; NOTE(review): SHADOW_ARGS_TO_STACK is invoked with 7 although only arg(0)
; is read here; left as-is because the macro lives in another file.
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 8                      ; 8-byte scratch slot for spilling mm4
    ; end prolog

        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16                 ; 16 passes x 32 bytes
        pxor        mm4, mm4                ; clear both partial sums

.NEXTROW:
        movq        mm0, [rax]              ; four words per register
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0                ; square each word, add adjacent pairs
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32                 ; step past the 16 values just consumed
        dec         rcx
        jnz         .NEXTROW                ; dec leaves CF untouched, so branch on ZF only
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx

    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
65
;void vpx_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For an 8x8 block of bytes, stores the sum of (src - ref) in *Sum and the
; sum of (src - ref)^2 in *SSE. rax is cleared before returning.
;
; Register roles:
;   rax / rbx   current src / ref row
;   rcx / rdx   src / ref stride (sign-extended from the int arguments)
;   mm5         four 16-bit running sums of differences
;   mm6         zero, used to widen bytes to words
;   mm7         two 32-bit running sums of squared differences

; Process one 8-pixel row, then step both row pointers by their strides.
%macro GET8X8VAR_ROW 0
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm1, [rbx]                  ; eight ref bytes
        movq        mm2, mm0
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; src bytes 0-3 widened to words
        punpckhbw   mm2, mm6                    ; src bytes 4-7 widened to words
        punpcklbw   mm1, mm6                    ; ref bytes 0-3 widened to words
        punpckhbw   mm3, mm6                    ; ref bytes 4-7 widened to words

        psubsw      mm0, mm1                    ; src - ref, low half
        psubsw      mm2, mm3                    ; src - ref, high half

        paddw       mm5, mm0                    ; add differences to the running sums
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square differences, add adjacent pairs
        pmaddwd     mm2, mm2
        paddd       mm7, mm0
        paddd       mm7, mm2

        add         rax, rcx                    ; next src row
        add         rbx, rdx                    ; next ref row
%endmacro

global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; scratch: [rsp] = mm7, [rsp+8] = mm5
    ; end prolog

        mov         rax, arg(0) ;[src_ptr]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        pxor        mm5, mm5                    ; difference sums = 0
        pxor        mm6, mm6                    ; constant zero
        pxor        mm7, mm7                    ; squared-difference sums = 0

%rep 8
        GET8X8VAR_ROW
%endrep

        ; Spill the accumulators and fold their lanes with integer adds.
        movq        QWORD PTR [rsp], mm7
        movq        QWORD PTR [rsp+8], mm5

        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+12]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+14]
        add         rdx, rcx    ;XSum

        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum

        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0

    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
307
;void
;vpx_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For a 4x4 block of bytes, stores the sum of (src - ref) in *Sum and the
; sum of (src - ref)^2 in *SSE. rax is cleared before returning.
;
; Register roles:
;   rax / rbx   current src / ref row
;   rcx / rdx   src / ref stride (sign-extended from the int arguments)
;   mm5         four 16-bit running sums of differences
;   mm6         zero, used to widen bytes to words
;   mm7         two 32-bit running sums of squared differences

; Process one 4-pixel row; the caller steps the row pointers.
%macro GET4X4VAR_ROW 0
        movd        mm0, [rax]                  ; four src bytes
        movd        mm1, [rbx]                  ; four ref bytes
        punpcklbw   mm0, mm6                    ; widen src to words
        punpcklbw   mm1, mm6                    ; widen ref to words
        psubsw      mm0, mm1                    ; src - ref
        paddw       mm5, mm0                    ; add differences to the running sums
        pmaddwd     mm0, mm0                    ; square differences, add adjacent pairs
        paddd       mm7, mm0
%endmacro

global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; scratch: [rsp] = mm7, [rsp+8] = mm5
    ; end prolog

        mov         rax, arg(0) ;[src_ptr]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        pxor        mm5, mm5                    ; difference sums = 0
        pxor        mm6, mm6                    ; constant zero
        pxor        mm7, mm7                    ; squared-difference sums = 0

        ; Rows 1-3: process, then advance to the next row.
%rep 3
        GET4X4VAR_ROW
        add         rax, rcx
        add         rbx, rdx
%endrep
        ; Row 4: no advance needed afterwards.
        GET4X4VAR_ROW

        ; Spill the accumulators and fold their lanes with integer adds.
        movq        QWORD PTR [rsp], mm7
        movq        QWORD PTR [rsp+8], mm5

        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+12]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+14]
        add         rdx, rcx    ;XSum

        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum

        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0

    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
416
;void vpx_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Runs a horizontal then a vertical two-tap filter over a 4-pixel-wide strip
; of ref_ptr, subtracts the matching src_ptr pixels from each of the four
; filtered rows, and stores the sum of those differences in *sum and the sum
; of their squares in *sumsquared.
;
; Register roles:
;   rax / rdx   HFilter / VFilter (two groups of four words, at +0 and +8)
;   rsi / rdi   current ref / src row
;   rcx         output rows remaining
;   mm0         zero, used to widen bytes to words
;   mm5         horizontally filtered previous ref row
;   mm6         four 16-bit running sums of differences
;   mm7         two 32-bit running sums of squared differences

; Horizontal pass on the row at rsi: mm1 = (p[x]*H0 + p[x+1]*H1 + rd) >> shift.
; Clobbers mm3.
%macro BIL4X4_HPASS 0
        movd            mm1,            [rsi]
        movd            mm3,            [rsi+1]
        punpcklbw       mm1,            mm0
        punpcklbw       mm3,            mm0
        pmullw          mm1,            [rax]
        pmullw          mm3,            [rax+8]
        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift
%endmacro

global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        mov             rax,            arg(4) ;HFilter
        mov             rdx,            arg(5) ;VFilter
        mov             rcx,            4

        pxor            mm0,            mm0
        pxor            mm6,            mm6
        pxor            mm7,            mm7

        ; Prime mm5 with the horizontally filtered first ref row.
        BIL4X4_HPASS
        movq            mm5,            mm1

%if ABI_IS_32BIT
        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line
%else
        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line
        add             rsi, r8
%endif

.next_row:
        BIL4X4_HPASS                                        ; mm1 = current filtered row

        ; Vertical pass: blend previous row (mm5) with current row (mm1).
        movq            mm3,            mm5
        movq            mm5,            mm1                 ; keep current row for the next pass
        pmullw          mm3,            [rdx]
        pmullw          mm1,            [rdx+8]
        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift

        ; Difference against the src row, then accumulate.
        movd            mm3,            [rdi]
        punpcklbw       mm3,            mm0
        psubw           mm1,            mm3
        paddw           mm6,            mm1
        pmaddwd         mm1,            mm1
        paddd           mm7,            mm1

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        add             rsi,            r8
        add             rdi,            r9
%endif
        dec             rcx
        jnz             .next_row

        ; Fold the two squared-difference lanes of mm7 into the low dword of mm4.
        movq            mm4,            mm7
        psrlq           mm4,            32
        paddd           mm4,            mm7

        ; Fold the four word lanes of mm6: place each word in the top half of a
        ; dword, add, then shift back down arithmetically to keep the sign.
        pxor            mm2,            mm2
        pxor            mm3,            mm3
        punpcklwd       mm2,            mm6
        punpckhwd       mm3,            mm6
        paddd           mm2,            mm3
        movq            mm6,            mm2
        psrlq           mm6,            32
        paddd           mm2,            mm6
        psrad           mm2,            16

        mov             rdi,            arg(6) ;sum
        mov             rsi,            arg(7) ;sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
552
;void vpx_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Runs a horizontal then a vertical two-tap filter over an 8-pixel-wide strip
; of ref_ptr, subtracts the matching src_ptr pixels from each of the Height
; filtered rows, and stores the sum of those differences in *sum and the sum
; of their squares in *sumsquared.
;
; Register roles:
;   rax / rdx   HFilter / VFilter (two groups of four words, at +0 and +8)
;   rsi / rdi   current ref / src row
;   rcx         output rows remaining (loaded from Height)
;   mm0         zero, used to widen bytes to words
;   mm5         horizontally filtered previous ref row, packed back to bytes
;   mm6         four 16-bit running sums of differences
;   mm7         two 32-bit running sums of squared differences

; Horizontal pass on the row at rsi. Leaves pixels 0-3 in mm1 and 4-7 in mm2,
; each (p[x]*H0 + p[x+1]*H1 + rd) >> shift. Clobbers mm3 and mm4.
%macro BIL8_HPASS 0
        movq            mm1,            [rsi]
        movq            mm3,            [rsi+1]
        movq            mm2,            mm1
        movq            mm4,            mm3

        punpcklbw       mm1,            mm0
        punpckhbw       mm2,            mm0
        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        pmullw          mm1,            [rax]
        pmullw          mm2,            [rax]
        pmullw          mm3,            [rax+8]
        pmullw          mm4,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        psraw           mm2,            mmx_filter_shift
%endmacro

global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        mov             rax,            arg(5) ;HFilter
        mov             rdx,            arg(6) ;VFilter

        pxor            mm0,            mm0
        pxor            mm6,            mm6
        pxor            mm7,            mm7

        ; Prime mm5 with the horizontally filtered first ref row.
        BIL8_HPASS
        movq            mm5,            mm1
        packuswb        mm5,            mm2

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        add             rsi,            r8
%endif

.next_row:
        BIL8_HPASS                                          ; mm1:mm2 = current filtered row

        ; Unpack the previous row, then stash the current one in its place.
        movq            mm3,            mm5
        movq            mm4,            mm5
        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0
        movq            mm5,            mm1
        packuswb        mm5,            mm2

        ; Vertical pass: blend previous row (mm3:mm4) with current row (mm1:mm2).
        pmullw          mm3,            [rdx]
        pmullw          mm4,            [rdx]
        pmullw          mm1,            [rdx+8]
        pmullw          mm2,            [rdx+8]
        paddw           mm1,            mm3
        paddw           mm2,            mm4
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift
        psraw           mm2,            mmx_filter_shift

        ; Difference against the src row, then accumulate.
        movq            mm3,            [rdi]
        movq            mm4,            mm3
        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0
        psubw           mm1,            mm3
        psubw           mm2,            mm4

        paddw           mm6,            mm1
        paddw           mm6,            mm2
        pmaddwd         mm1,            mm1
        pmaddwd         mm2,            mm2
        paddd           mm7,            mm1
        paddd           mm7,            mm2

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        add             rsi,            r8
        add             rdi,            r9
%endif
        dec             rcx
        jnz             .next_row

        ; Fold the two squared-difference lanes of mm7 into the low dword of mm4.
        movq            mm4,            mm7
        psrlq           mm4,            32
        paddd           mm4,            mm7

        ; Fold the four word lanes of mm6: place each word in the top half of a
        ; dword, add, then shift back down arithmetically to keep the sign.
        pxor            mm2,            mm2
        pxor            mm3,            mm3
        punpcklwd       mm2,            mm6
        punpckhwd       mm3,            mm6
        paddd           mm2,            mm3
        movq            mm6,            mm2
        psrlq           mm6,            32
        paddd           mm2,            mm6
        psrad           mm2,            16

        mov             rdi,            arg(7) ;sum
        mov             rsi,            arg(8) ;sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
739
SECTION_RODATA
; Rounding term added before each shift by mmx_filter_shift: four words of 64.
align 16
mmx_bi_rd:
    dw 64, 64, 64, 64
745