1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14%define xmm_filter_shift            7
15
;unsigned int vp8_get_mb_ss_sse2
;(
;    short *src_ptr
;)
; Returns the sum of the squares of the 256 consecutive 16-bit values that
; start at src_ptr (8 loop passes, 64 bytes consumed per pass).
; src_ptr is read with movdqa, so it has to be 16-byte aligned.
; The body itself only touches rax, rcx, xmm0-xmm4 and the flags; rsi, rdi
; and the 16 bytes reserved on the stack are saved/reserved but not used.
global sym(vp8_get_mb_ss_sse2)
sym(vp8_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog


        mov         rax, arg(0) ;[src_ptr]
        mov         rcx, 8                  ; 8 passes x 32 shorts = 256 values
        pxor        xmm4, xmm4              ; xmm4 = four running dword sums

NEXTROW:
        movdqa      xmm0, [rax]
        movdqa      xmm1, [rax+16]
        movdqa      xmm2, [rax+32]
        movdqa      xmm3, [rax+48]
        pmaddwd     xmm0, xmm0              ; square each word, add adjacent pairs
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm2, xmm2
        pmaddwd     xmm3, xmm3

        paddd       xmm0, xmm1
        paddd       xmm2, xmm3
        paddd       xmm4, xmm0
        paddd       xmm4, xmm2

        add         rax, 0x40
        dec         rcx
        ; jnz, not ja: dec leaves CF untouched, so ja would also depend on the
        ; carry produced by the pointer add above.
        jnz         NEXTROW

        ; horizontal add of the four dword lanes into lane 0
        movdqa      xmm3,xmm4
        psrldq      xmm4,8
        paddd       xmm4,xmm3
        movdqa      xmm3,xmm4
        psrldq      xmm4,4
        paddd       xmm4,xmm3
        movq        rax,xmm4                ; low dword = total; bits above it are
                                            ; leftover partial sums


    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
72
73
;unsigned int vp8_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr and stores
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; Rows are loaded with movdqu, so neither pointer needs to be aligned.
; No return value is computed: rax still holds the Sum pointer on exit.
;
; xmm6/xmm7 are used as accumulators, so the routine is bracketed with
; SAVE_XMM / RESTORE_XMM in the same position as the other routines in this
; file that use those registers (they are callee-saved on Win64).
global sym(vp8_get16x16var_sse2)
sym(vp8_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    push rbx
    push rsi
    push rdi
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

        ; Prefetch data: first 8 rows of the source block ...
        lea             rcx,    [rax+rax*2]             ; rcx = 3 * source_stride
        prefetcht0      [rsi]
        prefetcht0      [rsi+rax]
        prefetcht0      [rsi+rax*2]
        prefetcht0      [rsi+rcx]
        lea             rbx,    [rsi+rax*4]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax*2]
        prefetcht0      [rbx+rcx]

        ; ... and first 8 rows of the reference block
        lea             rcx,    [rdx+rdx*2]             ; rcx = 3 * recon_stride
        prefetcht0      [rdi]
        prefetcht0      [rdi+rdx]
        prefetcht0      [rdi+rdx*2]
        prefetcht0      [rdi+rcx]
        lea             rbx,    [rdi+rdx*4]
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx*2]
        prefetcht0      [rbx+rcx]

        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs

        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
        mov         rcx,            16                          ; row counter

var16loop:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        prefetcht0      [rsi+rax*8]                             ; 8 rows ahead
        prefetcht0      [rdi+rdx*8]

        movdqa      xmm3,           xmm1
        movdqa      xmm4,           xmm2


        punpcklbw   xmm1,           xmm0                        ; src bytes 0-7  -> words
        punpckhbw   xmm3,           xmm0                        ; src bytes 8-15 -> words

        punpcklbw   xmm2,           xmm0                        ; ref bytes 0-7  -> words
        punpckhbw   xmm4,           xmm0                        ; ref bytes 8-15 -> words


        psubw       xmm1,           xmm2
        psubw       xmm3,           xmm4

        paddw       xmm7,           xmm1                        ; word-wise running sum of diffs
        pmaddwd     xmm1,           xmm1

        paddw       xmm7,           xmm3
        pmaddwd     xmm3,           xmm3

        paddd       xmm6,           xmm1                        ; dword-wise running sum of squares
        paddd       xmm6,           xmm3

        add         rsi,            rax
        add         rdi,            rdx

        sub         rcx,            1
        jnz         var16loop


        movdqa      xmm1,           xmm6                        ; xmm1 = sse lanes
        pxor        xmm6,           xmm6

        ; sign-extend the 8 word sums to dwords: place each word in the upper
        ; half of a dword, then arithmetic-shift it back down
        pxor        xmm5,           xmm5
        punpcklwd   xmm6,           xmm7

        punpckhwd   xmm5,           xmm7
        psrad       xmm5,           16

        psrad       xmm6,           16
        paddd       xmm6,           xmm5                        ; xmm6 = 4 dword sums

        ; fold both accumulators down to lane 0
        movdqa      xmm2,           xmm1
        punpckldq   xmm1,           xmm0

        punpckhdq   xmm2,           xmm0
        movdqa      xmm7,           xmm6

        paddd       xmm1,           xmm2
        punpckldq   xmm6,           xmm0

        punpckhdq   xmm7,           xmm0
        paddd       xmm6,           xmm7

        movdqa      xmm2,           xmm1
        movdqa      xmm7,           xmm6

        psrldq      xmm1,           8
        psrldq      xmm6,           8

        paddd       xmm7,           xmm6                        ; low dword = Sum
        paddd       xmm1,           xmm2                        ; low dword = SSE

        mov         rax,            arg(5) ;[Sum]
        mov         rdi,            arg(4) ;[SSE]

        movd DWORD PTR [rax],       xmm7
        movd DWORD PTR [rdi],       xmm1


    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
212
213
;unsigned int vp8_get16x16pred_error_sse2
;(
;   unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride
;)
; Accumulates Sum = sum(src - ref) and SSE = sum((src - ref)^2) over 16 rows
; of 16 bytes and returns  SSE - ((Sum * Sum) >> 8)  in rax.
; Rows are loaded with movdqu, so the pointers need not be aligned.
; The 16 bytes reserved at [rsp] hold Sum (offset 0) and SSE (offset 4)
; while they are moved from the xmm registers to general registers.
;
; xmm6/xmm7 are used as accumulators, so the routine is bracketed with
; SAVE_XMM / RESTORE_XMM in the same position as the other routines in this
; file that use those registers (they are callee-saved on Win64).
global sym(vp8_get16x16pred_error_sse2)
sym(vp8_get16x16pred_error_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]

        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs

        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
        mov         rcx,            16                          ; row counter

var16peloop:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        movdqa      xmm3,           xmm1
        movdqa      xmm4,           xmm2

        punpcklbw   xmm1,           xmm0                        ; src bytes 0-7  -> words
        punpckhbw   xmm3,           xmm0                        ; src bytes 8-15 -> words

        punpcklbw   xmm2,           xmm0                        ; ref bytes 0-7  -> words
        punpckhbw   xmm4,           xmm0                        ; ref bytes 8-15 -> words

        psubw       xmm1,           xmm2
        psubw       xmm3,           xmm4

        paddw       xmm7,           xmm1                        ; word-wise running sum of diffs
        pmaddwd     xmm1,           xmm1

        paddw       xmm7,           xmm3
        pmaddwd     xmm3,           xmm3

        paddd       xmm6,           xmm1                        ; dword-wise running sum of squares
        paddd       xmm6,           xmm3

        add         rsi,            rax
        add         rdi,            rdx

        sub         rcx,            1
        jnz         var16peloop


        movdqa      xmm1,           xmm6                        ; xmm1 = sse lanes
        pxor        xmm6,           xmm6

        ; sign-extend the 8 word sums to dwords: place each word in the upper
        ; half of a dword, then arithmetic-shift it back down
        pxor        xmm5,           xmm5
        punpcklwd   xmm6,           xmm7

        punpckhwd   xmm5,           xmm7
        psrad       xmm5,           16

        psrad       xmm6,           16
        paddd       xmm6,           xmm5                        ; xmm6 = 4 dword sums

        ; fold both accumulators down to lane 0
        movdqa      xmm2,           xmm1
        punpckldq   xmm1,           xmm0

        punpckhdq   xmm2,           xmm0
        movdqa      xmm7,           xmm6

        paddd       xmm1,           xmm2
        punpckldq   xmm6,           xmm0

        punpckhdq   xmm7,           xmm0
        paddd       xmm6,           xmm7

        movdqa      xmm2,           xmm1
        movdqa      xmm7,           xmm6

        psrldq      xmm1,           8
        psrldq      xmm6,           8

        paddd       xmm7,           xmm6
        paddd       xmm1,           xmm2

        movd DWORD PTR [rsp],       xmm7  ;Sum
        movd DWORD PTR [rsp+4],     xmm1  ;SSE

        ; return (SSE-((Sum*Sum)>>8));
        movsxd      rdx, dword ptr [rsp]
        imul        rdx, rdx
        sar         rdx, 8
        movsxd      rax, dword ptr [rsp + 4]
        sub         rax, rdx

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
327
328
329
;unsigned int vp8_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
; Fully unrolled over 8 rows of 8 bytes. Stores
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; No return value is computed: rax still holds the Sum pointer on exit.
; The difference sum is kept in 16-bit lanes all the way through (paddw) and
; only the low word is sign-extended at the end.
;
; xmm6/xmm7 are used, so the routine is bracketed with SAVE_XMM / RESTORE_XMM
; in the same position as the other routines in this file that use those
; registers (they are callee-saved on Win64).
global sym(vp8_get8x8var_sse2)
sym(vp8_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs

        ; row 0 (xmm1 doubles as the sse accumulator from here on)
        movq        xmm1,           QWORD PTR [rsi]
        movq        xmm2,           QWORD PTR [rdi]

        punpcklbw   xmm1,           xmm0
        punpcklbw   xmm2,           xmm0

        psubsw      xmm1,           xmm2
        paddw       xmm7,           xmm1

        pmaddwd     xmm1,           xmm1

        ; row 1
        movq        xmm2,           QWORD PTR[rsi + rax]
        movq        xmm3,           QWORD PTR[rdi + rdx]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2


        ; row 2
        movq        xmm2,           QWORD PTR[rsi + rax * 2]
        movq        xmm3,           QWORD PTR[rdi + rdx * 2]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2


        ; advance two rows; row 3
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]
        movq        xmm2,           QWORD PTR[rsi + rax]
        movq        xmm3,           QWORD PTR[rdi + rdx]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        ; row 4
        movq        xmm2,           QWORD PTR[rsi + rax *2]
        movq        xmm3,           QWORD PTR[rdi + rdx *2]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2


        ; advance two rows; row 5
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]


        movq        xmm2,           QWORD PTR[rsi + rax]
        movq        xmm3,           QWORD PTR[rdi + rdx]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        ; row 6
        movq        xmm2,           QWORD PTR[rsi + rax *2]
        movq        xmm3,           QWORD PTR[rdi + rdx *2]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2


        ; advance two rows; row 7
        lea         rsi,            [rsi + rax * 2]
        lea         rdi,            [rdi + rdx * 2]

        movq        xmm2,           QWORD PTR[rsi + rax]
        movq        xmm3,           QWORD PTR[rdi + rdx]

        punpcklbw   xmm2,           xmm0
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2


        ; Fold the accumulators down to lane 0. The diff sum is spread with
        ; zero (not sign) extension and added with paddw, so only the low
        ; 16 bits of the result are meaningful.
        movdqa      xmm6,           xmm7
        punpcklwd   xmm6,           xmm0

        punpckhwd   xmm7,           xmm0
        movdqa      xmm2,           xmm1

        paddw       xmm6,           xmm7
        punpckldq   xmm1,           xmm0

        punpckhdq   xmm2,           xmm0
        movdqa      xmm7,           xmm6

        paddd       xmm1,           xmm2
        punpckldq   xmm6,           xmm0

        punpckhdq   xmm7,           xmm0
        paddw       xmm6,           xmm7

        movdqa      xmm2,           xmm1
        movdqa      xmm7,           xmm6

        psrldq      xmm1,           8
        psrldq      xmm6,           8

        paddw       xmm7,           xmm6
        paddd       xmm1,           xmm2

        mov         rax,            arg(5) ;[Sum]
        mov         rdi,            arg(4) ;[SSE]

        movq        rdx,            xmm7
        movsx       rcx,            dx                          ; sign-extend the 16-bit sum

        mov  dword ptr [rax],       ecx
        movd DWORD PTR [rdi],       xmm1

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
512
;void vp8_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
; For an 8-pixel-wide block of Height rows, filters ref_ptr with the two-tap
; coefficients selected by xoffset (horizontal) and yoffset (vertical), then
; stores
;   *sum        = sum of (filtered_ref - src)
;   *sumsquared = sum of (filtered_ref - src)^2
; Each filter-table entry is 32 bytes (offset << 5): tap 0 at +0, tap 1 at +16.
; Four code paths are selected by the offsets:
;   xoffset != 0, yoffset != 0 : both passes
;   xoffset == 0, yoffset != 0 : vertical pass only   (..._sp_only)
;   xoffset != 0, yoffset == 0 : horizontal pass only (..._fp_only)
;   xoffset == 0, yoffset == 0 : plain difference     (..._full_pixel)
; All paths accumulate into xmm6 (word diffs) / xmm7 (dword squares) and join
; at filter_block2d_bil_variance.
; NOTE(review): the final reduction uses MMX registers and no emms is issued
; in this routine; presumably the caller clears the MMX state -- confirm.
global sym(vp8_filter_block2d_bil_var_sse2)
sym(vp8_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
    push rbx
    ; end prolog

        pxor            xmm6,           xmm6                 ; diff accumulator (words)
        pxor            xmm7,           xmm7                 ; sse accumulator (dwords)

        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
        movdqa          xmm4,           XMMWORD PTR [rsi]

        lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
        movsxd          rax,            dword ptr arg(5)     ; xoffset

        test            rax,            rax                  ; skip first_pass filter if xoffset=0
        je              filter_block2d_bil_var_sse2_sp_only

        shl             rax,            5                    ; point to filter coeff with xoffset
        lea             rax,            [rax + rcx]          ; HFilter

        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
        je              filter_block2d_bil_var_sse2_fp_only

        shl             rdx,            5
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height

        ; horizontally filter row 0 up front; it is the "previous row" for
        ; the first vertical step
        pxor            xmm0,           xmm0                 ;
        movq            xmm1,           QWORD PTR [rsi]      ;
        movq            xmm3,           QWORD PTR [rsi+1]    ;

        punpcklbw       xmm1,           xmm0                 ;
        pmullw          xmm1,           [rax]                ;
        punpcklbw       xmm3,           xmm0
        pmullw          xmm3,           [rax+16]             ;

        paddw           xmm1,           xmm3                 ;
        paddw           xmm1,           xmm4                 ;
        psraw           xmm1,           xmm_filter_shift     ;
        movdqa          xmm5,           xmm1                 ; xmm5 = previous filtered row

        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
        lea             rsi,            [rsi + rbx]
%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
%endif

filter_block2d_bil_var_sse2_loop:
        ; horizontal pass on the current row
        movq            xmm1,           QWORD PTR [rsi]               ;
        movq            xmm3,           QWORD PTR [rsi+1]             ;

        punpcklbw       xmm1,           xmm0                 ;
        pmullw          xmm1,           [rax]               ;
        punpcklbw       xmm3,           xmm0                 ;
        pmullw          xmm3,           [rax+16]             ;

        paddw           xmm1,           xmm3                 ;
        paddw           xmm1,           xmm4               ;
        psraw           xmm1,           xmm_filter_shift    ;

        movdqa          xmm3,           xmm5                 ; previous filtered row
        movdqa          xmm5,           xmm1                 ; keep current row for next pass

        ; vertical pass: previous row * tap 0 + current row * tap 1
        pmullw          xmm3,           [rdx]               ;
        pmullw          xmm1,           [rdx+16]             ;
        paddw           xmm1,           xmm3                 ;
        paddw           xmm1,           xmm4                 ;
        psraw           xmm1,           xmm_filter_shift    ;

        movq            xmm3,           QWORD PTR [rdi]               ;
        punpcklbw       xmm3,           xmm0                 ;

        psubw           xmm1,           xmm3                 ;
        paddw           xmm6,           xmm1                 ;

        pmaddwd         xmm1,           xmm1                 ;
        paddd           xmm7,           xmm1                 ;

        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1                   ;
        jnz             filter_block2d_bil_var_sse2_loop       ;

        jmp             filter_block2d_bil_variance

filter_block2d_bil_var_sse2_sp_only:
        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip all if both xoffset=0 and yoffset=0
        je              filter_block2d_bil_var_sse2_full_pixel

        shl             rdx,            5
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

        pxor            xmm0,           xmm0                 ;
        movq            xmm1,           QWORD PTR [rsi]      ; row 0 = first "previous row"
        punpcklbw       xmm1,           xmm0                 ;

        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
        lea             rsi,            [rsi + rax]

filter_block2d_bil_sp_only_loop:
        movq            xmm3,           QWORD PTR [rsi]             ;
        punpcklbw       xmm3,           xmm0                 ;
        movdqa          xmm5,           xmm3                 ; keep current row for next pass

        pmullw          xmm1,           [rdx]               ;
        pmullw          xmm3,           [rdx+16]             ;
        paddw           xmm1,           xmm3                 ;
        paddw           xmm1,           xmm4                 ;
        psraw           xmm1,           xmm_filter_shift    ;

        movq            xmm3,           QWORD PTR [rdi]               ;
        punpcklbw       xmm3,           xmm0                 ;

        psubw           xmm1,           xmm3                 ;
        paddw           xmm6,           xmm1                 ;

        pmaddwd         xmm1,           xmm1                 ;
        paddd           xmm7,           xmm1                 ;

        movdqa          xmm1,           xmm5                 ;
        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line

        sub             rcx,            1                   ;
        jnz             filter_block2d_bil_sp_only_loop       ;

        jmp             filter_block2d_bil_variance

filter_block2d_bil_var_sse2_full_pixel:
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
        pxor            xmm0,           xmm0                 ;

filter_block2d_bil_full_pixel_loop:
        movq            xmm1,           QWORD PTR [rsi]               ;
        punpcklbw       xmm1,           xmm0                 ;

        movq            xmm2,           QWORD PTR [rdi]               ;
        punpcklbw       xmm2,           xmm0                 ;

        psubw           xmm1,           xmm2                 ;
        paddw           xmm6,           xmm1                 ;

        pmaddwd         xmm1,           xmm1                 ;
        paddd           xmm7,           xmm1                 ;

        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line

        sub             rcx,            1                   ;
        jnz             filter_block2d_bil_full_pixel_loop       ;

        jmp             filter_block2d_bil_variance

filter_block2d_bil_var_sse2_fp_only:
        ; rax still points at HFilter here; rdx is reused for the ref stride
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line

        pxor            xmm0,           xmm0                 ;
        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line

filter_block2d_bil_fp_only_loop:
        movq            xmm1,           QWORD PTR [rsi]       ;
        movq            xmm3,           QWORD PTR [rsi+1]     ;

        punpcklbw       xmm1,           xmm0                 ;
        pmullw          xmm1,           [rax]               ;
        punpcklbw       xmm3,           xmm0                 ;
        pmullw          xmm3,           [rax+16]             ;

        paddw           xmm1,           xmm3                 ;
        paddw           xmm1,           xmm4  ;
        psraw           xmm1,           xmm_filter_shift    ;

        movq            xmm3,           QWORD PTR [rdi]     ;
        punpcklbw       xmm3,           xmm0                 ;

        psubw           xmm1,           xmm3                 ;
        paddw           xmm6,           xmm1                 ;

        pmaddwd         xmm1,           xmm1                 ;
        paddd           xmm7,           xmm1                 ;
        lea             rsi,            [rsi + rdx]
        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line

        sub             rcx,            1                   ;
        jnz             filter_block2d_bil_fp_only_loop       ;

        ; falls through into the shared reduction below

filter_block2d_bil_variance:
        ; split each 128-bit accumulator into two 64-bit halves and add them
        movdq2q         mm6,            xmm6                ;
        movdq2q         mm7,            xmm7                ;

        psrldq          xmm6,           8
        psrldq          xmm7,           8

        movdq2q         mm2,            xmm6
        movdq2q         mm3,            xmm7

        paddw           mm6,            mm2
        paddd           mm7,            mm3

        pxor            mm3,            mm3                 ;
        pxor            mm2,            mm2                 ;

        ; put each word sum in the upper half of a dword so the final
        ; psrad 16 yields a sign-extended total
        punpcklwd       mm2,            mm6                 ;
        punpckhwd       mm3,            mm6                 ;

        paddd           mm2,            mm3                 ;
        movq            mm6,            mm2                 ;

        psrlq           mm6,            32                  ;
        paddd           mm2,            mm6                 ;

        psrad           mm2,            16                  ;
        movq            mm4,            mm7                 ;

        psrlq           mm4,            32                  ;
        paddd           mm4,            mm7                 ;

        mov             rsi,            arg(7) ; sum
        mov             rdi,            arg(8) ; sumsquared

        movd            [rsi],          mm2    ; xsum
        movd            [rdi],          mm4    ; xxsum

    ; begin epilog
    pop rbx
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
791
792
;void vp8_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
; For an 8-pixel-wide block of Height rows, averages each reference pixel
; with its right neighbour (pavgb), averages that result with the same
; average taken from the row below, and stores
;   *sum        = sum of (averaged_ref - src)
;   *sumsquared = sum of (averaged_ref - src)^2
; Reads 9 bytes per reference row and Height + 1 reference rows.
;
; xmm6/xmm7 are used as accumulators, so the routine is bracketed with
; SAVE_XMM / RESTORE_XMM in the same position as the other routines in this
; file that use those registers (they are callee-saved on Win64).
; NOTE(review): the final reduction uses MMX registers and no emms is issued
; in this routine; presumably the caller clears the MMX state -- confirm.
global sym(vp8_half_horiz_vert_variance8x_h_sse2)
sym(vp8_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  error accumulator
        pxor            xmm7,           xmm7                ;  sse accumulator
        mov             rsi,            arg(0) ;ref_ptr              ;

        mov             rdi,            arg(2) ;src_ptr              ;
        movsxd          rcx,            dword ptr arg(4) ;Height              ;

        pxor            xmm0,           xmm0                ;

        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s7
        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s8
        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
%else
        add             rsi, r8
%endif

vp8_half_horiz_vert_variance8x_h_1:

        movq            xmm1,           QWORD PTR [rsi]     ;
        movq            xmm2,           QWORD PTR [rsi+1]   ;
        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1

        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two lines
        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above

        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences

        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
%else
        add             rsi, r8
        add             rdi, r9
%endif

        sub             rcx,            1                   ;
        jnz             vp8_half_horiz_vert_variance8x_h_1     ;

        ; split each 128-bit accumulator into two 64-bit halves and add them
        movdq2q         mm6,            xmm6                ;
        movdq2q         mm7,            xmm7                ;

        psrldq          xmm6,           8
        psrldq          xmm7,           8

        movdq2q         mm2,            xmm6
        movdq2q         mm3,            xmm7

        paddw           mm6,            mm2
        paddd           mm7,            mm3

        pxor            mm3,            mm3                 ;
        pxor            mm2,            mm2                 ;

        ; put each word sum in the upper half of a dword so the final
        ; psrad 16 yields a sign-extended total
        punpcklwd       mm2,            mm6                 ;
        punpckhwd       mm3,            mm6                 ;

        paddd           mm2,            mm3                 ;
        movq            mm6,            mm2                 ;

        psrlq           mm6,            32                  ;
        paddd           mm2,            mm6                 ;

        psrad           mm2,            16                  ;
        movq            mm4,            mm7                 ;

        psrlq           mm4,            32                  ;
        paddd           mm4,            mm7                 ;

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          mm2                 ;
        movd            [rdi],          mm4                 ;


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
912
;void vp8_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Works on a block 16 pixels wide and Height rows tall.  Every reference row
; is averaged (pavgb, rounding up) with itself shifted one byte to the right;
; that row is then averaged with the same quantity computed for the row
; below.  The src pixels are subtracted from this prediction and
;     *sum        = total of (prediction - src)
;     *sumsquared = total of (prediction - src)^2
; are stored on exit.
;
; Reads Height+1 reference rows and 17 bytes from each of them.
; Height must be non-zero: the row counter is decremented before it is tested.
;
; Register roles inside the loop:
;     rsi / rdi   current ref / src row
;     rax / rdx   ref / src stride in bytes (sign-extended)
;     rcx         rows still to process
;     xmm0        all zero (byte -> word unpacking)
;     xmm5        horizontal average of the previous ref row
;     xmm6        eight signed 16-bit partial sums of the differences
;     xmm7        four 32-bit partial sums of the squared differences
global sym(vp8_half_horiz_vert_variance16x_h_sse2)
sym(vp8_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,            arg(0)              ; ref_ptr
        mov             rdi,            arg(2)              ; src_ptr
        movsxd          rax,            dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ; src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4)    ; Height

        pxor            xmm0,           xmm0                ; constant zero
        pxor            xmm6,           xmm6                ; difference sums
        pxor            xmm7,           xmm7                ; squared-difference sums

        ; Prime xmm5 with the horizontal average of the first ref row.
        movdqu          xmm5,           XMMWORD PTR [rsi]
        movdqu          xmm3,           XMMWORD PTR [rsi+1]
        pavgb           xmm5,           xmm3
        add             rsi,            rax                 ; flags are rewritten before use

.next_row:
        ; Horizontal average of the row below.
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        pavgb           xmm1,           xmm2

        ; Vertical average of the two horizontally averaged rows.
        pavgb           xmm5,           xmm1

        ; Widen the 16 predicted bytes to words: xmm5 = 0..7, xmm4 = 8..15.
        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0
        punpckhbw       xmm4,           xmm0

        ; Widen the 16 src bytes the same way (xmm2 is free again here).
        movq            xmm3,           QWORD PTR [rdi]
        movq            xmm2,           QWORD PTR [rdi+8]
        punpcklbw       xmm3,           xmm0
        punpcklbw       xmm2,           xmm0

        psubw           xmm5,           xmm3                ; prediction - src, pixels 0..7
        psubw           xmm4,           xmm2                ; prediction - src, pixels 8..15

        paddw           xmm6,           xmm5
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ; squares, summed in adjacent pairs
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm1                ; this row becomes the "previous" row

        add             rsi,            rax
        add             rdi,            rdx
        dec             rcx
        jnz             .next_row

        ; Sign-extend the eight word sums to dwords.  xmm0 is still zero, so
        ; interleaving puts each word in the upper half of a dword and the
        ; arithmetic shift brings it back down with its sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ; four dword difference sums

        ; Fold both four-lane accumulators down into their lowest dword.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ; lane 0 = sum of differences
        paddd           xmm7,           xmm6                ; lane 0 = sum of squares

        mov             rsi,            arg(5)              ; sum
        mov             rdi,            arg(6)              ; sumsquared
        movd            [rsi],          xmm0
        movd            [rdi],          xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1027
1028
;void vp8_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Works on a block 8 pixels wide and Height rows tall.  Each reference row is
; averaged (pavgb, rounding up) with the row directly below it, the src
; pixels are subtracted from that prediction and
;     *sum        = total of (prediction - src)
;     *sumsquared = total of (prediction - src)^2
; are stored on exit.
;
; Reads Height+1 reference rows, 8 bytes from each.
; Height must be non-zero: the row counter is decremented before it is tested.
;
; Only SSE2 registers are touched, so the x87/MMX state of the caller is left
; alone and no emms is needed afterwards.
global sym(vp8_half_vert_variance8x_h_sse2)
sym(vp8_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    ; xmm6 and xmm7 are the accumulators below; they are callee-saved on
    ; Win64, so preserve them exactly as the 16-pixel-wide variants do.
    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  difference accumulator (8 signed words)
        pxor            xmm7,           xmm7                ;  squared-difference accumulator (4 dwords)
        mov             rsi,            arg(0)              ;ref_ptr

        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rcx,            dword ptr arg(4)    ;Height
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  constant zero for unpacking
vp8_half_vert_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = ref row i,   pixels 0..7
        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = ref row i+1, pixels 0..7

        pavgb           xmm5,           xmm3                ;  xmm5 = vertical average of the two rows
        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src pixels 0..7
        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate column differences
        pmaddwd         xmm5,           xmm5                ;  squares, summed in adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

        lea             rsi,            [rsi + rax]         ;  next ref row
        lea             rdi,            [rdi + rdx]         ;  next src row

        sub             rcx,            1
        jnz             vp8_half_vert_variance8x_h_1

        ; Sign-extend the eight word sums to dwords before adding them, so
        ; the total is not limited to 16 bits.  xmm0 is still zero:
        ; interleaving puts each word in the upper half of a dword and the
        ; arithmetic shift brings it back down with its sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ;  four dword difference sums

        ; Fold both four-lane accumulators down into their lowest dword.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ;  lane 0 = sum of differences
        paddd           xmm7,           xmm6                ;  lane 0 = sum of squares

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          xmm0
        movd            [rdi],          xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1133
;void vp8_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Works on a block 16 pixels wide and Height rows tall.  Each reference row
; is averaged (pavgb, rounding up) with the row directly below it, the src
; pixels are subtracted from that prediction and
;     *sum        = total of (prediction - src)
;     *sumsquared = total of (prediction - src)^2
; are stored on exit.
;
; Reads Height+1 reference rows, 16 bytes from each.
; Height must be non-zero: the row counter is decremented before it is tested.
;
; Register roles inside the loop:
;     rsi / rdi   ref row below the current one / current src row
;     rax / rdx   ref / src stride in bytes (sign-extended)
;     rcx         rows still to process
;     xmm0        all zero (byte -> word unpacking)
;     xmm5        raw bytes of the current ref row
;     xmm6        eight signed 16-bit partial sums of the differences
;     xmm7        four 32-bit partial sums of the squared differences
global sym(vp8_half_vert_variance16x_h_sse2)
sym(vp8_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,            arg(0)              ; ref_ptr
        mov             rdi,            arg(2)              ; src_ptr
        movsxd          rax,            dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ; src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4)    ; Height

        pxor            xmm0,           xmm0                ; constant zero
        pxor            xmm6,           xmm6                ; difference sums
        pxor            xmm7,           xmm7                ; squared-difference sums

        movdqu          xmm5,           XMMWORD PTR [rsi]   ; first ref row
        add             rsi,            rax                 ; flags are rewritten before use

.next_row:
        movdqu          xmm3,           XMMWORD PTR [rsi]   ; ref row below

        pavgb           xmm5,           xmm3                ; vertical average of the two rows

        ; Widen the 16 predicted bytes to words: xmm5 = 0..7, xmm4 = 8..15.
        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0
        punpckhbw       xmm4,           xmm0

        ; Widen the 16 src bytes the same way.
        movq            xmm2,           QWORD PTR [rdi]
        movq            xmm1,           QWORD PTR [rdi+8]
        punpcklbw       xmm2,           xmm0
        punpcklbw       xmm1,           xmm0

        psubw           xmm5,           xmm2                ; prediction - src, pixels 0..7
        psubw           xmm4,           xmm1                ; prediction - src, pixels 8..15

        paddw           xmm6,           xmm5
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ; squares, summed in adjacent pairs
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm3                ; lower row becomes the current row

        add             rsi,            rax
        add             rdi,            rdx
        dec             rcx
        jnz             .next_row

        ; Sign-extend the eight word sums to dwords.  xmm0 is still zero, so
        ; interleaving puts each word in the upper half of a dword and the
        ; arithmetic shift brings it back down with its sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ; four dword difference sums

        ; Fold both four-lane accumulators down into their lowest dword.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ; lane 0 = sum of differences
        paddd           xmm7,           xmm6                ; lane 0 = sum of squares

        mov             rsi,            arg(5)              ; sum
        mov             rdi,            arg(6)              ; sumsquared
        movd            [rsi],          xmm0
        movd            [rdi],          xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1240
1241
;void vp8_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Works on a block 8 pixels wide and Height rows tall.  Each reference row is
; averaged (pavgb, rounding up) with itself shifted one byte to the right,
; the src pixels are subtracted from that prediction and
;     *sum        = total of (prediction - src)
;     *sumsquared = total of (prediction - src)^2
; are stored on exit.
;
; Reads Height reference rows, 9 bytes from each.
; Height must be non-zero: the row counter is decremented before it is tested.
;
; Only SSE2 registers are touched, so the x87/MMX state of the caller is left
; alone and no emms is needed afterwards.
global sym(vp8_half_horiz_variance8x_h_sse2)
sym(vp8_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    ; xmm6 and xmm7 are the accumulators below; they are callee-saved on
    ; Win64, so preserve them exactly as the 16-pixel-wide variants do.
    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  difference accumulator (8 signed words)
        pxor            xmm7,           xmm7                ;  squared-difference accumulator (4 dwords)
        mov             rsi,            arg(0)              ;ref_ptr

        mov             rdi,            arg(2)              ;src_ptr
        movsxd          rcx,            dword ptr arg(4)    ;Height
        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  constant zero for unpacking
vp8_half_horiz_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = ref pixels 0..7
        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = ref pixels 1..8

        pavgb           xmm5,           xmm3                ;  xmm5 = horizontal average
        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src pixels 0..7
        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate column differences
        pmaddwd         xmm5,           xmm5                ;  squares, summed in adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

        lea             rsi,            [rsi + rax]         ;  next ref row
        lea             rdi,            [rdi + rdx]         ;  next src row

        sub             rcx,            1
        jnz             vp8_half_horiz_variance8x_h_1

        ; Sign-extend the eight word sums to dwords before adding them, so
        ; the total is not limited to 16 bits.  xmm0 is still zero:
        ; interleaving puts each word in the upper half of a dword and the
        ; arithmetic shift brings it back down with its sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ;  four dword difference sums

        ; Fold both four-lane accumulators down into their lowest dword.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ;  lane 0 = sum of differences
        paddd           xmm7,           xmm6                ;  lane 0 = sum of squares

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd            [rsi],          xmm0
        movd            [rdi],          xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1344
;void vp8_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Works on a block 16 pixels wide and Height rows tall.  Each reference row
; is averaged (pavgb, rounding up) with itself shifted one byte to the right,
; the src pixels are subtracted from that prediction and
;     *sum        = total of (prediction - src)
;     *sumsquared = total of (prediction - src)^2
; are stored on exit.
;
; Reads Height reference rows, 17 bytes from each.
; Height must be non-zero: the row counter is decremented before it is tested.
;
; Register roles inside the loop:
;     rsi / rdi   current ref / src row
;     rax / rdx   ref / src stride in bytes (sign-extended)
;     rcx         rows still to process
;     xmm0        all zero (byte -> word unpacking)
;     xmm6        eight signed 16-bit partial sums of the differences
;     xmm7        four 32-bit partial sums of the squared differences
global sym(vp8_half_horiz_variance16x_h_sse2)
sym(vp8_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,            arg(0)              ; ref_ptr
        mov             rdi,            arg(2)              ; src_ptr
        movsxd          rax,            dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ; src_pixels_per_line
        movsxd          rcx,            dword ptr arg(4)    ; Height

        pxor            xmm0,           xmm0                ; constant zero
        pxor            xmm6,           xmm6                ; difference sums
        pxor            xmm7,           xmm7                ; squared-difference sums

.next_row:
        movdqu          xmm5,           XMMWORD PTR [rsi]   ; ref pixels 0..15
        movdqu          xmm3,           XMMWORD PTR [rsi+1] ; ref pixels 1..16
        pavgb           xmm5,           xmm3                ; horizontal average

        ; Widen the 16 predicted bytes to words: xmm5 = 0..7, xmm4 = 8..15.
        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0
        punpckhbw       xmm4,           xmm0

        ; Widen the 16 src bytes the same way.
        movq            xmm3,           QWORD PTR [rdi]
        movq            xmm2,           QWORD PTR [rdi+8]
        punpcklbw       xmm3,           xmm0
        punpcklbw       xmm2,           xmm0

        psubw           xmm5,           xmm3                ; prediction - src, pixels 0..7
        psubw           xmm4,           xmm2                ; prediction - src, pixels 8..15

        paddw           xmm6,           xmm5
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ; squares, summed in adjacent pairs
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5
        paddd           xmm7,           xmm4

        add             rsi,            rax
        add             rdi,            rdx
        dec             rcx
        jnz             .next_row

        ; Sign-extend the eight word sums to dwords.  xmm0 is still zero, so
        ; interleaving puts each word in the upper half of a dword and the
        ; arithmetic shift brings it back down with its sign.
        pxor            xmm1,           xmm1
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ; four dword difference sums

        ; Fold both four-lane accumulators down into their lowest dword.
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           8
        psrldq          xmm6,           8
        paddd           xmm0,           xmm1
        paddd           xmm7,           xmm6
        movdqa          xmm1,           xmm0
        movdqa          xmm6,           xmm7
        psrldq          xmm1,           4
        psrldq          xmm6,           4
        paddd           xmm0,           xmm1                ; lane 0 = sum of differences
        paddd           xmm7,           xmm6                ; lane 0 = sum of squares

        mov             rsi,            arg(5)              ; sum
        mov             rdi,            arg(6)              ; sumsquared
        movd            [rsi],          xmm0
        movd            [rdi],          xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
1448
SECTION_RODATA

; short xmm_bi_rd[8]: eight 16-bit words, each 64
; (64 == 1 << (xmm_filter_shift - 1); presumably the rounding term added
; before the filter shift -- the code that uses it is not in this part of
; the file).
align 16
xmm_bi_rd:
    times 8 dw 64

; short vp8_bilinear_filters_sse2[8][16]: one 32-byte row per filter index.
; Row i holds eight copies of the first tap (128 - 16*i) followed by eight
; copies of the second tap (16*i); the two taps of every row add up to 128.
align 16
vp8_bilinear_filters_sse2:
    times 8 dw 128
    times 8 dw 0

    times 8 dw 112
    times 8 dw 16

    times 8 dw 96
    times 8 dw 32

    times 8 dw 80
    times 8 dw 48

    times 8 dw 64
    times 8 dw 64

    times 8 dw 48
    times 8 dw 80

    times 8 dw 32
    times 8 dw 96

    times 8 dw 16
    times 8 dw 112
1464