1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of the 256 consecutive 16-bit values that
; start at src_ptr (8 loop iterations, 64 bytes each).
; src_ptr is read with movdqa, so it must be 16-byte aligned.
; The prolog/epilog macros come from x86_abi_support.asm; their exact
; expansion is not visible here.
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog


        mov         rax, arg(0) ;[src_ptr]
        mov         rcx, 8                      ; iterations remaining
        pxor        xmm4, xmm4                  ; four dword partial sums

.NEXTROW:
        movdqa      xmm0, [rax]                 ; 4 x 8 shorts per iteration
        movdqa      xmm1, [rax+16]
        movdqa      xmm2, [rax+32]
        movdqa      xmm3, [rax+48]
        pmaddwd     xmm0, xmm0                  ; square and add adjacent pairs
        pmaddwd     xmm1, xmm1
        pmaddwd     xmm2, xmm2
        pmaddwd     xmm3, xmm3

        paddd       xmm0, xmm1
        paddd       xmm2, xmm3
        paddd       xmm4, xmm0
        paddd       xmm4, xmm2

        add         rax, 0x40
        ; Loop on ZF only.  The previous "dec rcx / ja" also tested CF,
        ; which dec does not write, so termination depended on the carry
        ; left behind by the pointer add above.
        sub         rcx, 1
        jnz         .NEXTROW

        ; Horizontal add of the four dword lanes into lane 0.
        movdqa      xmm3,xmm4
        psrldq      xmm4,8
        paddd       xmm4,xmm3
        movdqa      xmm3,xmm4
        psrldq      xmm4,4
        paddd       xmm4,xmm3
        ; Only the low dword is the total; the second dword that movq
        ; also copies is a leftover partial sum.  The prototype returns
        ; a 32-bit unsigned int.
        movq        rax,xmm4


    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
70
71
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For a 16-row by 16-byte block, writes
;   *Sum = sum of (src - ref) over all 256 bytes
;   *SSE = sum of (src - ref)^2 over all 256 bytes
; Rows are loaded with movdqu, so neither pointer needs alignment.
; NOTE(review): rax is not set to a result; on exit it holds the Sum
; pointer, so the "unsigned int" return looks unused - confirm with callers.
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push rbx
    push rsi
    push rdi
    ; end prolog

        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        ; Prefetch the first eight rows of the source block.
        lea             rcx,    [rax+rax*2]             ; 3 * source_stride
        prefetcht0      [rsi]
        prefetcht0      [rsi+rax]
        prefetcht0      [rsi+rax*2]
        prefetcht0      [rsi+rcx]
        lea             rbx,    [rsi+rax*4]             ; row 4 of source
        prefetcht0      [rbx]
        prefetcht0      [rbx+rax]
        prefetcht0      [rbx+rax*2]
        prefetcht0      [rbx+rcx]

        ; Prefetch the first eight rows of the reference block.
        lea             rcx,    [rdx+rdx*2]             ; 3 * recon_stride
        prefetcht0      [rdi]
        prefetcht0      [rdi+rdx]
        prefetcht0      [rdi+rdx*2]
        prefetcht0      [rdi+rcx]
        lea             rbx,    [rdi+rdx*4]             ; row 4 of reference
        prefetcht0      [rbx]
        prefetcht0      [rbx+rdx]
        prefetcht0      [rbx+rdx*2]
        prefetcht0      [rbx+rcx]

        pxor        xmm0,           xmm0        ; zero, used to widen bytes
        pxor        xmm6,           xmm6        ; dword lanes: running SSE
        pxor        xmm7,           xmm7        ; word lanes: running diff sum
        mov         rcx,            16          ; rows remaining

.row_loop:
        movdqu      xmm1,           XMMWORD PTR [rsi]
        movdqu      xmm2,           XMMWORD PTR [rdi]

        prefetcht0      [rsi+rax*8]             ; eight rows ahead
        prefetcht0      [rdi+rdx*8]

        movdqa      xmm3,           xmm1
        movdqa      xmm4,           xmm2

        ; low eight bytes of the row -> words, then src - ref
        punpcklbw   xmm1,           xmm0
        punpcklbw   xmm2,           xmm0
        psubw       xmm1,           xmm2

        ; high eight bytes of the row -> words, then src - ref
        punpckhbw   xmm3,           xmm0
        punpckhbw   xmm4,           xmm0
        psubw       xmm3,           xmm4

        ; accumulate the differences before they are squared in place
        paddw       xmm7,           xmm1
        paddw       xmm7,           xmm3

        pmaddwd     xmm1,           xmm1
        pmaddwd     xmm3,           xmm3
        paddd       xmm6,           xmm1
        paddd       xmm6,           xmm3

        add         rsi,            rax
        add         rdi,            rdx

        sub         rcx,            1
        jnz         .row_loop

        ; Sign-extend the eight word sums to dwords: interleave them into
        ; the upper half of each dword, then arithmetic-shift back down.
        pxor        xmm1,           xmm1
        pxor        xmm2,           xmm2
        punpcklwd   xmm1,           xmm7
        punpckhwd   xmm2,           xmm7
        psrad       xmm1,           16
        psrad       xmm2,           16
        paddd       xmm1,           xmm2

        ; Fold the four dword diff sums into lane 0.
        movdqa      xmm2,           xmm1
        psrldq      xmm2,           8
        paddd       xmm1,           xmm2
        movdqa      xmm2,           xmm1
        psrldq      xmm2,           4
        paddd       xmm1,           xmm2

        ; Fold the four dword SSE sums into lane 0.
        movdqa      xmm3,           xmm6
        psrldq      xmm3,           8
        paddd       xmm6,           xmm3
        movdqa      xmm3,           xmm6
        psrldq      xmm3,           4
        paddd       xmm6,           xmm3

        mov         rax,            arg(5) ;[Sum]
        mov         rdi,            arg(4) ;[SSE]

        movd DWORD PTR [rax],       xmm1
        movd DWORD PTR [rdi],       xmm6


    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
212
213
214
215
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For an 8-row by 8-byte block, writes
;   *Sum = sum of (src - ref) over all 64 bytes
;   *SSE = sum of (src - ref)^2 over all 64 bytes
; NOTE(review): rax is not set to a result; on exit it holds the Sum
; pointer, so the "unsigned int" return looks unused - confirm with callers.
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]

        mov         rsi,            arg(0) ;[src_ptr]
        mov         rdi,            arg(2) ;[ref_ptr]

        pxor        xmm0,           xmm0        ; zero, used to widen bytes
        pxor        xmm1,           xmm1        ; dword lanes: running SSE
        pxor        xmm7,           xmm7        ; word lanes: running diff sum
        mov         rcx,            8           ; rows remaining

.row_loop:
        movq        xmm2,           QWORD PTR [rsi]
        movq        xmm3,           QWORD PTR [rdi]

        punpcklbw   xmm2,           xmm0        ; eight bytes -> eight words
        punpcklbw   xmm3,           xmm0

        psubsw      xmm2,           xmm3        ; src - ref, within +/-255
        paddw       xmm7,           xmm2

        pmaddwd     xmm2,           xmm2
        paddd       xmm1,           xmm2

        add         rsi,            rax
        add         rdi,            rdx

        sub         rcx,            1
        jnz         .row_loop

        ; Fold the eight word diff sums into word 0.  Each lane is at
        ; most 8 * 255 in magnitude, so the 16-bit total cannot overflow.
        movdqa      xmm6,           xmm7
        psrldq      xmm6,           8
        paddw       xmm7,           xmm6
        movdqa      xmm6,           xmm7
        psrldq      xmm6,           4
        paddw       xmm7,           xmm6
        movdqa      xmm6,           xmm7
        psrldq      xmm6,           2
        paddw       xmm7,           xmm6

        ; Fold the four dword SSE sums into lane 0.
        movdqa      xmm2,           xmm1
        psrldq      xmm2,           8
        paddd       xmm1,           xmm2
        movdqa      xmm2,           xmm1
        psrldq      xmm2,           4
        paddd       xmm1,           xmm2

        mov         rax,            arg(5) ;[Sum]
        mov         rdi,            arg(4) ;[SSE]

        movq        rdx,            xmm7
        movsx       rcx,            dx          ; sign-extend the 16-bit sum

        mov  dword ptr [rax],       ecx
        movd DWORD PTR [rdi],       xmm1

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
400
;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-byte-wide block of Height rows, averages each reference byte
; with its right neighbour (pavgb), averages that with the same result
; from the row below (pavgb), subtracts the src row, and writes
;   *sum        = sum of the differences
;   *sumsquared = sum of the squared differences
; Reads 9 bytes from each of Height + 1 reference rows.
; Height must be non-zero: the loop tests its counter only after the
; first row.  The 16-bit difference lanes can overflow for Height > 128.
; The final reduction uses SSE2 only, so no MMX register is touched and
; no EMMS is required by this routine.
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  word lanes: difference accumulator
        pxor            xmm7,           xmm7                ;  dword lanes: squared-difference accumulator
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes

        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = ref bytes 0..7 of row 0
        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = ref bytes 1..8 of row 0
        pavgb           xmm5,           xmm3                ;  xmm5 = horizontal average of row 0

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
%else
        add             rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:

        movq            xmm1,           QWORD PTR [rsi]     ;  ref bytes 0..7 of the next row
        movq            xmm2,           QWORD PTR [rsi+1]   ;  ref bytes 1..8 of the next row
        pavgb           xmm1,           xmm2                ;  xmm1 = horizontal average of the next row

        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the two rows
        punpcklbw       xmm5,           xmm0                ;  widen to words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src bytes 0..7
        punpcklbw       xmm3,           xmm0                ;  widen to words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

        movdqa          xmm5,           xmm1                ;  this row's horizontal average is the next row's top

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif

        sub             rcx,            1
        jnz             .half_horiz_vert_variance8x_h_1

        ; Sign-extend the eight word sums to dwords: interleave them into
        ; the upper half of each dword, then arithmetic-shift back down.
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6
        punpckhwd       xmm2,           xmm6
        psrad           xmm1,           16
        psrad           xmm2,           16
        paddd           xmm1,           xmm2

        ; Fold the four dword difference sums into lane 0.
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2

        ; Fold the four dword squared-difference sums into lane 0.
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd DWORD PTR [rsi],           xmm1
        movd DWORD PTR [rdi],           xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
522
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-byte-wide block of Height rows, averages each reference row
; with the row below it (pavgb), subtracts the src row, and writes
;   *sum        = sum of the differences
;   *sumsquared = sum of the squared differences
; Reads 8 bytes from each of Height + 1 reference rows.
; Height must be non-zero: the loop tests its counter only after the
; first row.  The 16-bit difference lanes can overflow for Height > 128.
; The final reduction uses SSE2 only, so no MMX register is touched and
; no EMMS is required by this routine.
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  word lanes: difference accumulator
        pxor            xmm7,           xmm7                ;  dword lanes: squared-difference accumulator
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line, offset to the row below

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes
.half_vert_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = ref bytes 0..7 of this row
        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = ref bytes 0..7 of the row below

        pavgb           xmm5,           xmm3                ;  xmm5 = vertical average of the two rows
        punpcklbw       xmm5,           xmm0                ;  widen to words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src bytes 0..7
        punpcklbw       xmm3,           xmm0                ;  widen to words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif

        sub             rcx,            1
        jnz             .half_vert_variance8x_h_1

        ; Sign-extend the eight word sums to dwords: interleave them into
        ; the upper half of each dword, then arithmetic-shift back down.
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6
        punpckhwd       xmm2,           xmm6
        psrad           xmm1,           16
        psrad           xmm2,           16
        paddd           xmm1,           xmm2

        ; Fold the four dword difference sums into lane 0.
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2

        ; Fold the four dword squared-difference sums into lane 0.
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd DWORD PTR [rsi],           xmm1
        movd DWORD PTR [rdi],           xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
629
630
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-byte-wide block of Height rows, averages each reference byte
; with its right neighbour (pavgb), subtracts the src row, and writes
;   *sum        = sum of the differences
;   *sumsquared = sum of the squared differences
; Reads 9 bytes from each of Height reference rows.
; Height must be non-zero: the loop tests its counter only after the
; first row.  The 16-bit difference lanes can overflow for Height > 128.
; The final reduction uses SSE2 only, so no MMX register is touched and
; no EMMS is required by this routine.
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

        pxor            xmm6,           xmm6                ;  word lanes: difference accumulator
        pxor            xmm7,           xmm7                ;  dword lanes: squared-difference accumulator
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height

        pxor            xmm0,           xmm0                ;  zero, used to widen bytes
.half_horiz_variance8x_h_1:
        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = ref bytes 0..7
        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = ref bytes 1..8

        pavgb           xmm5,           xmm3                ;  xmm5 = horizontal average
        punpcklbw       xmm5,           xmm0                ;  widen to words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = src bytes 0..7
        punpcklbw       xmm3,           xmm0                ;  widen to words

        psubw           xmm5,           xmm3                ;  xmm5 = prediction - src
        paddw           xmm6,           xmm5                ;  accumulate differences
        pmaddwd         xmm5,           xmm5                ;  square and add adjacent pairs
        paddd           xmm7,           xmm5                ;  accumulate squared differences

%if ABI_IS_32BIT
        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next ref row
        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next src row
%else
        add             rsi, r8
        add             rdi, r9
%endif
        sub             rcx,            1
        jnz             .half_horiz_variance8x_h_1

        ; Sign-extend the eight word sums to dwords: interleave them into
        ; the upper half of each dword, then arithmetic-shift back down.
        pxor            xmm1,           xmm1
        pxor            xmm2,           xmm2
        punpcklwd       xmm1,           xmm6
        punpckhwd       xmm2,           xmm6
        psrad           xmm1,           16
        psrad           xmm2,           16
        paddd           xmm1,           xmm2

        ; Fold the four dword difference sums into lane 0.
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           8
        paddd           xmm1,           xmm2
        movdqa          xmm2,           xmm1
        psrldq          xmm2,           4
        paddd           xmm1,           xmm2

        ; Fold the four dword squared-difference sums into lane 0.
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           8
        paddd           xmm7,           xmm3
        movdqa          xmm3,           xmm7
        psrldq          xmm3,           4
        paddd           xmm7,           xmm3

        mov             rsi,            arg(5) ; sum
        mov             rdi,            arg(6) ; sumsquared

        movd DWORD PTR [rsi],           xmm1
        movd DWORD PTR [rdi],           xmm7


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
735