1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; Bytes reserved below the return address by the Win64 variant of
; STACK_FRAME_CREATE_X3: two 16-byte save slots (xmm6, xmm7) plus 8 bytes of
; padding.  With the 8-mod-16 entry alignment the Microsoft x64 convention
; provides, the padding leaves rsp 16-byte aligned.
%define WIN64_X3_XMM_SPILL 40

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Prologue shared by the single-reference and "x3" SAD entry points.  It
; binds the symbolic names src_ptr, src_stride, ref_ptr, ref_stride,
; end_ptr, ret_var, result_ptr and max_err to registers / stack slots for
; the ABI being assembled.  result_ptr and max_err both name the fifth
; argument.
;
;   32-bit      : builds an rbp frame, saves rsi/rdi/rbx and loads the four
;                 leading arguments from the stack.
;   Win64 (x64) : arguments stay in rcx/rdx/r8/r9.  xmm6 and xmm7 are
;                 callee-saved under the Microsoft x64 convention and are
;                 overwritten by the kernels in this file, so they are
;                 spilled here and reloaded by STACK_FRAME_DESTROY_X3.
;                 The fifth argument is read from the caller's stack, past
;                 the spill area, the return address and the 4*8 bytes of
;                 register home space.
;   SysV 64-bit : arguments stay in rdi/rsi/rdx/rcx/r8.
;
; On both 64-bit ABIs the strides arrive as 32-bit ints and neither ABI
; requires the caller to extend them to 64 bits, so they are sign-extended
; here before being used in address arithmetic.
;
; The Win64 variant moves rsp; code between CREATE and DESTROY must leave
; rsp unchanged and must reach stack arguments only through result_ptr /
; max_err.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %ifidn __OUTPUT_FORMAT__,x64
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+WIN64_X3_XMM_SPILL+8+4*8]
    %define     max_err     [rsp+WIN64_X3_XMM_SPILL+8+4*8]

    sub         rsp,        WIN64_X3_XMM_SPILL
    movdqu      [rsp],      xmm6                ; callee-saved on Win64
    movdqu      [rsp+16],   xmm7                ; callee-saved on Win64

    movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_err     r8

    movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Epilogue matching STACK_FRAME_CREATE_X3.  Resets the symbolic names to
; empty definitions, undoes whatever the prologue pushed or reserved for
; the current ABI, and returns.  rax is not touched, so a value placed
; there by the function body is what the caller receives.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_err

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
    movdqu      xmm6,       [rsp]               ; restore callee-saved xmm6
    movdqu      xmm7,       [rsp+16]            ; restore callee-saved xmm7
    add         rsp,        WIN64_X3_XMM_SPILL
  %endif
%endif
    ret
%endmacro
79
; Bytes reserved by the Win64 variant of STACK_FRAME_CREATE_X4 for the
; xmm6/xmm7 save slots.  The preceding "push rsi" already leaves rsp 16-byte
; aligned for an ABI-conforming caller, so no padding is added.
%define WIN64_X4_XMM_SPILL 32

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X4
;
; Prologue shared by the "x4d" SAD entry points.  It binds src_ptr,
; src_stride, r0_ptr..r3_ptr, ref_stride and result_ptr for the ABI being
; assembled, and fills r0_ptr..r3_ptr from the four-entry address table
; passed as the third argument (via LOAD_X4_ADDRESSES).
;
;   32-bit      : every general register is needed, so rbp itself becomes
;                 ref_stride.  The frame pointer is pushed a second time
;                 first; the function body must "pop rbp" before it
;                 expands result_ptr (arg(4) is rbp-relative).  r2_ptr and
;                 src_stride are loaded into each other's register and
;                 swapped with xchg because rax is occupied by the table
;                 load at that point.
;   Win64 (x64) : rsi is callee-saved and used as r0_ptr, so it is pushed.
;                 xmm6/xmm7 are callee-saved under the Microsoft x64
;                 convention and are overwritten by the kernels, so they
;                 are spilled here and reloaded by STACK_FRAME_DESTROY_X4.
;                 result_ptr skips the spill area, the pushed rsi, the
;                 return address and the 4*8 bytes of register home space.
;   SysV 64-bit : everything stays in registers.
;
; On both 64-bit ABIs the int strides are sign-extended to 64 bits, since
; neither ABI requires the caller to have done so.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    push        rbp                             ; kept so the body can restore rbp
    mov         rdi,        arg(2)              ; ref_ptr_base

    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi   ; r2 lands in rax for now

    mov         rsi,        arg(0)              ; src_ptr

    movsxd      rbx,        dword ptr arg(1)    ; src_stride (in rbx for now)
    movsxd      rbp,        dword ptr arg(3)    ; ref_stride; arg() unusable from here

    xchg        rbx,        rax                 ; rax = src_stride, rbx = r2_ptr
%else
  %ifidn __OUTPUT_FORMAT__,x64
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     r0_ptr      rsi
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      r8
    %define     ref_stride  r9
    %define     result_ptr  [rsp+WIN64_X4_XMM_SPILL+16+4*8]
    push        rsi

    sub         rsp,        WIN64_X4_XMM_SPILL
    movdqu      [rsp],      xmm6                ; callee-saved on Win64
    movdqu      [rsp+16],   xmm7                ; callee-saved on Win64

    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr

    movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     r0_ptr      r9
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      rdx
    %define     ref_stride  rcx
    %define     result_ptr  r8

    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

    movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit

  %endif
%endif
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4
;
; Epilogue matching STACK_FRAME_CREATE_X4.  Resets the symbolic names to
; empty definitions, restores the saved registers for the current ABI and
; returns.  On 32-bit builds the extra rbp pushed by the prologue must
; already have been popped by the function body.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X4 0
  %define     src_ptr
  %define     src_stride
  %define     r0_ptr
  %define     r1_ptr
  %define     r2_ptr
  %define     r3_ptr
  %define     ref_stride
  %define     result_ptr

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %ifidn __OUTPUT_FORMAT__,x64
    movdqu      xmm6,       [rsp]               ; restore callee-saved xmm6
    movdqu      xmm7,       [rsp+16]            ; restore callee-saved xmm7
    add         rsp,        WIN64_X4_XMM_SPILL
    pop         rsi
  %endif
%endif
    ret
%endmacro
158
;-----------------------------------------------------------------------
; PROCESS_16X2X3 phase, src, ref, src_stride, ref_stride
;
; Accumulates the SAD of two 16-byte source rows against three reference
; candidates starting at ref, ref+1 and ref+2.
;
;   %1  phase: 0 = first pair (accumulators are initialised, pointers are
;              advanced), 1 = accumulate and advance, any other value =
;              accumulate without advancing the pointers.
;   %2  register holding the source row address (loaded with movdqa, so
;       each row must be 16-byte aligned)
;   %3  register holding the reference row address (unaligned loads)
;   %4  source stride, %5 reference stride
;
; Accumulators: xmm5 (ref+0), xmm6 (ref+1), xmm7 (ref+2); each keeps one
; partial sum per 64-bit half.  Scratch: xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
        ; ---- first row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2]
%if %1==0
        ; first pair: psadbw straight into the accumulators
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif

        ; ---- second row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3

%if %1==0 || %1==1
        ; step both pointers down two rows for the next invocation
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif
%endmacro
201
;-----------------------------------------------------------------------
; PROCESS_8X2X3 phase, src, ref, src_stride, ref_stride
;
; MMX-register counterpart of PROCESS_16X2X3 for 8-byte-wide rows:
; accumulates the SAD of two source rows against the candidates at ref,
; ref+1 and ref+2.
;
;   %1  phase: 0 = first pair (initialise accumulators, advance pointers),
;              1 = accumulate and advance, any other value = accumulate
;              without advancing.
;   %2  source row address register, %3 reference row address register
;   %4  source stride, %5 reference stride
;
; Accumulators: mm5 (ref+0), mm6 (ref+1), mm7 (ref+2).  Scratch: mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
        ; ---- first row of the pair ----
        movq            mm0,       QWORD PTR [%2]
%if %1==0
        ; first pair: psadbw straight into the accumulators
        movq            mm5,       QWORD PTR [%3]
        movq            mm6,       QWORD PTR [%3+1]
        movq            mm7,       QWORD PTR [%3+2]

        psadbw          mm5,       mm0
        psadbw          mm6,       mm0
        psadbw          mm7,       mm0
%else
        movq            mm1,       QWORD PTR [%3]
        movq            mm2,       QWORD PTR [%3+1]
        movq            mm3,       QWORD PTR [%3+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endif

        ; ---- second row of the pair ----
        movq            mm0,       QWORD PTR [%2+%4]
        movq            mm1,       QWORD PTR [%3+%5]
        movq            mm2,       QWORD PTR [%3+%5+1]
        movq            mm3,       QWORD PTR [%3+%5+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3

%if %1==0 || %1==1
        ; step both pointers down two rows for the next invocation
        lea             %2,        [%2+%4*2]
        lea             %3,        [%3+%5*2]
%endif
%endmacro
244
;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES table, dst0, dst1, dst2, dst3
;
; Loads four consecutive REG_SZ_BYTES-sized entries starting at [table]
; into dst0..dst3.  dst3 is written last, which is what allows callers to
; pass the table register itself as dst3.
;-----------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
        mov             %2,         [%1]
        mov             %3,         [%1+REG_SZ_BYTES]
        mov             %4,         [%1+REG_SZ_BYTES*2]
        mov             %5,         [%1+REG_SZ_BYTES*3]     ; may overwrite %1
%endmacro
252
;-----------------------------------------------------------------------
; PROCESS_16X2X4 phase, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;
; Accumulates the SAD of two 16-byte source rows against four independent
; reference blocks.
;
;   %1      phase: 0 = first pair (initialise accumulators, advance
;                  pointers), 1 = accumulate and advance, any other value =
;                  accumulate without advancing.
;   %2      source row address register (movdqa: rows must be 16-byte
;           aligned)
;   %3-%6   reference row address registers (unaligned loads)
;   %7      source stride, %8 reference stride (shared by all four refs)
;
; Accumulators: xmm4..xmm7 for ref0..ref3, one partial sum per 64-bit
; half.  Scratch: xmm0-xmm3 (xmm1 is reused for the fourth reference).
;-----------------------------------------------------------------------
%macro PROCESS_16X2X4 8
        ; ---- first row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2]
%if %1==0
        ; first pair: psadbw straight into the accumulators
        lddqu           xmm4,       XMMWORD PTR [%3]
        lddqu           xmm5,       XMMWORD PTR [%4]
        lddqu           xmm6,       XMMWORD PTR [%5]
        lddqu           xmm7,       XMMWORD PTR [%6]

        psadbw          xmm4,       xmm0
        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%4]
        lddqu           xmm3,       XMMWORD PTR [%5]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

        lddqu           xmm1,       XMMWORD PTR [%6]        ; xmm1 is free again
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1
%endif

        ; ---- second row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2+%7]
        lddqu           xmm1,       XMMWORD PTR [%3+%8]
        lddqu           xmm2,       XMMWORD PTR [%4+%8]
        lddqu           xmm3,       XMMWORD PTR [%5+%8]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

        lddqu           xmm1,       XMMWORD PTR [%6+%8]     ; xmm1 is free again
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1

%if %1==0 || %1==1
        ; step all five pointers down two rows for the next invocation
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]
        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]
        lea             %6,         [%6+%8*2]
%endif

%endmacro
310
;-----------------------------------------------------------------------
; PROCESS_8X2X4 phase, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;
; MMX-register counterpart of PROCESS_16X2X4 for 8-byte-wide rows:
; accumulates the SAD of two source rows against four independent
; reference blocks.
;
;   %1      phase: 0 = first pair (initialise accumulators, advance
;                  pointers), 1 = accumulate and advance, any other value =
;                  accumulate without advancing.
;   %2      source row address register
;   %3-%6   reference row address registers
;   %7      source stride, %8 reference stride (shared by all four refs)
;
; Accumulators: mm4..mm7 for ref0..ref3.  Scratch: mm0-mm3 (mm1 is reused
; for the fourth reference).
;-----------------------------------------------------------------------
%macro PROCESS_8X2X4 8
        ; ---- first row of the pair ----
        movq            mm0,        QWORD PTR [%2]
%if %1==0
        ; first pair: psadbw straight into the accumulators
        movq            mm4,        QWORD PTR [%3]
        movq            mm5,        QWORD PTR [%4]
        movq            mm6,        QWORD PTR [%5]
        movq            mm7,        QWORD PTR [%6]

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm1,        QWORD PTR [%3]
        movq            mm2,        QWORD PTR [%4]
        movq            mm3,        QWORD PTR [%5]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        paddw           mm5,        mm2
        paddw           mm6,        mm3

        movq            mm1,        QWORD PTR [%6]          ; mm1 is free again
        psadbw          mm1,        mm0
        paddw           mm7,        mm1
%endif

        ; ---- second row of the pair ----
        movq            mm0,        QWORD PTR [%2+%7]
        movq            mm1,        QWORD PTR [%3+%8]
        movq            mm2,        QWORD PTR [%4+%8]
        movq            mm3,        QWORD PTR [%5+%8]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        paddw           mm5,        mm2
        paddw           mm6,        mm3

        movq            mm1,        QWORD PTR [%6+%8]       ; mm1 is free again
        psadbw          mm1,        mm0
        paddw           mm7,        mm1

%if %1==0 || %1==1
        ; step all five pointers down two rows for the next invocation
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]
        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]
        lea             %6,         [%6+%8*2]
%endif

%endmacro
368
;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 16x16 source block against the three reference blocks that
; start at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums are written as
; 32-bit values to results[0], results[1] and results[2].
; Source rows are read with movdqa and must therefore be 16-byte aligned.
global sym(vp8_sad16x16x3_sse3)
sym(vp8_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs = 16 rows: first pair, six middle pairs, last pair
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    %rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    %endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        ; each accumulator carries one partial sum per 64-bit half:
        ; add the halves together and store the low dword

        ; results[0] <- xmm5 (candidate ref_ptr+0)
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1] <- xmm6 (candidate ref_ptr+1)
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2] <- xmm7 (candidate ref_ptr+2)
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
410
;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 16-wide, 8-row source block against the three reference
; blocks that start at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums
; are written as 32-bit values to results[0], results[1] and results[2].
; Source rows are read with movdqa and must therefore be 16-byte aligned.
global sym(vp8_sad16x8x3_sse3)
sym(vp8_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 row pairs = 8 rows: first pair, two middle pairs, last pair
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    %rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    %endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        ; each accumulator carries one partial sum per 64-bit half:
        ; add the halves together and store the low dword

        ; results[0] <- xmm5 (candidate ref_ptr+0)
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1] <- xmm6 (candidate ref_ptr+1)
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2] <- xmm7 (candidate ref_ptr+2)
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
448
;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 8-wide, 16-row source block against the three reference
; blocks that start at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums
; are written as 32-bit values to results[0], results[1] and results[2].
; Works in MMX registers and returns without executing emms.
global sym(vp8_sad8x16x3_sse3)
sym(vp8_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs = 16 rows: first pair, six middle pairs, last pair
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    %rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    %endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        ; pack sums 0 and 1 into one qword: mm5 = { sad(+1) : sad(+0) }
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
477
;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 8x8 source block against the three reference blocks that
; start at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums are written
; as 32-bit values to results[0], results[1] and results[2].
; Works in MMX registers and returns without executing emms.
global sym(vp8_sad8x8x3_sse3)
sym(vp8_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 row pairs = 8 rows: first pair, two middle pairs, last pair
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    %rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    %endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        ; pack sums 0 and 1 into one qword: mm5 = { sad(+1) : sad(+0) }
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
502
;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 4x4 source block against the three reference blocks that
; start at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three sums are written
; as 32-bit values to results[0], results[1] and results[2].
;
; Two 4-byte rows are byte-interleaved into one 8-byte MMX register with
; punpcklbw so that a single psadbw covers a row pair; source and
; reference are interleaved the same way, so the sum is unaffected.
; Works in MMX registers and returns without executing emms.
global sym(vp8_sad4x4x3_sse3)
sym(vp8_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---------------- rows 0 and 1 ----------------
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm2         ; mm0 = source rows 0/1

        ; candidate ref_ptr+0 -> mm1
        movd            mm1,        DWORD PTR [ref_ptr]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw       mm1,        mm3
        psadbw          mm1,        mm0

        ; candidate ref_ptr+1 -> mm4
        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw       mm4,        mm2
        psadbw          mm4,        mm0

        ; candidate ref_ptr+2 -> mm5
        movd            mm5,        DWORD PTR [ref_ptr+2]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw       mm5,        mm3
        psadbw          mm5,        mm0

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; ---------------- rows 2 and 3 ----------------
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm3         ; mm0 = source rows 2/3

        ; candidate ref_ptr+0: total ends up in mm1
        movd            mm2,        DWORD PTR [ref_ptr]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw       mm2,        mm6
        psadbw          mm2,        mm0
        paddw           mm1,        mm2

        ; candidate ref_ptr+1: total ends up in mm3
        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw       mm3,        mm2
        psadbw          mm3,        mm0
        paddw           mm3,        mm4

        ; candidate ref_ptr+2: total ends up in mm7
        movd            mm7,        DWORD PTR [ref_ptr+2]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw       mm7,        mm6
        psadbw          mm7,        mm0
        paddw           mm7,        mm5

        ; ---------------- store ----------------
        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        punpckldq       mm1,        mm3         ; mm1 = { sad(+1) : sad(+0) }

        movq            [rcx],      mm1         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
576
;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; Returns the SAD of a 16x16 source block against a single 16x16 reference
; block.  max_err is part of the signature but is never read here.
; Source rows are loaded with movdqa (16-byte alignment required);
; reference rows are loaded with movdqu.
global sym(vp8_sad16x16_sse3)
sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

        mov             end_ptr,    4           ; 4 passes x 4 rows = 16 rows
        pxor            xmm7,        xmm7       ; running sum, one per 64-bit half

.next_four_rows:
        ; rows n and n+1
        movdqa          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
        psadbw          xmm0,       xmm1

        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
        psadbw          xmm2,       xmm3

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; rows n+2 and n+3
        movdqa          xmm4,       XMMWORD PTR [src_ptr]
        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
        psadbw          xmm4,       xmm5

        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
        psadbw          xmm6,       xmm1

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; fold the four row SADs into the running sum
        paddw           xmm7,        xmm0
        paddw           xmm7,        xmm2
        paddw           xmm7,        xmm4
        paddw           xmm7,        xmm6

        sub             end_ptr,     1
        jne             .next_four_rows

        ; add the two 64-bit halves and hand the total back in rax
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movq            rax,        xmm0

    STACK_FRAME_DESTROY_X3
630
;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 16x16 source block against four reference blocks whose
; addresses are read from the four-entry table at ref_ptr_base.  The four
; sums are written as 32-bit values to results[0..3].
; Source rows are read with movdqa and must therefore be 16-byte aligned.
global sym(vp8_sad16x16x4d_sse3)
sym(vp8_sad16x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 8 row pairs = 16 rows: first pair, six middle pairs, last pair
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %rep 6
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp was serving as ref_stride; get the frame pointer back so that
        ; result_ptr (rbp-relative) can be read
        pop             rbp
%endif
        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        ; each accumulator carries one partial sum per 64-bit half:
        ; add the halves together and store the low dword

        ; results[0] <- xmm4 (reference 0)
        movq            xmm0,       xmm4
        psrldq          xmm4,       8
        paddw           xmm0,       xmm4
        movd            [rcx],      xmm0

        ; results[1] <- xmm5 (reference 1)
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx+4],    xmm0

        ; results[2] <- xmm6 (reference 2)
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+8],    xmm0

        ; results[3] <- xmm7 (reference 3)
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4
681
;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 16-wide, 8-row source block against four reference blocks
; whose addresses are read from the four-entry table at ref_ptr_base.  The
; four sums are written as 32-bit values to results[0..3].
; Source rows are read with movdqa and must therefore be 16-byte aligned.
global sym(vp8_sad16x8x4d_sse3)
sym(vp8_sad16x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 4 row pairs = 8 rows: first pair, two middle pairs, last pair
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %rep 2
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp was serving as ref_stride; get the frame pointer back so that
        ; result_ptr (rbp-relative) can be read
        pop             rbp
%endif
        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        ; each accumulator carries one partial sum per 64-bit half:
        ; add the halves together and store the low dword

        ; results[0] <- xmm4 (reference 0)
        movq            xmm0,       xmm4
        psrldq          xmm4,       8
        paddw           xmm0,       xmm4
        movd            [rcx],      xmm0

        ; results[1] <- xmm5 (reference 1)
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx+4],    xmm0

        ; results[2] <- xmm6 (reference 2)
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+8],    xmm0

        ; results[3] <- xmm7 (reference 3)
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4
728
;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 8-wide, 16-row source block against four reference blocks.
; The third argument is read as a four-entry table of reference addresses
; (see STACK_FRAME_CREATE_X4).  The four sums are written as 32-bit values
; to results[0..3].
; Works in MMX registers and returns without executing emms.
global sym(vp8_sad8x16x4d_sse3)
sym(vp8_sad8x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 8 row pairs = 16 rows: first pair, six middle pairs, last pair
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %rep 6
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp was serving as ref_stride; get the frame pointer back so that
        ; result_ptr (rbp-relative) can be read
        pop             rbp
%endif
        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        ; pack the four sums pairwise into two qwords
        punpckldq       mm4,        mm5         ; mm4 = { sad1 : sad0 }
        punpckldq       mm6,        mm7         ; mm6 = { sad3 : sad2 }

        movq            [rcx],      mm4         ; results[0], results[1]
        movq            [rcx+8],    mm6         ; results[2], results[3]

    STACK_FRAME_DESTROY_X4
761
;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 8x8 source block against four reference blocks.  The third
; argument is read as a four-entry table of reference addresses (see
; STACK_FRAME_CREATE_X4).  The four sums are written as 32-bit values to
; results[0..3].
; Works in MMX registers and returns without executing emms.
global sym(vp8_sad8x8x4d_sse3)
sym(vp8_sad8x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 4 row pairs = 8 rows: first pair, two middle pairs, last pair
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %rep 2
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    %endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp was serving as ref_stride; get the frame pointer back so that
        ; result_ptr (rbp-relative) can be read
        pop             rbp
%endif
        ; rcx holds no live input any more under any of the three frames
        mov             rcx,        result_ptr

        ; pack the four sums pairwise into two qwords
        punpckldq       mm4,        mm5         ; mm4 = { sad1 : sad0 }
        punpckldq       mm6,        mm7         ; mm6 = { sad3 : sad2 }

        movq            [rcx],      mm4         ; results[0], results[1]
        movq            [rcx+8],    mm6         ; results[2], results[3]

    STACK_FRAME_DESTROY_X4
790
;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; SAD of one 4x4 source block against four reference blocks.  The third
; argument is read as a four-entry table of reference addresses (see
; STACK_FRAME_CREATE_X4).  The four sums are written as 32-bit values to
; results[0..3].
;
; Two 4-byte rows are byte-interleaved into one 8-byte MMX register with
; punpcklbw so that a single psadbw covers a row pair; source and
; reference are interleaved the same way, so the sum is unaffected.
; Works in MMX registers and returns without executing emms.
global sym(vp8_sad4x4x4d_sse3)
sym(vp8_sad4x4x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; ---------------- rows 0 and 1 ----------------
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm2         ; mm0 = source rows 0/1

        ; reference 0 -> mm1
        movd            mm1,        DWORD PTR [r0_ptr]
        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
        punpcklbw       mm1,        mm3
        psadbw          mm1,        mm0

        ; reference 1 -> mm4
        movd            mm4,        DWORD PTR [r1_ptr]
        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
        punpcklbw       mm4,        mm2
        psadbw          mm4,        mm0

        ; reference 2 -> mm5
        movd            mm5,        DWORD PTR [r2_ptr]
        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
        punpcklbw       mm5,        mm3
        psadbw          mm5,        mm0

        ; reference 3 -> mm6
        movd            mm6,        DWORD PTR [r3_ptr]
        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
        punpcklbw       mm6,        mm7
        psadbw          mm6,        mm0

        ; step every pointer down two rows
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             r0_ptr,     [r0_ptr+ref_stride*2]
        lea             r1_ptr,     [r1_ptr+ref_stride*2]
        lea             r2_ptr,     [r2_ptr+ref_stride*2]
        lea             r3_ptr,     [r3_ptr+ref_stride*2]

        ; ---------------- rows 2 and 3 ----------------
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        punpcklbw       mm0,        mm3         ; mm0 = source rows 2/3

        ; reference 0, second half -> mm2
        movd            mm2,        DWORD PTR [r0_ptr]
        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
        punpcklbw       mm2,        mm7
        psadbw          mm2,        mm0

        ; first rows of references 1 and 2
        movd            mm3,        DWORD PTR [r1_ptr]
        movd            mm7,        DWORD PTR [r2_ptr]

        ; From here on the original instruction order is kept: stores to
        ; results are interleaved with the remaining reference loads.
%if ABI_IS_32BIT
        ; src_stride (rax) is dead, so ref_stride moves there and rbp goes
        ; back to being the frame pointer that result_ptr needs.
        mov             rax,        rbp

        pop             rbp
%define     ref_stride    rax
%endif
        ; rsi carried src_ptr / src_stride / r0_ptr depending on the frame;
        ; none of them is read again
        mov             rsi,        result_ptr

        paddw           mm1,        mm2
        movd            [rsi],      mm1         ; results[0]

        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm1

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        movd            mm2,        DWORD PTR [r3_ptr]
        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]

        paddw           mm3,        mm4
        paddw           mm7,        mm5

        movd            [rsi+4],    mm3         ; results[1]
        punpcklbw       mm2,        mm1

        movd            [rsi+8],    mm7         ; results[2]
        psadbw          mm2,        mm0

        paddw           mm2,        mm6
        movd            [rsi+12],   mm2         ; results[3]


    STACK_FRAME_DESTROY_X4
891