1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Prologue for the five-argument routines in this file:
;     (ptr src, int src_stride, ptr ref, int ref_stride, <fifth argument>)
;
; Binds ABI-neutral names to the registers / stack slots of the active ABI:
;     src_ptr, src_stride, ref_ptr, ref_stride   first four arguments
;     result_ptr, max_sad, height                aliases of the fifth argument
;     end_ptr, ret_var                           extra register names for
;                                                the function body
;
; 32-bit:  builds an rbp frame, saves rsi/rdi/rbx and loads the arguments
;          from the stack.
; Win64:   saves xmm6-xmm7 through SAVE_XMM; arguments stay in registers.
; SysV64:  nothing is saved; arguments stay in registers.
;
; The strides are declared 'int'.  On the 64-bit ABIs the upper half of a
; register holding an int argument is not defined by the caller, yet the
; function bodies use the strides as 64-bit address components, so they
; are sign-extended in place here (the 32-bit path already used movsxd).
;
; Must be paired with STACK_FRAME_DESTROY_X3.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    ; Fifth argument: above the saved XMM area, the return address (8)
    ; and the four 8-byte home slots of the register arguments.
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    %define     height      r8

    movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit
  %endif
%endif

%endmacro
61
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Epilogue matching STACK_FRAME_CREATE_X3.  Redefines every symbolic
; argument name as empty so it cannot leak into the next routine, undoes
; whatever the prologue saved for the active ABI, and returns.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  ; Blank the aliases of the fifth argument ...
  %define     height
  %define     max_sad
  %define     result_ptr
  ; ... the extra register names ...
  %define     ret_var
  %define     end_ptr
  ; ... and the four pointer / stride names.
  %define     ref_stride
  %define     ref_ptr
  %define     src_stride
  %define     src_ptr

%if ABI_IS_32BIT
    ; Reverse order of the pushes in the prologue.
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elif LIBVPX_YASM_WIN64
    RESTORE_XMM
%endif
    ret
%endmacro
85
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X4
;
; Prologue for the "x4d" routines:
;     (ptr src, int src_stride, ptr ref_ptr_base, int ref_stride, ptr results)
; where ref_ptr_base points at an array of four reference pointers.
;
; Binds the names src_ptr, src_stride, r0_ptr..r3_ptr, ref_stride and
; result_ptr for the active ABI and loads the four reference pointers
; through LOAD_X4_ADDRESSES.
;
; 32-bit:  every general register is needed, so rbp is reused as
;          ref_stride.  The frame pointer is pushed a second time and the
;          function body must 'pop rbp' before it touches result_ptr
;          (arg(4) is addressed through rbp).
; Win64:   saves xmm6-xmm7 through SAVE_XMM and pushes rsi (used as r0_ptr).
; SysV64:  nothing is saved.
;
; The strides are declared 'int'.  On the 64-bit ABIs the upper half of a
; register holding an int argument is not defined by the caller, yet the
; function bodies use the strides as 64-bit address components, so they
; are sign-extended in place here (the 32-bit path already used movsxd).
;
; Must be paired with STACK_FRAME_DESTROY_X4.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    push        rbp                             ; keep the frame pointer; rbp becomes ref_stride
    mov         rdi,        arg(2)              ; ref_ptr_base

    ; r2 lands in rax for now because rbx is still needed below.
    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi,        arg(0)              ; src_ptr

    movsxd      rbx,        dword ptr arg(1)    ; src_stride
    movsxd      rbp,        dword ptr arg(3)    ; ref_stride (arg() unusable from here on)

    xchg        rbx,        rax                 ; rax = src_stride, rbx = r2
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     r0_ptr      rsi
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      r8
    %define     ref_stride  r9
    ; Fifth argument: above the pushed rsi (8), the saved XMM area, the
    ; return address (8) and the four 8-byte home slots.
    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
    push        rsi

    movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit

    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     r0_ptr      r9
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      rdx
    %define     ref_stride  rcx
    %define     result_ptr  r8

    movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit

    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

  %endif
%endif
%endmacro
142
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4
;
; Epilogue matching STACK_FRAME_CREATE_X4.  Redefines every symbolic
; argument name as empty, restores the registers saved by the prologue
; and returns.  On 32-bit builds the function body is expected to have
; popped the extra copy of rbp already; only the four original pushes
; are undone here.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X4 0
  %define     result_ptr
  %define     ref_stride
  %define     r3_ptr
  %define     r2_ptr
  %define     r1_ptr
  %define     r0_ptr
  %define     src_stride
  %define     src_ptr

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elif LIBVPX_YASM_WIN64
    pop         rsi                             ; pushed after SAVE_XMM in the prologue
    RESTORE_XMM
%endif
    ret
%endmacro
166
;-----------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;
; Processes two rows of 16 bytes.  Each source row is compared (psadbw)
; with the reference row at byte offsets +0, +1 and +2; the running sums
; live in xmm5, xmm6 and xmm7 respectively.
;
;   %1  stage: 0     = first row pair, initialises xmm5-xmm7
;              1     = accumulates into xmm5-xmm7
;              other = accumulates, leaves the pointers where they are
;              (stages 0 and 1 advance %2 and %3 by two rows)
;   %2  source pointer register, read with movdqa (16-byte aligned rows)
;   %3  reference pointer register, read with lddqu (any alignment)
;   %4  source stride register
;   %5  reference stride register
;
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
        ; ---- first row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2]
%if %1==0
        ; First pair: load straight into the accumulators.
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif

        ; ---- second row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; More rows follow: step both pointers down two rows.
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
209
;-----------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;
; MMX counterpart of PROCESS_16X2X3 for rows of 8 bytes.  Each source row
; is compared (psadbw) with the reference row at byte offsets +0, +1 and
; +2; the running sums live in mm5, mm6 and mm7 respectively.
;
;   %1  stage: 0     = first row pair, initialises mm5-mm7
;              1     = accumulates into mm5-mm7
;              other = accumulates, leaves the pointers where they are
;              (stages 0 and 1 advance %2 and %3 by two rows)
;   %2  source pointer register
;   %3  reference pointer register
;   %4  source stride register
;   %5  reference stride register
;
; Clobbers mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
        ; ---- first row of the pair ----
        movq            mm0,       QWORD PTR [%2]
%if %1==0
        ; First pair: load straight into the accumulators.
        movq            mm5,       QWORD PTR [%3]
        movq            mm6,       QWORD PTR [%3+1]
        movq            mm7,       QWORD PTR [%3+2]

        psadbw          mm5,       mm0
        psadbw          mm6,       mm0
        psadbw          mm7,       mm0
%else
        movq            mm1,       QWORD PTR [%3]
        movq            mm2,       QWORD PTR [%3+1]
        movq            mm3,       QWORD PTR [%3+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endif

        ; ---- second row of the pair ----
        movq            mm0,       QWORD PTR [%2+%4]
        movq            mm1,       QWORD PTR [%3+%5]
        movq            mm2,       QWORD PTR [%3+%5+1]
        movq            mm3,       QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; More rows follow: step both pointers down two rows.
        lea             %2,        [%2+%4*2]
        lea             %3,        [%3+%5*2]
%endif

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endmacro
252
;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
;
; Loads four consecutive pointer-sized entries from the array at [%1]
; into %2..%5.  The prologue passes the same register for %1 and %5, so
; the load into %5 has to stay last: it overwrites the array base.
;-----------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
        mov             %2,         [%1]
        mov             %3,         [%1+REG_SZ_BYTES*1]
        mov             %4,         [%1+REG_SZ_BYTES*2]
        mov             %5,         [%1+REG_SZ_BYTES*3]     ; may clobber %1
%endmacro
260
;-----------------------------------------------------------------------
; PROCESS_16X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;
; Processes two rows of 16 bytes against four independent reference
; blocks.  The running psadbw sums for ref0..ref3 live in xmm4..xmm7.
;
;   %1      stage: 0     = first row pair, initialises xmm4-xmm7
;                  1     = accumulates into xmm4-xmm7
;                  other = accumulates, leaves the pointers where they are
;                  (stages 0 and 1 advance %2..%6 by two rows)
;   %2      source pointer register, read with movdqa (16-byte aligned rows)
;   %3-%6   reference pointer registers, read with lddqu (any alignment)
;   %7      source stride register
;   %8      reference stride register (shared by all four references)
;
; Clobbers xmm0-xmm3.  xmm1 is recycled for the fourth reference because
; only three temporaries are available next to the four accumulators.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X4 8
        ; ---- first row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2]
%if %1==0
        ; First pair: load straight into the accumulators.
        lddqu           xmm4,       XMMWORD PTR [%3]
        lddqu           xmm5,       XMMWORD PTR [%4]
        lddqu           xmm6,       XMMWORD PTR [%5]
        lddqu           xmm7,       XMMWORD PTR [%6]

        psadbw          xmm4,       xmm0
        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%4]
        lddqu           xmm3,       XMMWORD PTR [%5]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       XMMWORD PTR [%6]        ; xmm1 is free again: ref3
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1
%endif

        ; ---- second row of the pair ----
        movdqa          xmm0,       XMMWORD PTR [%2+%7]
        lddqu           xmm1,       XMMWORD PTR [%3+%8]
        lddqu           xmm2,       XMMWORD PTR [%4+%8]
        lddqu           xmm3,       XMMWORD PTR [%5+%8]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       XMMWORD PTR [%6+%8]     ; ref3, second row
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

%if %1==0 || %1==1
        ; More rows follow: step every pointer down two rows.
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]
        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]
        lea             %6,         [%6+%8*2]
%endif
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1

%endmacro
318
;-----------------------------------------------------------------------
; PROCESS_8X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;
; MMX counterpart of PROCESS_16X2X4 for rows of 8 bytes.  The running
; psadbw sums for ref0..ref3 live in mm4..mm7.
;
;   %1      stage: 0     = first row pair, initialises mm4-mm7
;                  1     = accumulates into mm4-mm7
;                  other = accumulates, leaves the pointers where they are
;                  (stages 0 and 1 advance %2..%6 by two rows)
;   %2      source pointer register
;   %3-%6   reference pointer registers
;   %7      source stride register
;   %8      reference stride register (shared by all four references)
;
; Clobbers mm0-mm3; mm1 is recycled for the fourth reference.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X4 8
        ; ---- first row of the pair ----
        movq            mm0,        QWORD PTR [%2]
%if %1==0
        ; First pair: load straight into the accumulators.
        movq            mm4,        QWORD PTR [%3]
        movq            mm5,        QWORD PTR [%4]
        movq            mm6,        QWORD PTR [%5]
        movq            mm7,        QWORD PTR [%6]

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm1,        QWORD PTR [%3]
        movq            mm2,        QWORD PTR [%4]
        movq            mm3,        QWORD PTR [%5]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [%6]          ; mm1 is free again: ref3
        paddw           mm5,        mm2
        paddw           mm6,        mm3

        psadbw          mm1,        mm0
        paddw           mm7,        mm1
%endif

        ; ---- second row of the pair ----
        movq            mm0,        QWORD PTR [%2+%7]
        movq            mm1,        QWORD PTR [%3+%8]
        movq            mm2,        QWORD PTR [%4+%8]
        movq            mm3,        QWORD PTR [%5+%8]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [%6+%8]       ; ref3, second row
        paddw           mm5,        mm2
        paddw           mm6,        mm3

%if %1==0 || %1==1
        ; More rows follow: step every pointer down two rows.
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]
        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]
        lea             %6,         [%6+%8*2]
%endif
        psadbw          mm1,        mm0
        paddw           mm7,        mm1

%endmacro
376
;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums the absolute differences between a 16x16 source block and the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores the
; three sums as 32-bit values in results[0], results[1] and results[2].
; Source rows are read with movdqa and must be 16-byte aligned.
global sym(vp8_sad16x16x3_sse3) PRIVATE
sym(vp8_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs = 16 rows; sums accumulate in xmm5 / xmm6 / xmm7.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold high into low.

        ; results[0]: offset +0
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1]: offset +1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2]: offset +2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
418
;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums the absolute differences between a 16-wide, 8-row source block and
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores the
; three sums as 32-bit values in results[0], results[1] and results[2].
; Source rows are read with movdqa and must be 16-byte aligned.
global sym(vp8_sad16x8x3_sse3) PRIVATE
sym(vp8_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 row pairs = 8 rows; sums accumulate in xmm5 / xmm6 / xmm7.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold high into low.

        ; results[0]: offset +0
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0

        ; results[1]: offset +1
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0

        ; results[2]: offset +2
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
456
;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums the absolute differences between an 8-wide, 16-row source block
; and the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores
; the three sums as 32-bit values in results[0..2].
; Uses MMX registers and returns without executing EMMS.
global sym(vp8_sad8x16x3_sse3) PRIVATE
sym(vp8_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs = 16 rows; sums accumulate in mm5 / mm6 / mm7.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; Pack the +0 and +1 sums side by side so one movq writes both.
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
485
;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums the absolute differences between an 8x8 source block and the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores the
; three sums as 32-bit values in results[0..2].
; Uses MMX registers and returns without executing EMMS.
global sym(vp8_sad8x8x3_sse3) PRIVATE
sym(vp8_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 row pairs = 8 rows; sums accumulate in mm5 / mm6 / mm7.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; Pack the +0 and +1 sums side by side so one movq writes both.
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
510
;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums the absolute differences between a 4x4 source block and the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores the
; three sums as 32-bit values in results[0..2].
;
; Two 4-byte rows are interleaved (punpcklbw) into one 8-byte register so
; that a single psadbw covers both rows; source and reference use the
; same interleave, so the byte pairing is preserved.
; Uses MMX registers and returns without executing EMMS.
global sym(vp8_sad4x4x3_sse3) PRIVATE
sym(vp8_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---- rows 0-1 -------------------------------------------------
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [ref_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm2         ; mm0 = source rows 0-1
        punpcklbw       mm1,        mm3         ; mm1 = ref +0 rows 0-1

        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm5,        DWORD PTR [ref_ptr+2]

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw          mm1,        mm0         ; mm1 = sum(+0), rows 0-1

        punpcklbw       mm4,        mm2         ; mm4 = ref +1 rows 0-1
        punpcklbw       mm5,        mm3         ; mm5 = ref +2 rows 0-1

        psadbw          mm4,        mm0         ; mm4 = sum(+1), rows 0-1
        psadbw          mm5,        mm0         ; mm5 = sum(+2), rows 0-1

        ; ---- rows 2-3 -------------------------------------------------
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [ref_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm3         ; mm0 = source rows 2-3
        punpcklbw       mm2,        mm6         ; mm2 = ref +0 rows 2-3

        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm7,        DWORD PTR [ref_ptr+2]

        psadbw          mm2,        mm0

        paddw           mm1,        mm2         ; mm1 = total for +0

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw       mm3,        mm2         ; mm3 = ref +1 rows 2-3
        punpcklbw       mm7,        mm6         ; mm7 = ref +2 rows 2-3

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        paddw           mm3,        mm4         ; mm3 = total for +1
        paddw           mm7,        mm5         ; mm7 = total for +2

        ; ---- store ----------------------------------------------------
        mov             rcx,        result_ptr

        punpckldq       mm1,        mm3         ; pack totals +0 | +1

        movq            [rcx],      mm1         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3
584
;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Returns, in rax, the sum of absolute differences between a 16x16 source
; block and a 16x16 reference block.  Source rows are read with movdqa
; (16-byte aligned), reference rows with movdqu.  max_sad is not read.
;
; (disabled in the original) %define lddqu movdqu
global sym(vp8_sad16x16_sse3) PRIVATE
sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

        mov             end_ptr,    4           ; loop counter: 4 passes x 4 rows
        pxor            xmm7,       xmm7        ; running sum

.four_row_pass:
        ; Rows 0 and 1 of this pass.
        movdqa          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; Rows 2 and 3 of this pass.
        movdqa          xmm4,       XMMWORD PTR [src_ptr]
        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]

        psadbw          xmm0,       xmm1

        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]    ; xmm1 reused for row 3

        psadbw          xmm2,       xmm3
        psadbw          xmm4,       xmm5
        psadbw          xmm6,       xmm1

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        paddw           xmm7,       xmm0
        paddw           xmm7,       xmm2
        paddw           xmm7,       xmm4
        paddw           xmm7,       xmm6

        sub             end_ptr,    1
        jne             .four_row_pass

        ; Fold the two 64-bit partial sums and return the result.
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movq            rax,        xmm0

    STACK_FRAME_DESTROY_X3
638
;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies 'height' rows of 32 bytes from src_ptr to dst_ptr.  Rows are
; moved four at a time while at least four remain, then one at a time.
; A height of zero or less copies nothing.
;
; Source rows are read with movdqu (any alignment).  Destination rows are
; written with movdqa, so dst_ptr and dst_stride must keep every row
; 16-byte aligned.
;
; Inside this routine the prologue names ref_ptr / ref_stride refer to
; the destination, and end_ptr is a scratch pointer two rows ahead.
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

%if ABI_IS_32BIT==0
  %if LIBVPX_YASM_WIN64==0
        ; SysV x86-64: 'height' is the full 64-bit r8, but the argument
        ; is an int whose upper register half is not defined by the
        ; caller.  Sign-extend it before the 64-bit compares below.
        movsxd          r8,         r8d
  %endif
%endif

        ; Fewer than four rows requested: skip the unrolled loop, which
        ; always moves four rows per pass.
        cmp             height,     4
        jl              .block_copy_sse3_tail

.block_copy_sse3_loopx4:
        lea             end_ptr,    [src_ptr+src_stride*2]

        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu          xmm4,       XMMWORD PTR [end_ptr]
        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

        lea             src_ptr,    [src_ptr+src_stride*4]

        lea             end_ptr,    [ref_ptr+ref_stride*2]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa          XMMWORD PTR [end_ptr], xmm4
        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea             ref_ptr,    [ref_ptr+ref_stride*4]

        sub             height,     4
        cmp             height,     4
        jge             .block_copy_sse3_loopx4

.block_copy_sse3_tail:
        ; 0..3 rows remain here (or height was non-positive on entry).
        cmp             height,     0
        jle             .copy_is_done

.block_copy_sse3_loop:
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        lea             src_ptr,    [src_ptr+src_stride]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        lea             ref_ptr,    [ref_ptr+ref_stride]

        sub             height,     1
        jne             .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3
699
;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points at an array of four reference pointers.  Sums the
; absolute differences between the 16x16 source block and each of the
; four reference blocks and stores the sums as 32-bit values in
; results[0..3].  Source rows are read with movdqa (16-byte aligned).
global sym(vp8_sad16x16x4d_sse3) PRIVATE
sym(vp8_sad16x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 8 row pairs = 16 rows; sums accumulate in xmm4..xmm7.
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp served as ref_stride; bring the frame pointer back so that
        ; result_ptr (arg(4)) can be addressed.
        pop             rbp
%endif
        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold high into low.

        ; results[0]
        movq            xmm0,       xmm4
        psrldq          xmm4,       8
        paddw           xmm0,       xmm4
        movd            [rcx],      xmm0

        ; results[1]
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx+4],    xmm0

        ; results[2]
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+8],    xmm0

        ; results[3]
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4
750
;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points at an array of four reference pointers.  Sums the
; absolute differences between the 16-wide, 8-row source block and each
; of the four reference blocks and stores the sums as 32-bit values in
; results[0..3].  Source rows are read with movdqa (16-byte aligned).
global sym(vp8_sad16x8x4d_sse3) PRIVATE
sym(vp8_sad16x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 4 row pairs = 8 rows; sums accumulate in xmm4..xmm7.
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp served as ref_stride; bring the frame pointer back so that
        ; result_ptr (arg(4)) can be addressed.
        pop             rbp
%endif
        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold high into low.

        ; results[0]
        movq            xmm0,       xmm4
        psrldq          xmm4,       8
        paddw           xmm0,       xmm4
        movd            [rcx],      xmm0

        ; results[1]
        movq            xmm0,       xmm5
        psrldq          xmm5,       8
        paddw           xmm0,       xmm5
        movd            [rcx+4],    xmm0

        ; results[2]
        movq            xmm0,       xmm6
        psrldq          xmm6,       8
        paddw           xmm0,       xmm6
        movd            [rcx+8],    xmm0

        ; results[3]
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4
797
;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; The third argument is read as an array of four reference pointers (see
; STACK_FRAME_CREATE_X4).  Sums the absolute differences between the
; 8-wide, 16-row source block and each of the four reference blocks and
; stores the sums as 32-bit values in results[0..3].
; Uses MMX registers and returns without executing EMMS.
global sym(vp8_sad8x16x4d_sse3) PRIVATE
sym(vp8_sad8x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 8 row pairs = 16 rows; sums accumulate in mm4..mm7.
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp served as ref_stride; bring the frame pointer back so that
        ; result_ptr (arg(4)) can be addressed.
        pop             rbp
%endif
        mov             rcx,        result_ptr

        ; Pair the sums up so two movq stores write all four results.
        punpckldq       mm4,        mm5
        punpckldq       mm6,        mm7

        movq            [rcx],      mm4         ; results[0], results[1]
        movq            [rcx+8],    mm6         ; results[2], results[3]

    STACK_FRAME_DESTROY_X4
830
;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; The third argument is read as an array of four reference pointers (see
; STACK_FRAME_CREATE_X4).  Sums the absolute differences between the 8x8
; source block and each of the four reference blocks and stores the sums
; as 32-bit values in results[0..3].
; Uses MMX registers and returns without executing EMMS.
global sym(vp8_sad8x8x4d_sse3) PRIVATE
sym(vp8_sad8x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 4 row pairs = 8 rows; sums accumulate in mm4..mm7.
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        ; rbp served as ref_stride; bring the frame pointer back so that
        ; result_ptr (arg(4)) can be addressed.
        pop             rbp
%endif
        mov             rcx,        result_ptr

        ; Pair the sums up so two movq stores write all four results.
        punpckldq       mm4,        mm5
        punpckldq       mm6,        mm7

        movq            [rcx],      mm4         ; results[0], results[1]
        movq            [rcx+8],    mm6         ; results[2], results[3]

    STACK_FRAME_DESTROY_X4
859
;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; The third argument is read as an array of four reference pointers (see
; STACK_FRAME_CREATE_X4).  Sums the absolute differences between the 4x4
; source block and each of the four reference blocks and stores the sums
; as 32-bit values in results[0..3].
;
; Two 4-byte rows are interleaved (punpcklbw) into one 8-byte register so
; that a single psadbw covers both rows.  Registers are recycled heavily;
; the instruction order below is significant.
; Uses MMX registers and returns without executing EMMS.
global sym(vp8_sad4x4x4d_sse3) PRIVATE
sym(vp8_sad4x4x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; ---- rows 0-1 -------------------------------------------------
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [r0_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]

        punpcklbw       mm0,        mm2         ; mm0 = source rows 0-1
        punpcklbw       mm1,        mm3         ; mm1 = ref0 rows 0-1

        movd            mm4,        DWORD PTR [r1_ptr]
        movd            mm5,        DWORD PTR [r2_ptr]

        movd            mm6,        DWORD PTR [r3_ptr]
        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]

        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]

        psadbw          mm1,        mm0         ; mm1 = sum(ref0), rows 0-1

        punpcklbw       mm4,        mm2         ; mm4 = ref1 rows 0-1
        punpcklbw       mm5,        mm3         ; mm5 = ref2 rows 0-1

        punpcklbw       mm6,        mm7         ; mm6 = ref3 rows 0-1
        psadbw          mm4,        mm0         ; mm4 = sum(ref1), rows 0-1

        psadbw          mm5,        mm0         ; mm5 = sum(ref2), rows 0-1
        psadbw          mm6,        mm0         ; mm6 = sum(ref3), rows 0-1

        ; ---- rows 2-3 -------------------------------------------------
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             r0_ptr,     [r0_ptr+ref_stride*2]

        lea             r1_ptr,     [r1_ptr+ref_stride*2]
        lea             r2_ptr,     [r2_ptr+ref_stride*2]

        lea             r3_ptr,     [r3_ptr+ref_stride*2]

        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [r0_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]

        punpcklbw       mm0,        mm3         ; mm0 = source rows 2-3
        punpcklbw       mm2,        mm7         ; mm2 = ref0 rows 2-3

        movd            mm3,        DWORD PTR [r1_ptr]
        movd            mm7,        DWORD PTR [r2_ptr]

        psadbw          mm2,        mm0
%if ABI_IS_32BIT
        ; The source is fully loaded, so rax (src_stride) is free: park
        ; ref_stride there and restore the frame pointer so result_ptr
        ; (arg(4)) can be addressed.
        mov             rax,        rbp

        pop             rbp
%define     ref_stride    rax
%endif
        ; rsi is no longer needed in its prologue role on any ABI; reuse
        ; it as the output pointer.
        mov             rsi,        result_ptr

        paddw           mm1,        mm2         ; total for ref0
        movd            [rsi],      mm1         ; results[0]

        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]

        punpcklbw       mm3,        mm2         ; mm3 = ref1 rows 2-3
        punpcklbw       mm7,        mm1         ; mm7 = ref2 rows 2-3

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        movd            mm2,        DWORD PTR [r3_ptr]
        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]

        paddw           mm3,        mm4         ; total for ref1
        paddw           mm7,        mm5         ; total for ref2

        movd            [rsi+4],    mm3         ; results[1]
        punpcklbw       mm2,        mm1         ; mm2 = ref3 rows 2-3

        movd            [rsi+8],    mm7         ; results[2]
        psadbw          mm2,        mm0

        paddw           mm2,        mm6         ; total for ref3
        movd            [rsi+12],   mm2         ; results[3]


    STACK_FRAME_DESTROY_X4
960
961