;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros used throughout this file (sym, arg,
; SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, PRIVATE) are expected to come from
; this include; their definitions are not visible in this file.
%include "vpx_ports/x86_abi_support.asm"

; Exported MMX sum-of-absolute-differences (SAD) kernels, one per block size.
; NOTE(review): sym() presumably applies platform symbol decoration and
; PRIVATE presumably restricts symbol visibility -- confirm in the include.
global sym(vp8_sad16x16_mmx) PRIVATE
global sym(vp8_sad8x16_mmx) PRIVATE
global sym(vp8_sad8x8_mmx) PRIVATE
global sym(vp8_sad4x4_mmx) PRIVATE
global sym(vp8_sad16x8_mmx) PRIVATE

;-----------------------------------------------------------------------
;unsigned int vp8_sad16x16_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences between a 16-byte-wide, 16-row block at
; src_ptr and the block at ref_ptr.
;
; Register roles:
;   rsi = current source row        rax = src_stride (sign-extended)
;   rdi = current reference row     rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16*src_stride   (loop end marker)
;   mm7 = four 16-bit partial sums  mm6 = zero (for unpacking)
;
; Out:   the SAD in the low 32 bits of rax. The upper dword of mm7 still
;        holds a partial sum when it is copied out, so the bits above 31
;        are not meaningful.
; Clobb: rax, rcx, rdx, mm0-mm7, flags. rsi/rdi are saved and restored.
; Note:  no emms is executed here; MMX state is left live on return.
;        NOTE(review): the caller is presumably responsible for clearing
;        it before any x87 code runs -- confirm.
;-----------------------------------------------------------------------
sym(vp8_sad16x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi                    ; callee-saved on some ABIs (e.g. Microsoft x64)
    push rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16*src_stride, built as two *8 steps because
        ; the largest scale an address can encode is 8.
        lea             rcx,        [rsi+rax*8]

        lea             rcx,        [rcx+rax*8]
        pxor            mm7,        mm7             ; accumulator = 0

        pxor            mm6,        mm6             ; constant zero

.x16x16sad_mmx_loop:

        ; One row: 16 source bytes in mm0:mm2, 16 reference bytes in mm1:mm3.
        movq            mm0,        QWORD PTR [rsi]
        movq            mm2,        QWORD PTR [rsi+8]

        movq            mm1,        QWORD PTR [rdi]
        movq            mm3,        QWORD PTR [rdi+8]

        movq            mm4,        mm0             ; keep source copies for
        movq            mm5,        mm2             ; the reverse subtraction

        ; |src - ref| per byte: the unsigned saturating subtract clamps the
        ; negative direction to zero, so OR-ing both directions gives the
        ; absolute difference.
        psubusb         mm0,        mm1
        psubusb         mm1,        mm4

        psubusb         mm2,        mm3
        psubusb         mm3,        mm5

        por             mm0,        mm1
        por             mm2,        mm3

        movq            mm1,        mm0
        movq            mm3,        mm2

        ; Zero-extend the byte differences to 16-bit words.
        punpcklbw       mm0,        mm6
        punpcklbw       mm2,        mm6

        punpckhbw       mm1,        mm6
        punpckhbw       mm3,        mm6

        paddw           mm0,        mm2
        paddw           mm1,        mm3


        lea             rsi,        [rsi+rax]       ; next source row
        add             rdi,        rdx             ; next reference row

        ; Each word lane gains at most 4*255 per row, i.e. at most 16320
        ; over 16 rows, so the 16-bit lanes cannot overflow.
        paddw           mm7,        mm0
        paddw           mm7,        mm1

        cmp             rsi,        rcx
        jne             .x16x16sad_mmx_loop


        ; Horizontal reduction of the four word lanes of mm7.
        movq            mm0,        mm7

        punpcklwd       mm0,        mm6             ; lanes 0,1 -> dwords
        punpckhwd       mm7,        mm6             ; lanes 2,3 -> dwords

        ; Word adds are still exact: the grand total is at most 65280.
        paddw           mm0,        mm7
        movq            mm7,        mm0


        psrlq           mm0,        32              ; bring upper dword down
        paddw           mm7,        mm0             ; low dword = full SAD

        movq            rax,        mm7

    pop rdi
    pop rsi
    mov rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp8_sad8x16_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences between an 8-byte-wide, 16-row block at
; src_ptr and the block at ref_ptr.
;
; Register roles:
;   rsi = current source row        rax = src_stride (sign-extended)
;   rdi = current reference row     rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16*src_stride   (loop end marker)
;   mm7 = four 16-bit partial sums  mm6 = zero (for unpacking)
;
; Out:   the SAD in the low 32 bits of rax; the bits above 31 come from a
;        leftover partial sum in mm7 and are not meaningful.
; Clobb: rax, rcx, rdx, mm0-mm2, mm6, mm7, flags. rsi/rdi are restored.
; Note:  no emms is executed here; MMX state is left live on return.
;        NOTE(review): presumably cleared by the caller -- confirm.
;-----------------------------------------------------------------------
sym(vp8_sad8x16_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi                    ; callee-saved on some ABIs (e.g. Microsoft x64)
    push rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16*src_stride (two *8 steps; 8 is the largest
        ; scale an address can encode).
        lea             rcx,        [rsi+rax*8]

        lea             rcx,        [rcx+rax*8]
        pxor            mm7,        mm7             ; accumulator = 0

        pxor            mm6,        mm6             ; constant zero

.x8x16sad_mmx_loop:

        movq            mm0,        QWORD PTR [rsi] ; 8 source bytes
        movq            mm1,        QWORD PTR [rdi] ; 8 reference bytes

        ; |src - ref| per byte: OR of the two saturating one-sided
        ; differences.
        movq            mm2,        mm0
        psubusb         mm0,        mm1

        psubusb         mm1,        mm2
        por             mm0,        mm1

        ; Zero-extend to words: low four bytes in mm0, high four in mm2.
        movq            mm2,        mm0
        punpcklbw       mm0,        mm6

        punpckhbw       mm2,        mm6
        lea             rsi,        [rsi+rax]       ; next source row

        add             rdi,        rdx             ; next reference row
        paddw           mm7,        mm0             ; at most 2*255 per lane per row

        paddw           mm7,        mm2
        cmp             rsi,        rcx

        jne             .x8x16sad_mmx_loop

        ; Horizontal reduction of the four word lanes of mm7.
        movq            mm0,        mm7
        punpcklwd       mm0,        mm6             ; lanes 0,1 -> dwords

        punpckhwd       mm7,        mm6             ; lanes 2,3 -> dwords
        paddw           mm0,        mm7

        movq            mm7,        mm0
        psrlq           mm0,        32              ; bring upper dword down

        paddw           mm7,        mm0             ; low dword = full SAD
        movq            rax,        mm7

    pop rdi
    pop rsi
    mov rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp8_sad8x8_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences between an 8-byte-wide, 8-row block at
; src_ptr and the block at ref_ptr.
;
; Register roles:
;   rsi = current source row        rax = src_stride (sign-extended)
;   rdi = current reference row     rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8*src_stride    (loop end marker)
;   mm7 = four 16-bit partial sums  mm6 = zero (for unpacking)
;
; Out:   the SAD in the low 32 bits of rax; the bits above 31 come from a
;        leftover partial sum in mm7 and are not meaningful.
; Clobb: rax, rcx, rdx, mm0-mm2, mm6, mm7, flags. rsi/rdi are restored.
; Note:  no emms is executed here; MMX state is left live on return.
;        NOTE(review): presumably cleared by the caller -- confirm.
;-----------------------------------------------------------------------
sym(vp8_sad8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi                    ; callee-saved on some ABIs (e.g. Microsoft x64)
    push rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rax*8]     ; end = src_ptr + 8 rows
        pxor            mm7,        mm7             ; accumulator = 0

        pxor            mm6,        mm6             ; constant zero

.x8x8sad_mmx_loop:

        movq            mm0,        QWORD PTR [rsi] ; 8 source bytes
        movq            mm1,        QWORD PTR [rdi] ; 8 reference bytes

        ; |src - ref| per byte: OR of the two saturating one-sided
        ; differences.
        movq            mm2,        mm0
        psubusb         mm0,        mm1

        psubusb         mm1,        mm2
        por             mm0,        mm1

        ; Zero-extend to words and fold the high half onto the low half.
        movq            mm2,        mm0
        punpcklbw       mm0,        mm6

        punpckhbw       mm2,        mm6
        paddw           mm0,        mm2

        lea             rsi,        [rsi+rax]       ; next source row
        add             rdi,        rdx             ; next reference row

        paddw           mm7,        mm0             ; at most 2*255 per lane per row
        cmp             rsi,        rcx

        jne             .x8x8sad_mmx_loop

        ; Horizontal reduction of the four word lanes of mm7.
        movq            mm0,        mm7
        punpcklwd       mm0,        mm6             ; lanes 0,1 -> dwords

        punpckhwd       mm7,        mm6             ; lanes 2,3 -> dwords
        paddw           mm0,        mm7

        movq            mm7,        mm0
        psrlq           mm0,        32              ; bring upper dword down

        paddw           mm7,        mm0             ; low dword = full SAD
        movq            rax,        mm7

    pop rdi
    pop rsi
    mov rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp8_sad4x4_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences between a 4-byte-wide, 4-row block at
; src_ptr and the block at ref_ptr. Fully unrolled: rows are handled two
; at a time by packing a pair of 4-byte rows into one 8-byte register.
;
; Register roles:
;   rsi = source row pointer        rax = src_stride (sign-extended)
;   rdi = reference row pointer     rdx = ref_stride (sign-extended)
;   mm3 = zero (for unpacking)      mm0 = running word sums
;
; Out:   the SAD in the low 32 bits of rax; the bits above 31 come from a
;        leftover partial sum in mm0 and are not meaningful.
; Clobb: rax, rdx, mm0-mm7, flags. rsi/rdi are saved and restored.
; Note:  no emms is executed here; MMX state is left live on return.
;        NOTE(review): presumably cleared by the caller -- confirm.
;-----------------------------------------------------------------------
sym(vp8_sad4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi                    ; callee-saved on some ABIs (e.g. Microsoft x64)
    push rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; Rows 0 and 1.
        movd            mm0,        DWORD PTR [rsi]
        movd            mm1,        DWORD PTR [rdi]

        movd            mm2,        DWORD PTR [rsi+rax]
        movd            mm3,        DWORD PTR [rdi+rdx]

        ; Interleave the two rows into one 8-byte vector. Source and
        ; reference are interleaved the same way, so bytes still line up.
        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        ; |src - ref| per byte: OR of the two saturating one-sided
        ; differences.
        movq            mm2,        mm0
        psubusb         mm0,        mm1

        psubusb         mm1,        mm2
        por             mm0,        mm1

        movq            mm2,        mm0
        pxor            mm3,        mm3             ; constant zero from here on

        punpcklbw       mm0,        mm3             ; zero-extend to words
        punpckhbw       mm2,        mm3

        paddw           mm0,        mm2             ; word sums for rows 0-1

        ; Rows 2 and 3: same steps using mm4-mm7.
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movd            mm4,        DWORD PTR [rsi]
        movd            mm5,        DWORD PTR [rdi]

        movd            mm6,        DWORD PTR [rsi+rax]
        movd            mm7,        DWORD PTR [rdi+rdx]

        punpcklbw       mm4,        mm6
        punpcklbw       mm5,        mm7

        movq            mm6,        mm4
        psubusb         mm4,        mm5

        psubusb         mm5,        mm6
        por             mm4,        mm5

        movq            mm5,        mm4
        punpcklbw       mm4,        mm3

        punpckhbw       mm5,        mm3
        paddw           mm4,        mm5             ; word sums for rows 2-3

        ; Combine both halves, then reduce the four word lanes.
        paddw           mm0,        mm4
        movq            mm1,        mm0

        punpcklwd       mm0,        mm3             ; lanes 0,1 -> dwords
        punpckhwd       mm1,        mm3             ; lanes 2,3 -> dwords

        paddw           mm0,        mm1
        movq            mm1,        mm0

        psrlq           mm0,        32              ; bring upper dword down
        paddw           mm0,        mm1             ; low dword = full SAD

        movq            rax,        mm0

    pop rdi
    pop rsi
    mov rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp8_sad16x8_mmx(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences between a 16-byte-wide, 8-row block at
; src_ptr and the block at ref_ptr.
;
; Register roles:
;   rsi = current source row        rax = src_stride (sign-extended)
;   rdi = current reference row     rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8*src_stride    (loop end marker)
;   mm7 = four 16-bit partial sums  mm6 = zero (for unpacking)
;
; Out:   the SAD in the low 32 bits of rax; the bits above 31 come from a
;        leftover partial sum in mm7 and are not meaningful.
; Clobb: rax, rcx, rdx, mm0-mm7, flags. rsi/rdi are saved and restored.
; Note:  no emms is executed here; MMX state is left live on return.
;        NOTE(review): presumably cleared by the caller -- confirm.
;-----------------------------------------------------------------------
sym(vp8_sad16x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi                    ; callee-saved on some ABIs (e.g. Microsoft x64)
    push rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rax*8]     ; end = src_ptr + 8 rows
        pxor            mm7,        mm7             ; accumulator = 0

        pxor            mm6,        mm6             ; constant zero

.x16x8sad_mmx_loop:

        ; One row: left 8 bytes in mm0/mm1, right 8 bytes in mm2/mm3
        ; (source/reference respectively).
        movq            mm0,        [rsi]
        movq            mm1,        [rdi]

        movq            mm2,        [rsi+8]
        movq            mm3,        [rdi+8]

        movq            mm4,        mm0             ; keep source copies for
        movq            mm5,        mm2             ; the reverse subtraction

        ; |src - ref| per byte: OR of the two saturating one-sided
        ; differences.
        psubusb         mm0,        mm1
        psubusb         mm1,        mm4

        psubusb         mm2,        mm3
        psubusb         mm3,        mm5

        por             mm0,        mm1
        por             mm2,        mm3

        movq            mm1,        mm0
        movq            mm3,        mm2

        ; Zero-extend the byte differences to 16-bit words.
        punpcklbw       mm0,        mm6
        punpckhbw       mm1,        mm6

        punpcklbw       mm2,        mm6
        punpckhbw       mm3,        mm6


        ; Fold the four word vectors into one (at most 4*255 per lane).
        paddw           mm0,        mm2
        paddw           mm1,        mm3

        paddw           mm0,        mm1
        lea             rsi,        [rsi+rax]       ; next source row

        add             rdi,        rdx             ; next reference row
        paddw           mm7,        mm0

        cmp             rsi,        rcx
        jne             .x16x8sad_mmx_loop

        ; Horizontal reduction of the four word lanes of mm7.
        movq            mm0,        mm7
        punpcklwd       mm0,        mm6             ; lanes 0,1 -> dwords

        punpckhwd       mm7,        mm6             ; lanes 2,3 -> dwords
        paddw           mm0,        mm7

        movq            mm7,        mm0
        psrlq           mm0,        32              ; bring upper dword down

        paddw           mm7,        mm0             ; low dword = full SAD
        movq            rax,        mm7

    pop rdi
    pop rsi
    mov rsp, rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
