;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%include "vpx_ports/x86_abi_support.asm"
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------------
; unsigned int vp8_sad16x16_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute differences over a 16-row block, 16 bytes per row.
;
; Register roles inside the loop:
;   rsi  = current source row          rax = src_stride
;   rdi  = current reference row       rdx = ref_stride
;   rcx  = src_ptr + 16*src_stride     (loop end sentinel)
;   xmm6 = running SAD (two 64-bit partial sums, folded at the end)
;
; Each 16-byte row is fetched as two 8-byte halves that are byte-interleaved
; with punpcklbw. Source and reference are interleaved identically, so
; psadbw still pairs every source byte with its matching reference byte.
; 16-bit adds suffice for the accumulator: the largest possible total is
; 16*16*255 = 65280.
;
; Returns the SAD in rax.
;-----------------------------------------------------------------------------
global sym(vp8_sad16x16_wmt) PRIVATE
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; rsi = src_ptr
        movsxd          rax,        dword ptr arg(1)    ; rax = src_stride
        mov             rdi,        arg(2)              ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; rdx = ref_stride

        pxor            xmm6,       xmm6                ; SAD accumulator = 0

        ; rcx = src_ptr + 16 * src_stride (built as two *8 steps)
        lea             rcx,        [rsi+rax*8]
        lea             rcx,        [rcx+rax*8]

.sad16x16_row_pair:
        ; ---- first row of the pair -------------------------------------
        movq            xmm0,       QWORD PTR [rsi]         ; src bytes 0..7
        movq            xmm1,       QWORD PTR [rdi]         ; ref bytes 0..7
        movq            xmm2,       QWORD PTR [rsi+8]       ; src bytes 8..15
        movq            xmm3,       QWORD PTR [rdi+8]       ; ref bytes 8..15
        punpcklbw       xmm0,       xmm2                    ; 16 src bytes
        punpcklbw       xmm1,       xmm3                    ; 16 ref bytes
        psadbw          xmm0,       xmm1

        ; ---- second row of the pair ------------------------------------
        movq            xmm4,       QWORD PTR [rsi+rax]
        movq            xmm5,       QWORD PTR [rdi+rdx]
        movq            xmm2,       QWORD PTR [rsi+rax+8]
        movq            xmm3,       QWORD PTR [rdi+rdx+8]
        punpcklbw       xmm4,       xmm2
        punpcklbw       xmm5,       xmm3
        psadbw          xmm4,       xmm5

        ; ---- accumulate and step both pointers down two rows -----------
        paddw           xmm6,       xmm0
        paddw           xmm6,       xmm4

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        cmp             rsi,        rcx
        jne             .sad16x16_row_pair

        ; Fold the high 64-bit partial sum into the low one.
        movq            xmm0,       xmm6                    ; low half only
        psrldq          xmm6,       8                       ; high half -> low
        paddw           xmm0,       xmm6
        movq            rax,        xmm0                    ; return value

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------------
; unsigned int vp8_sad8x16_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute differences over 16 rows of 8 bytes, with early exit.
;
; Register roles inside the loop:
;   rsi = current source row           rbx = src_stride
;   rdi = current reference row        rdx = ref_stride
;   rcx = src_ptr + 16*src_stride      (loop end sentinel)
;   mm7 = running SAD
;
; Before every pair of rows the running SAD is compared (unsigned) with
; max_sad; once it is above max_sad the function returns the partial sum
; immediately. Otherwise the full SAD is returned. Result is in rax.
;
; MMX registers are used and no emms is issued here, so the MMX/x87 state
; is left as-is on return.
; NOTE(review): confirm callers clear the MMX state where required.
;-----------------------------------------------------------------------------
global sym(vp8_sad8x16_wmt) PRIVATE
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; rsi = src_ptr
        movsxd          rbx,        dword ptr arg(1)    ; rbx = src_stride
        mov             rdi,        arg(2)              ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; rdx = ref_stride

        pxor            mm7,        mm7                 ; SAD accumulator = 0

        ; rcx = src_ptr + 16 * src_stride (built as two *8 steps)
        lea             rcx,        [rsi+rbx*8]
        lea             rcx,        [rcx+rbx*8]

.sad8x16_row_pair:
        ; Early-out test: rax already holds the value to return if taken.
        movq            rax,        mm7
        cmp             eax,        arg(4)              ; vs. max_sad
        ja              .sad8x16_finish

        ; first row of the pair
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1

        ; second row of the pair
        movq            mm2,        QWORD PTR [rsi+rbx]
        movq            mm3,        QWORD PTR [rdi+rdx]
        psadbw          mm2,        mm3

        paddw           mm7,        mm0
        paddw           mm7,        mm2

        ; step both pointers down two rows
        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        cmp             rsi,        rcx
        jne             .sad8x16_row_pair

        movq            rax,        mm7                 ; full-block SAD

.sad8x16_finish:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------------
; unsigned int vp8_sad8x8_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute differences over 8 rows of 8 bytes, with early exit.
;
; The fifth argument is read through arg(4) and used as the early-exit
; threshold; it is named max_sad here to match vp8_sad8x16_wmt.
;
; Register roles inside the loop:
;   rsi = current source row           rbx = src_stride
;   rdi = current reference row        rdx = ref_stride
;   rcx = src_ptr + 8*src_stride       (loop end sentinel)
;   mm7 = running SAD
;
; One row is processed per iteration. Before each row the running SAD is
; compared (unsigned) with the threshold; once it is above the threshold the
; partial sum is returned immediately. Result is in rax.
;
; MMX registers are used and no emms is issued here.
; NOTE(review): confirm callers clear the MMX state where required.
;-----------------------------------------------------------------------------
global sym(vp8_sad8x8_wmt) PRIVATE
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; rsi = src_ptr
        movsxd          rbx,        dword ptr arg(1)    ; rbx = src_stride
        mov             rdi,        arg(2)              ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; rdx = ref_stride

        pxor            mm7,        mm7                 ; SAD accumulator = 0
        lea             rcx,        [rsi+rbx*8]         ; end = src + 8 rows

.sad8x8_row:
        ; Early-out test: rax already holds the value to return if taken.
        movq            rax,        mm7
        cmp             eax,        arg(4)              ; vs. threshold
        ja              .sad8x8_finish

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1
        paddw           mm7,        mm0

        ; advance one row (flags are re-established by the cmp below)
        add             rsi,        rbx
        add             rdi,        rdx

        cmp             rsi,        rcx
        jne             .sad8x8_row

        movq            rax,        mm7                 ; full-block SAD
.sad8x8_finish:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------------
; unsigned int vp8_sad4x4_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; Sum of absolute differences over 4 rows of 4 bytes. Straight-line code.
;
; Two 4-byte rows are byte-interleaved into one 8-byte MMX register with
; punpcklbw. Source and reference are interleaved the same way, so a single
; psadbw covers two rows. Rows 0-1 and rows 2-3 are handled as two such
; pairs and their sums added.
;
; Returns the SAD in rax.
;
; MMX registers mm0..mm7 are used and no emms is issued here.
; NOTE(review): confirm callers clear the MMX state where required.
;-----------------------------------------------------------------------------
global sym(vp8_sad4x4_wmt) PRIVATE
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; rsi = src_ptr
        movsxd          rax,        dword ptr arg(1)    ; rax = src_stride
        mov             rdi,        arg(2)              ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; rdx = ref_stride

        ; ---- rows 0 and 1 ----------------------------------------------
        movd            mm0,        DWORD PTR [rsi]         ; src row 0
        movd            mm2,        DWORD PTR [rsi+rax]     ; src row 1
        movd            mm1,        DWORD PTR [rdi]         ; ref row 0
        movd            mm3,        DWORD PTR [rdi+rdx]     ; ref row 1

        punpcklbw       mm0,        mm2                     ; src rows 0+1
        punpcklbw       mm1,        mm3                     ; ref rows 0+1
        psadbw          mm0,        mm1

        ; step both pointers down to row 2
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        ; ---- rows 2 and 3 ----------------------------------------------
        movd            mm4,        DWORD PTR [rsi]         ; src row 2
        movd            mm6,        DWORD PTR [rsi+rax]     ; src row 3
        movd            mm5,        DWORD PTR [rdi]         ; ref row 2
        movd            mm7,        DWORD PTR [rdi+rdx]     ; ref row 3

        punpcklbw       mm4,        mm6                     ; src rows 2+3
        punpcklbw       mm5,        mm7                     ; ref rows 2+3
        psadbw          mm4,        mm5

        ; ---- combine both pair sums ------------------------------------
        paddw           mm0,        mm4
        movq            rax,        mm0                     ; return value

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------------
; unsigned int vp8_sad16x8_wmt(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride,
;     int            max_sad)
;
; Sum of absolute differences over 8 rows of 16 bytes, with early exit.
;
; The fifth argument is read through arg(4) and used as the early-exit
; threshold; it is named max_sad here to match vp8_sad8x16_wmt.
;
; Register roles inside the loop:
;   rsi = current source row           rbx = src_stride
;   rdi = current reference row        rdx = ref_stride
;   rcx = src_ptr + 8*src_stride       (loop end sentinel)
;   mm7 = running SAD
;
; Two rows are processed per iteration, each as a left and a right 8-byte
; half. Before each pair the running SAD is compared (unsigned) with the
; threshold; once it is above the threshold the partial sum is returned
; immediately. Result is in rax.
;
; MMX registers are used and no emms is issued here.
; NOTE(review): confirm callers clear the MMX state where required.
;-----------------------------------------------------------------------------
global sym(vp8_sad16x8_wmt) PRIVATE
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)              ; rsi = src_ptr
        movsxd          rbx,        dword ptr arg(1)    ; rbx = src_stride
        mov             rdi,        arg(2)              ; rdi = ref_ptr
        movsxd          rdx,        dword ptr arg(3)    ; rdx = ref_stride

        pxor            mm7,        mm7                 ; SAD accumulator = 0
        lea             rcx,        [rsi+rbx*8]         ; end = src + 8 rows

.sad16x8_row_pair:
        ; Early-out test: rax already holds the value to return if taken.
        movq            rax,        mm7
        cmp             eax,        arg(4)              ; vs. threshold
        ja              .sad16x8_finish

        ; ---- first row: left half, right half --------------------------
        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]
        psadbw          mm0,        mm1

        movq            mm2,        QWORD PTR [rsi+8]
        movq            mm3,        QWORD PTR [rdi+8]
        psadbw          mm2,        mm3

        paddw           mm0,        mm2                 ; mm0 = SAD of row 0

        ; ---- second row: left half, right half -------------------------
        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]
        psadbw          mm4,        mm5

        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]
        psadbw          mm1,        mm3

        paddw           mm4,        mm1                 ; mm4 = SAD of row 1

        ; ---- accumulate and step both pointers down two rows -----------
        paddw           mm7,        mm0
        paddw           mm7,        mm4

        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        cmp             rsi,        rcx
        jne             .sad16x8_row_pair

        movq            rax,        mm7                 ; full-block SAD

.sad16x8_finish:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret
3311b362b15af34006e6a11974088a46d42b903418eJohann
;-----------------------------------------------------------------------------
; void vp8_copy32xn_sse2(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *dst_ptr,
;     int  dst_stride,
;     int  height);
;
; Copies `height` rows of 32 bytes each from src_ptr to dst_ptr.
;
;   * Source rows are read with movdqu, so any source alignment works.
;   * Destination rows are written with movdqa, so dst_ptr and dst_stride
;     must keep every destination row 16-byte aligned.
;
; Register roles:
;   rsi = current source row           rax = src_stride
;   rdi = current destination row      rdx = dst_stride
;   rcx = rows still to copy
;
; Rows are copied four at a time while at least four remain, then one at a
; time. The unrolled loop is only entered when height >= 4, and the
; single-row loop only when rows remain, so a height of 1..3 copies exactly
; that many rows and height <= 0 copies nothing. (Previously the unrolled
; loop ran unconditionally, which over-copied for small heights and left a
; negative counter that the tail loop could not bring back to zero.)
;-----------------------------------------------------------------------------
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;dst_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;dst_stride
        movsxd          rcx,        dword ptr arg(4) ;height

        ; Fewer than four rows requested: skip the unrolled loop entirely.
        cmp             rcx,     4
        jl              .block_copy_sse2_tail

.block_copy_sse2_loopx4:
        ; Load four source rows (2 x 16 bytes each) into xmm0..xmm7.
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        movdqu          xmm4,       XMMWORD PTR [rsi]
        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,    [rsi+rax*2]

        ; Store the four rows to the (aligned) destination.
        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        movdqa          XMMWORD PTR [rdi + rdx], xmm2
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

        lea             rdi,    [rdi+rdx*2]

        movdqa          XMMWORD PTR [rdi], xmm4
        movdqa          XMMWORD PTR [rdi + 16], xmm5
        movdqa          XMMWORD PTR [rdi + rdx], xmm6
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

        lea             rdi,    [rdi+rdx*2]

        sub             rcx,     4
        cmp             rcx,     4
        jge             .block_copy_sse2_loopx4

.block_copy_sse2_tail:
        ; rcx = rows left over (0..3 after the unrolled loop, or the original
        ; height when it was below 4). Signed test so that a zero or negative
        ; count never enters the single-row loop.
        cmp             rcx, 0
        jle             .copy_is_done

.block_copy_sse2_loop:
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        lea             rsi,    [rsi+rax]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        lea             rdi,    [rdi+rdx]

        sub             rcx,     1
        jne             .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
411