;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%include "vpx_ports/x86_abi_support.asm"
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
; unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Sum of squares of the 256 consecutive 16-bit values starting at
; src_ptr (16 loop passes, 32 bytes = 16 values per pass).
;
; In:    arg(0) = src_ptr
; Out:   rax    = sum[0] + sum[1], the two 32-bit partial sums held in
;                 mm4, each sign-extended before the final add
; Uses:  rax, rcx, mm0-mm4, flags, 8 bytes of stack scratch
;
; NOTE(review): no emms is issued before returning; presumably the
; caller is responsible for clearing MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_get_mb_ss_mmx)
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): the prototype above takes a single argument, yet 7 are
    ; shadowed here. Left unchanged because the macro is defined outside
    ; this file -- confirm whether 1 would be sufficient.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                      ; scratch qword for the final reduction
    ; end prolog

        mov         rax, arg(0)             ; rax = src_ptr
        mov         rcx, 16                 ; rcx = passes remaining
        pxor        mm4, mm4                ; mm4 = two running 32-bit partial sums

NEXTROW:
        ; Load 16 words (32 bytes).
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]

        ; Square each word and add adjacent products into dwords.
        pmaddwd     mm0, mm0
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        ; Fold the four results into the running partial sums.
        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32                 ; advance to the next 16 words
        dec         rcx
        ; dec leaves CF untouched, so the previous "ja" (CF=0 and ZF=0)
        ; depended on the add above never carrying. jnz tests only ZF,
        ; which is the flag dec actually produces.
        jnz         NEXTROW

        movq        QWORD PTR [rsp], mm4    ; spill both partial sums to the stack

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
; VP8_GET8X8VAR_ROW_MMX load_next_ref
;
; One 8-pixel row of the 8x8 variance kernel below.
;
; On entry:  rax -> current source row, rbx -> current reference row,
;            mm1 =  the 8 reference bytes of the current row,
;            mm6 =  0, rcx / rdx = source / reference stride.
; On exit:   rax and rbx advanced by one stride each,
;            mm5 += differences (four 16-bit lanes),
;            mm7 += squared differences (two 32-bit lanes),
;            mm1 =  next reference row when load_next_ref is non-zero.
; Clobbers:  mm0, mm2, mm3, flags.
;-----------------------------------------------------------------------
%macro VP8_GET8X8VAR_ROW_MMX 1
        movq        mm0, [rax]                  ; 8 source bytes
        movq        mm2, mm0                    ; second copy for the high half
        movq        mm3, mm1                    ; second copy of the reference bytes

        punpcklbw   mm0, mm6                    ; widen low 4 bytes to words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; widen high 4 bytes to words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref, low half
        psubsw      mm2, mm3                    ; src - ref, high half

        paddw       mm5, mm0                    ; running sum of differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; squared differences, pairwise added
        pmaddwd     mm2, mm2
        add         rbx, rdx                    ; step reference pointer one row
        add         rax, rcx                    ; step source pointer one row
%if %1
        movq        mm1, [rbx]                  ; fetch reference bytes for the next row
%endif
        paddd       mm7, mm0                    ; running sum of squared differences
        paddd       mm7, mm2
%endmacro

;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks 8 rows of 8 bytes from src_ptr and ref_ptr, stores the sum of
; the byte differences to *Sum and the sum of their squares to *SSE,
; and returns 0.
global sym(vp8_get8x8var_mmx)
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                         ; two scratch qwords for the reduction
    ; end prolog


        pxor        mm5, mm5                    ; mm5 = difference accumulator
        pxor        mm6, mm6                    ; mm6 = zero, used for byte->word unpack
        pxor        mm7, mm7                    ; mm7 = squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        movq        mm1, [rbx]                  ; reference bytes for the first row

        ; Rows 1-7: each one also fetches the reference bytes of the row after it.
%rep 7
        VP8_GET8X8VAR_ROW_MMX 1
%endrep

        ; Row 8: last row, nothing further to fetch.
        VP8_GET8X8VAR_ROW_MMX 0

        ; Spill both accumulators and reduce them with scalar code.
        movq        QWORD PTR [rsp], mm7        ; two 32-bit squared-difference sums
        movq        QWORD PTR [rsp+8], mm5      ; four 16-bit difference sums

        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         dword ptr [rsi], eax

        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+12]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+14]
        add         rdx, rcx    ;XSum
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rdi], edx

        xor         rax, rax    ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;unsigned int
;vp8_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks four rows of four pixels from src_ptr and ref_ptr and accumulates
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; The value returned in rax is always 0.
;
; Register roles:
;   rax / rbx   current row in src_ptr / ref_ptr
;   rcx / rdx   source_stride / recon_stride (sign-extended)
;   mm5         four word lanes accumulating the differences
;   mm6         zero, used to widen bytes to words
;   mm7         two dword lanes accumulating the squared differences
;
; Rows are fetched with movd (4 bytes): punpcklbw only consumes the low four
; bytes, so a wider load would read past the end of each row for no benefit.
global sym(vp8_get4x4var_mmx)
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                         ; 16 bytes of scratch for the final reduction
    ; end prolog


        pxor        mm5, mm5                    ; clear difference accumulator
        pxor        mm6, mm6                    ; zero register for unpacking
        pxor        mm7, mm7                    ; clear squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Row 1
        movd        mm0, [rax]                  ; four src pixels
        movd        mm1, [rbx]                  ; four ref pixels
        punpcklbw   mm0, mm6                    ; widen bytes to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref, four word lanes
        paddw       mm5, mm0                    ; accumulate differences in mm5
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        add         rbx,rdx                     ; next ref row
        add         rax,rcx                     ; next src row
        movd        mm1, [rbx]                  ; preload ref pixels for row 2
        paddd       mm7, mm0                    ; accumulate squares in mm7


        ; Row 2
        movd        mm0, [rax]                  ; four src pixels
        punpcklbw   mm0, mm6                    ; widen bytes to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        paddw       mm5, mm0                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        add         rbx,rdx                     ; next ref row
        add         rax,rcx                     ; next src row
        movd        mm1, [rbx]                  ; preload ref pixels for row 3
        paddd       mm7, mm0                    ; accumulate squares in mm7

        ; Row 3
        movd        mm0, [rax]                  ; four src pixels
        punpcklbw   mm0, mm6                    ; widen bytes to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        paddw       mm5, mm0                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        add         rbx,rdx                     ; next ref row
        add         rax,rcx                     ; next src row
        movd        mm1, [rbx]                  ; preload ref pixels for row 4
        paddd       mm7, mm0                    ; accumulate squares in mm7

        ; Row 4 (last row: pointers are not advanced again)
        movd        mm0, [rax]                  ; four src pixels

        punpcklbw   mm0, mm6                    ; widen bytes to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref

        paddw       mm5, mm0                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0                    ; accumulate squares in mm7


        ; Reduce the lane accumulators to scalars via the stack scratch area.
        movq        QWORD PTR [rsp+8], mm5      ; four word lanes of differences
        movq        QWORD PTR [rsp], mm7        ; two dword lanes of squares
        movsx       rdx, WORD PTR [rsp+8]       ; sign-extend each difference lane
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax        ; *SSE = XXSum
        mov         dword ptr [rdi], edx        ; *Sum = XSum
        xor         rax, rax    ; return 0


    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns (in the low 32 bits of rax) the sum of squared differences between
; four rows of four pixels at src_ptr and ref_ptr.
;
; Register roles:
;   rax / rbx   current row in src_ptr / ref_ptr
;   rcx / rdx   source_stride / recon_stride (sign-extended)
;   mm6         zero, used to widen bytes to words
;   mm7         two dword lanes accumulating the squared differences
global sym(vp8_get4x4sse_cs_mmx)
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push rsi
    push rdi
    push rbx
    ; end prolog


        mov         rax, arg(0) ;[src_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        pxor        mm6, mm6                    ; zero register for unpacking
        pxor        mm7, mm7                    ; clear squared-difference accumulator

        ; Row 1
        movd        mm0, [rax]                  ; four src pixels
        movd        mm1, [rbx]                  ; four ref pixels
        add         rbx, rdx                    ; step ref to row 2
        add         rax, rcx                    ; step src to row 2
        punpcklbw   mm0, mm6                    ; widen bytes to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0                    ; fold into accumulator

        ; Row 2
        movd        mm0, [rax]                  ; four src pixels
        movd        mm1, [rbx]                  ; four ref pixels
        add         rbx, rdx                    ; step ref to row 3
        add         rax, rcx                    ; step src to row 3
        punpcklbw   mm0, mm6
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0                    ; fold into accumulator

        ; Row 3
        movd        mm0, [rax]                  ; four src pixels
        movd        mm1, [rbx]                  ; four ref pixels
        add         rbx, rdx                    ; step ref to row 4
        add         rax, rcx                    ; step src to row 4
        punpcklbw   mm0, mm6
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0                    ; fold into accumulator

        ; Row 4 (last row: pointers are left where they are)
        movd        mm0, [rax]                  ; four src pixels
        movd        mm1, [rbx]                  ; four ref pixels
        punpcklbw   mm0, mm6
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
        paddd       mm7, mm0                    ; fold into accumulator

        ; Add the two dword lanes together; the total ends up in the low dword.
        movq        mm0,    mm7
        psrlq       mm7,    32                  ; bring the high lane down
        paddd       mm0,    mm7
        movq        rax,    mm0                 ; return value (low dword holds the SSE)


    ; begin epilog
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
%define mmx_filter_shift            7

;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal filter followed by a two-tap vertical filter to
; ref_ptr, subtracts the corresponding four src_ptr pixels on each of four
; output rows, and stores
;   *sum        = sum of the differences
;   *sumsquared = sum of the squared differences
;
; Each filter pointer is read as two groups of four words: [p] multiplies the
; first tap and [p+8] the second. After each pass the constant at mmx_bi_rd is
; added and the result is shifted right arithmetically by mmx_filter_shift.
; NOTE(review): mmx_bi_rd is defined outside this block; presumably it is the
; rounding term 1 << (mmx_filter_shift - 1) in every word - confirm.
;
; Register roles:
;   rsi / rdi   current row in ref_ptr / src_ptr
;   rax / rdx   HFilter / VFilter
;   rcx         rows remaining
;   r8  / r9    (64-bit ABI only) ref / src stride, loaded once before the loop
;   mm0         zero, used to widen bytes to words
;   mm5         horizontally filtered previous row
;   mm6         four word lanes accumulating the differences
;   mm7         two dword lanes accumulating the squared differences
global sym(vp8_filter_block2d_bil4x4_var_mmx)
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16                         ; stack scratch (not referenced below)
    ; end prolog


        pxor            mm6,            mm6                 ; clear difference accumulator
        pxor            mm7,            mm7                 ; clear squared-difference accumulator

        mov             rax,            arg(4) ;HFilter
        mov             rdx,            arg(5) ;VFilter

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr

        mov             rcx,            4                   ; four output rows
        pxor            mm0,            mm0                 ; zero register for unpacking

        ; Horizontal pass over the first reference row; result kept in mm5.
        movd            mm1,            [rsi]               ; ref[0..3]
        movd            mm3,            [rsi+1]             ; ref[1..4]

        punpcklbw       mm1,            mm0                 ;
        pmullw          mm1,            [rax]               ; * first horizontal tap

        punpcklbw       mm3,            mm0                 ;
        pmullw          mm3,            [rax+8]             ; * second horizontal tap

        paddw           mm1,            mm3                 ;
        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding constant

        psraw           mm1,            mmx_filter_shift    ;
        movq            mm5,            mm1

        ; Step to the second reference row. On the 64-bit ABI both strides are
        ; sign-extended once here and kept in r8/r9 for the whole loop.
%if ABI_IS_32BIT
        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
%else
        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
        movsxd          r9, dword ptr  arg(3) ;src_pixels_per_line    ;
        add             rsi, r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:

        ; Horizontal pass over the current reference row.
        movd            mm1,            [rsi]               ; ref[0..3]
        movd            mm3,            [rsi+1]             ; ref[1..4]

        punpcklbw       mm1,            mm0                 ;
        pmullw          mm1,            [rax]               ; * first horizontal tap

        punpcklbw       mm3,            mm0                 ;
        pmullw          mm3,            [rax+8]             ; * second horizontal tap

        paddw           mm1,            mm3                 ;
        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding constant

        psraw           mm1,            mmx_filter_shift    ;
        movq            mm3,            mm5                 ; mm3 = previous filtered row

        ; Vertical pass between the previous row (mm3) and this row (mm1).
        movq            mm5,            mm1                 ; keep this row for the next iteration
        pmullw          mm3,            [rdx]               ; * first vertical tap

        pmullw          mm1,            [rdx+8]             ; * second vertical tap
        paddw           mm1,            mm3                 ;


        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add rounding constant
        psraw           mm1,            mmx_filter_shift    ;

        ; Difference against the source row, then accumulate.
        movd            mm3,            [rdi]               ; src[0..3]
        punpcklbw       mm3,            mm0                 ;

        psubw           mm1,            mm3                 ; filtered ref - src
        paddw           mm6,            mm1                 ; accumulate differences

        pmaddwd         mm1,            mm1                 ; square and add adjacent pairs
        paddd           mm7,            mm1                 ; accumulate squares

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
%else
        add             rsi,            r8                  ; next ref row
        add             rdi,            r9                  ; next src row
%endif
        sub             rcx,            1                   ;
        jnz             .filter_block2d_bil4x4_var_mmx_loop ;


        ; Reduce mm6: move each word into the top half of a dword, add the
        ; lanes, then shift right arithmetically by 16 to get a sign-extended
        ; total.
        pxor            mm3,            mm3                 ;
        pxor            mm2,            mm2                 ;

        punpcklwd       mm2,            mm6                 ;
        punpckhwd       mm3,            mm6                 ;

        paddd           mm2,            mm3                 ;
        movq            mm6,            mm2                 ;

        psrlq           mm6,            32                  ;
        paddd           mm2,            mm6                 ;

        psrad           mm2,            16                  ; low dword = sum of differences
        movq            mm4,            mm7                 ;

        ; Reduce mm7: add the high dword lane to the low one.
        psrlq           mm4,            32                  ;
        paddd           mm4,            mm7                 ; low dword = sum of squares

        mov             rdi,            arg(6) ;sum
        mov             rsi,            arg(7) ;sumsquared

        movd            dword ptr [rdi],          mm2                 ; *sum
        movd            dword ptr [rsi],          mm4                 ; *sumsquared



    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
65490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Processes an 8-pixel-wide strip, Height rows tall.  Each reference row is
; filtered horizontally with two taps (pixel x and pixel x+1), the result is
; blended vertically with the previously filtered row, and the corresponding
; row at src_ptr is subtracted.  The differences are accumulated and the
; totals are stored to *sum and *sumsquared.
;
; Filter layout as read here: 4 words of tap 0 at [filter], 4 words of tap 1
; at [filter+8].  Every tap sum has mmx_bi_rd added and is shifted right by
; mmx_filter_shift (defined outside this block).
;
; Memory read: Height+1 rows of 9 bytes from ref_ptr, Height rows of 8 bytes
; from src_ptr.  Height is expected to be non-zero; the row loop decrements
; before testing.
;
; Register roles:
;   rax = HFilter            rdx = VFilter
;   rsi = current ref row    rdi = current src row
;   rcx = rows remaining     r8/r9 = ref/src stride (non-32-bit builds only)
;   mm0 = zero               mm5 = previous h-filtered row (packed bytes)
;   mm6 = 4 x 16-bit sums of differences
;   mm7 = 2 x 32-bit sums of squared differences
;
; No emms is executed in this routine.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_mmx)
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        pxor            mm6,            mm6                 ; clear difference sums
        pxor            mm7,            mm7                 ; clear squared-difference sums
        mov             rax,            arg(5) ;HFilter

        mov             rdx,            arg(6) ;VFilter
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height

        ; Horizontally filter the first reference row; it only seeds mm5
        ; and produces no output of its own.
        pxor            mm0,            mm0                 ; zero for byte -> word unpack
        movq            mm1,            [rsi]               ; pixels x .. x+7

        movq            mm3,            [rsi+1]             ; pixels x+1 .. x+8
        movq            mm2,            mm1

        movq            mm4,            mm3
        punpcklbw       mm1,            mm0                 ; low four pixels as words

        punpckhbw       mm2,            mm0                 ; high four pixels as words
        pmullw          mm1,            [rax]               ; * horizontal tap 0

        pmullw          mm2,            [rax]
        punpcklbw       mm3,            mm0

        punpckhbw       mm4,            mm0
        pmullw          mm3,            [rax+8]             ; * horizontal tap 1

        pmullw          mm4,            [rax+8]
        paddw           mm1,            mm3

        paddw           mm2,            mm4
        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; round

        psraw           mm1,            mmx_filter_shift
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm2,            mmx_filter_shift
        movq            mm5,            mm1

        packuswb        mm5,            mm2                 ; mm5 = filtered row, packed to bytes
%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
%else
        ; Both strides are loop invariant: load them once and keep them in
        ; r8/r9 for the whole row loop.
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        add             rsi,            r8
%endif

.next_row:

        ; Horizontal pass on the current reference row.
        movq            mm1,            [rsi]
        movq            mm3,            [rsi+1]

        movq            mm2,            mm1
        movq            mm4,            mm3

        punpcklbw       mm1,            mm0
        punpckhbw       mm2,            mm0

        pmullw          mm1,            [rax]
        pmullw          mm2,            [rax]

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        pmullw          mm3,            [rax+8]
        pmullw          mm4,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift

        paddw           mm2,            [GLOBAL(mmx_bi_rd)]
        psraw           mm2,            mmx_filter_shift

        ; Vertical pass: previous filtered row (mm5) with vertical tap 0,
        ; current filtered row (mm1/mm2) with vertical tap 1.
        movq            mm3,            mm5
        movq            mm4,            mm5

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        movq            mm5,            mm1                 ; current row becomes "previous"
        packuswb        mm5,            mm2                 ; for the next iteration

        pmullw          mm3,            [rdx]
        pmullw          mm4,            [rdx]

        pmullw          mm1,            [rdx+8]
        pmullw          mm2,            [rdx+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        psraw           mm2,            mmx_filter_shift

        ; Difference against the source row, then accumulate.
        movq            mm3,            [rdi]
        movq            mm4,            mm3

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        psubw           mm1,            mm3                 ; filtered - src
        psubw           mm2,            mm4

        paddw           mm6,            mm1                 ; sum += diff
        pmaddwd         mm1,            mm1                 ; diff * diff, pairs added

        paddw           mm6,            mm2
        pmaddwd         mm2,            mm2

        paddd           mm7,            mm1                 ; sumsquared += diff^2
        paddd           mm7,            mm2

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line
%else
        add             rsi,            r8
        add             rdi,            r9
%endif
        sub             rcx,            1
        jnz             .next_row


        ; Reduce the four signed 16-bit lanes of mm6 to one 32-bit total.
        ; Each lane is widened to 32 bits before the adds so the total is
        ; not wrapped to 16 bits.
        pxor            mm3,            mm3
        pxor            mm2,            mm2

        punpcklwd       mm2,            mm6                 ; lanes 0,1 -> upper halves of dwords
        punpckhwd       mm3,            mm6                 ; lanes 2,3 -> upper halves of dwords

        psrad           mm2,            16                  ; sign-extend lanes 0,1
        psrad           mm3,            16                  ; sign-extend lanes 2,3

        paddd           mm2,            mm3
        movq            mm6,            mm2

        psrlq           mm6,            32
        paddd           mm2,            mm6                 ; low dword = total sum

        ; Fold the two 32-bit halves of the squared-difference accumulator.
        movq            mm4,            mm7
        psrlq           mm4,            32

        paddd           mm4,            mm7                 ; low dword = total sum of squares

        mov             rdi,            arg(7) ;sum
        mov             rsi,            arg(8) ;sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
84590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
;unsigned int vp8_get16x16pred_error_mmx
;(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride
;)
;
; Walks a 16x16 block of bytes at src_ptr and ref_ptr, accumulating
;   Sum = sum of (src - ref)   and   SSE = sum of (src - ref)^2
; and returns SSE - ((Sum * Sum) >> 8).
;
; Register roles:
;   rsi = current src row    rax = src_stride
;   rdi = current ref row    rdx = ref_stride
;   rcx = rows remaining
;   mm0 = zero
;   mm7 = 4 x 16-bit sums of differences
;   mm6 = 2 x 32-bit sums of squared differences
;
; Uses 8 of the 16 bytes of stack locals as scratch to move Sum and SSE
; from the MMX registers into general registers.
; No emms is executed in this routine.
;-----------------------------------------------------------------------
global sym(vp8_get16x16pred_error_mmx)
sym(vp8_get16x16pred_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        mov         rsi,            arg(0) ;src_ptr
        mov         rdi,            arg(2) ;ref_ptr

        movsxd      rax,            DWORD PTR arg(1) ;src_stride
        movsxd      rdx,            DWORD PTR arg(3) ;ref_stride

        pxor        mm0,            mm0                     ; clear mm0 for unpack
        pxor        mm7,            mm7                     ; clear mm7 for accumulating diffs

        pxor        mm6,            mm6                     ; clear mm6 for accumulating sse
        mov         rcx,            16                      ; 16 rows

.var16loop:

        ; Left eight pixels of the row.
        movq        mm1,            [rsi]
        movq        mm2,            [rdi]

        movq        mm3,            mm1
        movq        mm4,            mm2

        punpcklbw   mm1,            mm0                     ; src pixels 0..3 as words
        punpckhbw   mm3,            mm0                     ; src pixels 4..7 as words

        punpcklbw   mm2,            mm0                     ; ref pixels 0..3 as words
        punpckhbw   mm4,            mm0                     ; ref pixels 4..7 as words

        psubw       mm1,            mm2                     ; src - ref
        psubw       mm3,            mm4

        paddw       mm7,            mm1                     ; Sum += diff
        pmaddwd     mm1,            mm1                     ; diff * diff, pairs added

        paddw       mm7,            mm3
        pmaddwd     mm3,            mm3

        paddd       mm6,            mm1                     ; SSE += diff^2
        paddd       mm6,            mm3


        ; Right eight pixels of the row.
        movq        mm1,            [rsi+8]
        movq        mm2,            [rdi+8]

        movq        mm3,            mm1
        movq        mm4,            mm2

        punpcklbw   mm1,            mm0
        punpckhbw   mm3,            mm0

        punpcklbw   mm2,            mm0
        punpckhbw   mm4,            mm0

        psubw       mm1,            mm2
        psubw       mm3,            mm4

        paddw       mm7,            mm1
        pmaddwd     mm1,            mm1

        paddw       mm7,            mm3
        pmaddwd     mm3,            mm3

        paddd       mm6,            mm1
        paddd       mm6,            mm3

        add         rsi,            rax                     ; next src row
        add         rdi,            rdx                     ; next ref row

        sub         rcx,            1
        jnz         .var16loop


        ; Horizontal reduction.  mm1 takes over the SSE accumulator so that
        ; mm6/mm5 can be used to sign-extend the four 16-bit Sum lanes.
        movq        mm1,            mm6
        pxor        mm6,            mm6

        pxor        mm5,            mm5
        punpcklwd   mm6,            mm7                     ; lanes 0,1 -> upper halves of dwords

        punpckhwd   mm5,            mm7                     ; lanes 2,3 -> upper halves of dwords
        psrad       mm5,            16                      ; sign-extend

        psrad       mm6,            16                      ; sign-extend
        paddd       mm6,            mm5

        movq        mm2,            mm1
        psrlq       mm1,            32

        paddd       mm2,            mm1                     ; low dword of mm2 = SSE
        movq        mm7,            mm6

        psrlq       mm6,            32
        paddd       mm6,            mm7                     ; low dword of mm6 = Sum

        movd DWORD PTR [rsp],       mm6  ;Sum
        movd DWORD PTR [rsp+4],     mm2  ;SSE

        ; return (SSE-((Sum*Sum)>>8));
        movsxd      rdx, dword ptr [rsp]
        imul        rdx, rdx
        ; Sum*Sum is never negative, so shift it as an unsigned value.
        ; |Sum| can reach 16*16*255 = 65280, whose square is larger than
        ; 0x7fffffff; when the register is only 32 bits wide (presumably the
        ; case in ABI_IS_32BIT builds - confirm against x86_abi_support.asm)
        ; an arithmetic shift would smear the top bit into the result.
        shr         rdx, 8
        movsxd      rax, dword ptr [rsp + 4]
        sub         rax, rdx


    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
97490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
; Four identical 16-bit words, one per MMX word lane.  The bilinear filter
; code adds this to each tap sum immediately before shifting right by
; mmx_filter_shift.
align 16
mmx_bi_rd:
    dw 64, 64, 64, 64
982