190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%include "vpx_ports/x86_abi_support.asm"
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian%define mmx_filter_shift            7
157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
;-----------------------------------------------------------------------
; unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
;
; Sum of squares of the 256 signed 16-bit values starting at src_ptr
; (16 loop iterations, 32 bytes = 16 words consumed per iteration).
;
; In:    arg(0) = src_ptr
; Out:   rax    = sum[0] + sum[1], the two 32-bit partial sums of mm4
; Uses:  rax, rcx, mm0-mm4, flags; 8 bytes of stack scratch at [rsp]
;-----------------------------------------------------------------------
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): only arg(0) is read below; the argument count of 7 is
    ; kept exactly as it was -- confirm whether a smaller count suffices.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 8                  ; scratch qword used to spill mm4
    ; end prolog

        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16             ; rcx = iterations remaining
        pxor        mm4, mm4            ; mm4 = two 32-bit running sums

.NEXTROW:
        movq        mm0, [rax]          ; 4 words per register, 16 per pass
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0            ; square each word, add adjacent pairs
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32             ; advance to the next 16 words
        dec         rcx
        ; dec updates ZF but leaves CF untouched, so branch on ZF alone
        ; instead of a condition (ja) that also reads the stale carry
        ; left behind by the pointer add above.
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx

    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;-----------------------------------------------------------------------
; void vpx_get8x8var_mmx
; (
;     unsigned char *src_ptr,
;     int  source_stride,
;     unsigned char *ref_ptr,
;     int  recon_stride,
;     unsigned int *SSE,
;     int *Sum
; )
;
; Walks eight rows of eight bytes from src_ptr and ref_ptr and stores
;   *SSE = sum of (src - ref)^2
;   *Sum = sum of (src - ref)
;
; Register roles inside the row code:
;   rax = current src row          rcx = source_stride (sign-extended)
;   rbx = current ref row          rdx = recon_stride  (sign-extended)
;   mm5 = four 16-bit lanes accumulating the differences
;   mm6 = zero, interleaved with bytes to widen them to words
;   mm7 = two 32-bit lanes accumulating the squared differences
;-----------------------------------------------------------------------
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                 ; scratch: [rsp] <- mm7, [rsp+8] <- mm5
    ; end prolog

        pxor        mm7, mm7                    ; squared-difference sums = 0
        pxor        mm6, mm6                    ; constant zero for unpacking
        pxor        mm5, mm5                    ; difference sums = 0

        mov         rax, arg(0)                 ; src_ptr
        movsxd      rcx, dword ptr arg(1)       ; source_stride
        mov         rbx, arg(2)                 ; ref_ptr
        movsxd      rdx, dword ptr arg(3)       ; recon_stride

        ; One copy of the row kernel is emitted per row (8 in total).
%rep 8
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm1, [rbx]                  ; eight ref bytes
        movq        mm2, mm0                    ; keep copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; src bytes 0-3 -> words
        punpckhbw   mm2, mm6                    ; src bytes 4-7 -> words
        punpcklbw   mm1, mm6                    ; ref bytes 0-3 -> words
        punpckhbw   mm3, mm6                    ; ref bytes 4-7 -> words

        psubsw      mm0, mm1                    ; src - ref, low four
        psubsw      mm2, mm3                    ; src - ref, high four

        paddw       mm5, mm0                    ; add differences to mm5
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, pairwise add to dwords
        pmaddwd     mm2, mm2
        paddd       mm7, mm0                    ; add squares to mm7
        paddd       mm7, mm2

        add         rax, rcx                    ; step src to the next row
        add         rbx, rdx                    ; step ref to the next row
%endrep

        ; Spill both accumulators and reduce them with scalar adds.
        movq        QWORD PTR [rsp], mm7        ; two dword partial sums
        movq        QWORD PTR [rsp+8], mm5      ; four word partial sums

        mov         rsi, arg(4)                 ; SSE
        mov         rdi, arg(5)                 ; Sum

        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx                    ; XXSum

        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+12]
        add         rdx, rcx
        movsx       rcx, WORD PTR [rsp+14]
        add         rdx, rcx                    ; XSum

        mov         dword ptr [rsi], eax        ; *SSE = XXSum
        mov         dword ptr [rdi], edx        ; *Sum = XSum
        xor         rax, rax                    ; leave 0 in the return register

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;void
3097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;vpx_get4x4var_mmx
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  source_stride,
31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *ref_ptr,
31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int  recon_stride,
31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned int *SSE,
31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int *Sum
31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
;
; Computes, over a 4x4 block of bytes, the sum of (src - ref) and the sum of
; (src - ref)^2, and stores them to *Sum and *SSE respectively.
;
; Register roles in the body:
;   rax = current src row pointer      rcx = source_stride (sign-extended)
;   rbx = current ref row pointer      rdx = recon_stride  (sign-extended)
;   mm5 = four 16-bit running sums of the differences
;   mm6 = zero (used to widen bytes to words)
;   mm7 = two 32-bit running sums of the squared differences
;
; arg(n), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come from
; vpx_ports/x86_abi_support.asm; arg(n) is used here to read the n-th C
; argument (presumably relative to rbp -- confirm against that include).
;
; NOTE(review): no emms is executed in this routine; presumably the caller
; is responsible for clearing MMX state -- confirm.
3187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianglobal sym(vpx_get4x4var_mmx) PRIVATE
3197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramaniansym(vpx_get4x4var_mmx):
32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 6
32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push rsi
32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push rdi
32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push rbx                                    ; rbx is used below as the ref row pointer
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub         rsp, 16                         ; 16 bytes of scratch: [rsp] <- mm7, [rsp+8] <- mm5
32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm5, mm5                    ; mm5 = 0: four 16-bit running sums of (src - ref)
33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm6, mm6                    ; mm6 = 0: zero operand for byte -> word unpacking
33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7, mm7                    ; mm7 = 0: two 32-bit running sums of squared differences
33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rbx, arg(2) ;[ref_ptr]
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx, dword ptr arg(1) ;[source_stride]
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Row 1
339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        movd        mm0, [rax]                  ; Copy four src bytes to mm0
340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        movd        mm1, [rbx]                  ; Copy four ref bytes to mm1
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm6                    ; unpack to higher precision (bytes -> words)
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm6
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsw      mm0, mm1                    ; mm0 = src - ref per word (operands are 0..255, so no saturation occurs)
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5, mm0                    ; accumulate differences in mm5
34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmaddwd     mm0, mm0                    ; square each word; adjacent pairs are summed into two dwords
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rbx,rdx                     ; Inc pointer into ref data
34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rax,rcx                     ; Inc pointer into the new data
348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        movd        mm1, [rbx]                  ; Copy four ref bytes of the next row to mm1
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd       mm7, mm0                    ; accumulate squared differences in mm7
35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Row 2
352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        movd        mm0, [rax]                  ; Copy four src bytes to mm0
35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm6                    ; unpack to higher precision (bytes -> words)
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm6
35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsw      mm0, mm1                    ; mm0 = src - ref per word
35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5, mm0                    ; accumulate differences in mm5
35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmaddwd     mm0, mm0                    ; square and pairwise-add into two dwords
35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rbx,rdx                     ; Inc pointer into ref data
36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rax,rcx                     ; Inc pointer into the new data
361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        movd        mm1, [rbx]                  ; Copy four ref bytes of the next row to mm1
36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd       mm7, mm0                    ; accumulate squared differences in mm7
36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Row 3
365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        movd        mm0, [rax]                  ; Copy four src bytes to mm0
366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        punpcklbw   mm0, mm6                    ; unpack to higher precision (bytes -> words)
36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm6
36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsw      mm0, mm1                    ; mm0 = src - ref per word
36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5, mm0                    ; accumulate differences in mm5
37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmaddwd     mm0, mm0                    ; square and pairwise-add into two dwords
37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rbx,rdx                     ; Inc pointer into ref data
37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rax,rcx                     ; Inc pointer into the new data
374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        movd        mm1, [rbx]                  ; Copy four ref bytes of the next row to mm1
37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd       mm7, mm0                    ; accumulate squared differences in mm7
37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Row 4 (last row: pointers are not advanced again)
378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian        movd        mm0, [rax]                  ; Copy four src bytes to mm0
37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0, mm6                    ; unpack to higher precision (bytes -> words)
38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1, mm6
38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubsw      mm0, mm1                    ; mm0 = src - ref per word
38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5, mm0                    ; accumulate differences in mm5
38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmaddwd     mm0, mm0                    ; square and pairwise-add into two dwords
38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd       mm7, mm0                    ; accumulate squared differences in mm7
38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        ; Now accumulate the final results.
        ; The MMX accumulators are spilled to the stack scratch area and
        ; reduced with scalar adds.
39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD PTR [rsp+8], mm5      ; [rsp+8..15] = four 16-bit partial sums of differences
39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD PTR [rsp], mm7        ; [rsp..7]    = two 32-bit partial sums of squares
39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsx       rdx, WORD PTR [rsp+8]       ; sign-extend each 16-bit partial sum (differences may be negative)
39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsx       rcx, WORD PTR [rsp+10]
39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsx       rbx, WORD PTR [rsp+12]
39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsx       rax, WORD PTR [rsp+14]
39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdx, rcx
39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rbx, rax
39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rdx, rbx    ;XSum  (rdx = total of all 16 differences)
39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rax, DWORD PTR [rsp]
40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd      rcx, DWORD PTR [rsp+4]
40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         rax, rcx    ;XXSum (rax = total of all 16 squared differences)
40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rsi, arg(4) ;SSE
40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         rdi, arg(5) ;Sum
40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         dword ptr [rsi], eax        ; *SSE = low 32 bits of XXSum
40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         dword ptr [rdi], edx        ; *Sum = low 32 bits of XSum
40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xor         rax, rax    ; return 0 (the prototype comment above says void; rax is cleared regardless)
40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 16                                 ; release the 16-byte scratch area
41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rbx                                     ; restore in reverse order of the prolog pushes
41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;void vpx_filter_block2d_bil4x4_var_mmx
41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *ref_ptr,
42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int ref_pixels_per_line,
42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int src_pixels_per_line,
42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned short *HFilter,
42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned short *VFilter,
42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int *sum,
42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned int *sumsquared
42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
;
; Applies a 2-tap horizontal filter followed by a 2-tap vertical filter to a
; 4-pixel-wide region of ref_ptr, subtracts the corresponding 4x4 block of
; src_ptr, and stores the sum of the differences to *sum and the sum of the
; squared differences to *sumsquared.
;
; Memory read by the code below:
;   ref_ptr : 5 rows (one before the loop + 4 in the loop), 5 bytes per row
;             (4 bytes at [rsi] and 4 bytes at [rsi+1]).
;   src_ptr : 4 rows of 4 bytes.
;   HFilter / VFilter : 8 bytes at +0 and 8 bytes at +8, used as four 16-bit
;             multipliers each (presumably each tap replicated across the
;             four lanes -- confirm against the filter tables).
;
; Register roles:
;   rax = HFilter, rdx = VFilter, rsi = ref row pointer, rdi = src row pointer
;   rcx = remaining row count (starts at 4)
;   mm0 = zero, mm5 = horizontally filtered previous row
;   mm6 = four 16-bit running sums of differences
;   mm7 = two 32-bit running sums of squared differences
;
; Each filter pass adds [GLOBAL(mmx_bi_rd)] and shifts right arithmetically by
; mmx_filter_shift (defined as 7 at the top of this file). mmx_bi_rd is
; defined outside this view; presumably it is the matching rounding constant
; -- confirm. GET_GOT / RESTORE_GOT / GLOBAL / arg come from
; vpx_ports/x86_abi_support.asm.
4287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianglobal sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
4297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramaniansym(vpx_filter_block2d_bil4x4_var_mmx):
43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 8
43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push rsi
43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push rdi
43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub         rsp, 16                         ; reserves 16 bytes; no explicit [rsp] access appears in this routine
43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm6,            mm6                 ; mm6 = 0: 16-bit running sums of differences
44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm7,            mm7                 ; mm7 = 0: 32-bit running sums of squared differences
44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rax,            arg(4) ;HFilter             ;
44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rdx,            arg(5) ;VFilter             ;
44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rsi,            arg(0) ;ref_ptr              ;
44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rdi,            arg(2) ;src_ptr              ;
44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rcx,            4                   ; rcx = number of output rows to process
44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm0,            mm0                 ; mm0 = 0: zero operand for byte -> word unpacking
45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Horizontal pass on the first ref row; the result is kept in mm5 so
        ; the loop can combine it vertically with the next row.
45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            mm1,            [rsi]               ; mm1 = ref bytes x .. x+3
45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            mm3,            [rsi+1]             ; mm3 = ref bytes x+1 .. x+4
45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm1,            mm0                 ; bytes -> words
45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm1,            [rax]               ; * the four words at HFilter+0
45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm3,            mm0                 ; bytes -> words
45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm3,            [rax+8]             ; * the four words at HFilter+8
45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm1,            mm3                 ; sum of the two taps
461538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add mmx_bi_rd before the shift
46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm1,            mmx_filter_shift    ; >> 7
46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm5,            mm1                 ; mm5 = horizontally filtered first row
46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Advance rsi to the next ref row. The 32-bit build adds the stride
        ; straight from the argument slot; the 64-bit build sign-extends the
        ; int stride into r8 first.
46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT
46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else
46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rsi, r8
47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif
47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4731b362b15af34006e6a11974088a46d42b903418eJohann.filter_block2d_bil4x4_var_mmx_loop:
47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Horizontal pass on the current ref row (same sequence as above).
47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            mm1,            [rsi]               ; mm1 = ref bytes x .. x+3
47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            mm3,            [rsi+1]             ; mm3 = ref bytes x+1 .. x+4
47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm1,            mm0                 ; bytes -> words
47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm1,            [rax]               ; * the four words at HFilter+0
48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm3,            mm0                 ; bytes -> words
48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm3,            [rax+8]             ; * the four words at HFilter+8
48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm1,            mm3                 ; sum of the two taps
485538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add mmx_bi_rd before the shift
48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm1,            mmx_filter_shift    ; >> 7; mm1 = filtered current row
48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm3,            mm5                 ; mm3 = filtered previous row
48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Vertical pass: previous row * VFilter+0 + current row * VFilter+8.
49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm5,            mm1                 ; remember current row for the next iteration
49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm3,            [rdx]               ; previous row * the four words at VFilter+0
49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm1,            [rdx+8]             ; current row * the four words at VFilter+8
49490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm1,            mm3                 ; sum of the two taps
49590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
496538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add mmx_bi_rd before the shift
49790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm1,            mmx_filter_shift    ; >> 7; mm1 = fully filtered row
49890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            mm3,            [rdi]               ; mm3 = 4 src bytes
50090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm3,            mm0                 ; bytes -> words
50190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
50290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubw           mm1,            mm3                 ; mm1 = filtered ref - src, per word
50390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm6,            mm1                 ; accumulate differences
50490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
50590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmaddwd         mm1,            mm1                 ; square each word; adjacent pairs summed into two dwords
50690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm7,            mm1                 ; accumulate squared differences
50790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Advance both row pointers. On the 64-bit path the strides are
        ; re-read and sign-extended on every iteration.
50890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT
50990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
51090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else
51290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
51390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
51490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rsi,            r8
51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rdi,            r9
51690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif
51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub             rcx,            1                   ; one row done
5181b362b15af34006e6a11974088a46d42b903418eJohann        jnz             .filter_block2d_bil4x4_var_mmx_loop       ; loop until all 4 rows are processed
51990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        ; Reduce mm6 (four signed 16-bit sums) to a single signed 32-bit sum:
        ; each word is placed in the upper half of a dword, the dwords are
        ; added, and an arithmetic shift right by 16 sign-extends the total.
52090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm3,            mm3                 ; mm3 = 0
52190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm2,            mm2                 ; mm2 = 0
52290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
52390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd       mm2,            mm6                 ; mm2 dwords = (word0 << 16, word1 << 16)
52490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd       mm3,            mm6                 ; mm3 dwords = (word2 << 16, word3 << 16)
52590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
52690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm2,            mm3                 ; pairwise add
52790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm6,            mm2                 ; copy for the high-dword fold
52890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
52990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq           mm6,            32                  ; move high dword down
53090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm2,            mm6                 ; low dword = (sum of all four words) << 16
53190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrad           mm2,            16                  ; low dword = sign-extended 16-bit total
53390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm4,            mm7                 ; copy the squared-difference accumulators
53490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq           mm4,            32                  ; move high dword down
53690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm4,            mm7                 ; low dword = total of squared differences
53790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rdi,            arg(6) ;sum
53990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rsi,            arg(7) ;sumsquared
54090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            dword ptr [rdi],          mm2                 ; *sum = low dword of mm2
54290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            dword ptr [rsi],          mm4                 ; *sumsquared = low dword of mm4
54390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
54590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 16                                 ; undo the 16-byte reservation from the prolog
54690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi                                     ; restore in reverse order of the prolog pushes
54790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
54890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
54990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
55090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
55190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
55290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;void vpx_filter_block2d_bil_var_mmx
55490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;(
55590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *ref_ptr,
55690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int ref_pixels_per_line,
55790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned char *src_ptr,
55890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int src_pixels_per_line,
55990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned int Height,
56090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned short *HFilter,
56190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned short *VFilter,
56290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    int *sum,
56390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;    unsigned int *sumsquared
56490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;)
5657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianglobal sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
5667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramaniansym(vpx_filter_block2d_bil_var_mmx):
56790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push        rbp
56890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mov         rbp, rsp
56990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    SHADOW_ARGS_TO_STACK 9
57090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    GET_GOT     rbx
57190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push rsi
57290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    push rdi
57390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    sub         rsp, 16
57490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; end prolog
57590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm6,            mm6                 ;
57790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm7,            mm7                 ;
57890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rax,            arg(5) ;HFilter             ;
57990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rdx,            arg(6) ;VFilter             ;
58190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rsi,            arg(0) ;ref_ptr              ;
58290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rdi,            arg(2) ;src_ptr              ;
58490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd          rcx,            dword ptr arg(4) ;Height              ;
58590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm0,            mm0                 ;
58790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm1,            [rsi]               ;
58890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm3,            [rsi+1]             ;
59090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm2,            mm1                 ;
59190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm4,            mm3                 ;
59390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm1,            mm0                 ;
59490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw       mm2,            mm0                 ;
59690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm1,            [rax]               ;
59790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm2,            [rax]               ;
59990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm3,            mm0                 ;
60090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw       mm4,            mm0                 ;
60290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm3,            [rax+8]             ;
60390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm4,            [rax+8]             ;
60590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm1,            mm3                 ;
60690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm2,            mm4                 ;
608538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
60990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
61090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm1,            mmx_filter_shift    ;
611538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
61290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
61390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm2,            mmx_filter_shift    ;
61490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm5,            mm1
61590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
61690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb        mm5,            mm2                 ;
61790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT
61890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
61990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else
62090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
62190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rsi,            r8
62290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif
62390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6241b362b15af34006e6a11974088a46d42b903418eJohann.filter_block2d_bil_var_mmx_loop:
62590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm1,            [rsi]               ;
62790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm3,            [rsi+1]             ;
62890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm2,            mm1                 ;
63090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm4,            mm3                 ;
63190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm1,            mm0                 ;
63390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw       mm2,            mm0                 ;
63490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm1,            [rax]               ;
63690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm2,            [rax]               ;
63790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm3,            mm0                 ;
63990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw       mm4,            mm0                 ;
64090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
64190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm3,            [rax+8]             ;
64290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm4,            [rax+8]             ;
64390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
64490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm1,            mm3                 ;
64590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm2,            mm4                 ;
64690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
647538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
64890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm1,            mmx_filter_shift    ;
64990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
650538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
65190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm2,            mmx_filter_shift    ;
65290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm3,            mm5                 ;
65490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm4,            mm5                 ;
65590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm3,            mm0                 ;
65790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw       mm4,            mm0                 ;
65890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm5,            mm1                 ;
66090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb        mm5,            mm2                 ;
66190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm3,            [rdx]               ;
66390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm4,            [rdx]               ;
66490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm1,            [rdx+8]             ;
66690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw          mm2,            [rdx+8]             ;
66790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
66890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm1,            mm3                 ;
66990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm2,            mm4                 ;
67090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
671538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
672538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
67390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
67490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm1,            mmx_filter_shift    ;
67590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw           mm2,            mmx_filter_shift    ;
67690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
67790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm3,            [rdi]               ;
67890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm4,            mm3                 ;
67990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw       mm3,            mm0                 ;
68190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw       mm4,            mm0                 ;
68290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubw           mm1,            mm3                 ;
68490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psubw           mm2,            mm4                 ;
68590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm6,            mm1                 ;
68790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmaddwd         mm1,            mm1                 ;
68890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw           mm6,            mm2                 ;
69090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmaddwd         mm2,            mm2                 ;
69190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm7,            mm1                 ;
69390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm7,            mm2                 ;
69490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT
69690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
69790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
69890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else
69990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
70090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rsi,            r8
70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add             rdi,            r9
70390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif
70490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub             rcx,            1                   ;
7051b362b15af34006e6a11974088a46d42b903418eJohann        jnz             .filter_block2d_bil_var_mmx_loop       ;
70690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
70790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm3,            mm3                 ;
70890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor            mm2,            mm2                 ;
70990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklwd       mm2,            mm6                 ;
71190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhwd       mm3,            mm6                 ;
71290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm2,            mm3                 ;
71490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm6,            mm2                 ;
71590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq           mm6,            32                  ;
71790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm2,            mm6                 ;
71890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrad           mm2,            16                  ;
72090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq            mm4,            mm7                 ;
72190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq           mm4,            32                  ;
72390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddd           mm4,            mm7                 ;
72490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rdi,            arg(7) ;sum
72690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov             rsi,            arg(8) ;sumsquared
72790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            dword ptr [rdi],          mm2                 ;
72990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd            dword ptr [rsi],          mm4                 ;
73090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ; begin epilog
73290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add rsp, 16
73390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rdi
73490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop rsi
73590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    RESTORE_GOT
73690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    UNSHADOW_ARGS
73790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pop         rbp
73890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ret
73990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas HuberSECTION_RODATA
74190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;short mmx_bi_rd[4] = { 64, 64, 64, 64};
; Rounding bias for the bilinear filter: four 16-bit words, each 64, which is
; 1 << (mmx_filter_shift - 1) for mmx_filter_shift = 7. The filter code in this
; file adds it (paddw) to every weighted sum right before the arithmetic shift
; (psraw) by mmx_filter_shift, so the shift rounds to nearest instead of
; rounding down. Four words fill one 64-bit MMX register; the table is placed
; on a 16-byte boundary and referenced through GLOBAL(mmx_bi_rd).
74290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberalign 16
74390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubermmx_bi_rd:
74490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    times 4 dw 64
745