;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define mmx_filter_shift            7

;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of 256 consecutive 16-bit values starting at
; src_ptr (16 iterations x 32 bytes per iteration).
;
;   mm4 - running sum of squares, kept as two 32-bit lanes
;   rax - read pointer, advanced by 32 bytes per iteration
;   rcx - iteration counter, counts down from 16
;
; The result is returned in rax as lane0 + lane1 of mm4.
;
; NOTE(review): no emms is issued here; presumably the caller is
; responsible for clearing MMX state - confirm.
; NOTE(review): SHADOW_ARGS_TO_STACK is given 7 although the prototype has a
; single argument, and rsi/rdi/GOT are saved without being used in the body.
; Left untouched because the macro definitions are not visible from here.
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                  ; 8 bytes of scratch for spilling mm4
    ; end prolog

    mov         rax, arg(0)             ;src_ptr
    mov         rcx, 16                 ; 16 iterations of 16 shorts each
    pxor        mm4, mm4                ; clear the accumulator

.NEXTROW:
    movq        mm0, [rax]              ; 4 shorts each
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]
    pmaddwd     mm0, mm0                ; square words, add adjacent pairs
    pmaddwd     mm1, mm1                ;   into two dword lanes
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ; dec leaves CF untouched, so the former "ja" (CF=0 and ZF=0) depended on
    ; the carry produced by the add above. jnz tests only ZF, which dec sets.
    jnz         .NEXTROW
    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For an 8-row by 8-byte block, computes d = src - ref per byte and stores
;   *Sum = sum of d
;   *SSE = sum of d*d
; rax is set to 0 on return.
;
;   mm5 - four 16-bit lanes accumulating d. Each lane receives 16 values in
;         [-255, 255], so it cannot overflow a signed word.
;   mm6 - constant zero, used to zero-extend bytes to words when unpacking
;   mm7 - two 32-bit lanes accumulating d*d
;   rax / rbx - current src / ref row pointers
;   rcx / rdx - source_stride / recon_stride, sign-extended
;
; The ref row for the next iteration is loaded into mm1 before the current
; row's squares are added to mm7, so rows 2..8 only load the src row.
;
; NOTE(review): no emms is issued here; presumably the caller is
; responsible for clearing MMX state - confirm.
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                 ; [rsp] <- mm7 spill, [rsp+8] <- mm5 spill
    ; end prolog

    pxor        mm5, mm5                ; sum of differences
    pxor        mm6, mm6                ; zero, for unpacking
    pxor        mm7, mm7                ; sum of squared differences

    mov         rax, arg(0)             ;[src_ptr]  ; Load base addresses
    mov         rbx, arg(2)             ;[ref_ptr]
    movsxd      rcx, dword ptr arg(1)   ;[source_stride]
    movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

    ; Row 1
    movq        mm0, [rax]              ; eight src bytes
    movq        mm1, [rbx]              ; eight ref bytes
    movq        mm2, mm0                ; copies for the high halves
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low 4 bytes -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high 4 bytes -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; src - ref (low half)
    psubsw      mm2, mm3                ; src - ref (high half)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    add         rbx, rdx                ; next ref row
    add         rax, rcx                ; next src row
    movq        mm1, [rbx]              ; preload next ref row
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2

    ; Row 2
    movq        mm0, [rax]              ; eight src bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low 4 bytes -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high 4 bytes -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; src - ref (low half)
    psubsw      mm2, mm3                ; src - ref (high half)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    add         rbx, rdx                ; next ref row
    add         rax, rcx                ; next src row
    movq        mm1, [rbx]              ; preload next ref row
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2

    ; Row 3
    movq        mm0, [rax]              ; eight src bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low 4 bytes -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high 4 bytes -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; src - ref (low half)
    psubsw      mm2, mm3                ; src - ref (high half)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    add         rbx, rdx                ; next ref row
    add         rax, rcx                ; next src row
    movq        mm1, [rbx]              ; preload next ref row
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2

    ; Row 4
    movq        mm0, [rax]              ; eight src bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low 4 bytes -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high 4 bytes -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; src - ref (low half)
    psubsw      mm2, mm3                ; src - ref (high half)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    add         rbx, rdx                ; next ref row
    add         rax, rcx                ; next src row
    movq        mm1, [rbx]              ; preload next ref row
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2

    ; Row 5
    movq        mm0, [rax]              ; eight src bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low 4 bytes -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high 4 bytes -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; src - ref (low half)
    psubsw      mm2, mm3                ; src - ref (high half)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    add         rbx, rdx                ; next ref row
    add         rax, rcx                ; next src row
    movq        mm1, [rbx]              ; preload next ref row
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2

    ; Row 6
    movq        mm0, [rax]              ; eight src bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low 4 bytes -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high 4 bytes -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; src - ref (low half)
    psubsw      mm2, mm3                ; src - ref (high half)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    add         rbx, rdx                ; next ref row
    add         rax, rcx                ; next src row
    movq        mm1, [rbx]              ; preload next ref row
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2

    ; Row 7
    movq        mm0, [rax]              ; eight src bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low 4 bytes -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high 4 bytes -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; src - ref (low half)
    psubsw      mm2, mm3                ; src - ref (high half)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    add         rbx, rdx                ; next ref row
    add         rax, rcx                ; next src row
    movq        mm1, [rbx]              ; preload next ref row
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2

    ; Row 8 (last row: no pointer advance, nothing further is read)
    movq        mm0, [rax]              ; eight src bytes
    movq        mm2, mm0
    movq        mm3, mm1

    punpcklbw   mm0, mm6                ; low 4 bytes -> words
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6                ; high 4 bytes -> words
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1                ; src - ref (low half)
    psubsw      mm2, mm3                ; src - ref (high half)

    paddw       mm5, mm0                ; accumulate differences
    paddw       mm5, mm2

    pmaddwd     mm0, mm0                ; square, add adjacent pairs
    pmaddwd     mm2, mm2
    paddd       mm7, mm0                ; accumulate squares
    paddd       mm7, mm2

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5  ; spill the four word sums
    movq        QWORD PTR [rsp], mm7    ; spill the two dword sums of squares
    movsx       rdx, WORD PTR [rsp+8]   ; sign-extend each word lane
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx                ;XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx                ;XXSum
    mov         rsi, arg(4)             ;SSE
    mov         rdi, arg(5)             ;Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax                ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void
;vpx_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog

    pxor        mm5, mm5                ; Blank mmx6
    pxor        mm6, mm6                ; Blank mmx7
    pxor        mm7, mm7                ; Blank mmx7

33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rax, arg(0) ;[src_ptr] ; Load base addresses 33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbx, arg(2) ;[ref_ptr] 33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(1) ;[source_stride] 33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rdx, dword ptr arg(3) ;[recon_stride] 33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 1 339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd mm0, [rax] ; Copy four bytes to mm0 340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd mm1, [rbx] ; Copy four bytes to mm1 34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm5, mm0 ; accumulate differences in mm5 34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer into ref data 34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd mm1, [rbx] ; Copy four bytes to mm1 34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 2 352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd mm0, [rax] ; Copy four bytes to mm0 35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm5, mm0 ; accumulate differences in mm5 35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer into ref data 36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd mm1, [rbx] ; Copy four bytes to mm1 36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 3 365ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd mm0, [rax] ; Copy four bytes to mm0 366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw mm0, mm6 ; unpack to higher precision 36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm5, mm0 ; accumulate differences in mm5 37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer into ref data 37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd mm1, [rbx] ; Copy four bytes to mm1 37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 
; accumulate in mm7 37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 4 378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd mm0, [rax] ; Copy four bytes to mm0 37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm5, mm0 ; accumulate differences in mm5 38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Now accumulate the final results. 
39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory 39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory 39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsx rdx, WORD PTR [rsp+8] 39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsx rcx, WORD PTR [rsp+10] 39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsx rbx, WORD PTR [rsp+12] 39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsx rax, WORD PTR [rsp+14] 39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdx, rcx 39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx, rax 39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdx, rbx ;XSum 39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, DWORD PTR [rsp] 40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, DWORD PTR [rsp+4] 40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax, rcx ;XXSum 40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(4) ;SSE 40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(5) ;Sum 40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov dword ptr [rsi], eax 40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov dword ptr [rdi], edx 40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber xor rax, rax ; return 0 40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 16 41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbx 41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
; Last two instructions of the preceding routine's epilog.
    pop         rbp
    ret

;void vpx_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char  *ref_ptr,
;    int             ref_pixels_per_line,
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int            *sum,
;    unsigned int   *sumsquared
;)
;
; Runs a 2-tap horizontal filter followed by a 2-tap vertical filter over a
; 4-pixel-wide, 4-row block taken from ref_ptr, subtracts the co-located
; src_ptr pixels, and stores
;     *sum        = sum of the 16 differences
;     *sumsquared = sum of the 16 squared differences
;
; Memory touched: five reference rows are read (one priming row plus one per
; loop pass), 5 bytes each ([rsi] and [rsi+1], 4 bytes per load); four source
; rows are read, 4 bytes each.
;
; HFilter / VFilter: [ptr] and [ptr+8] are each consumed as four 16-bit
; multipliers. Presumably every tap is replicated across the four words;
; confirm against the tables the callers pass in.
;
; Register roles:
;     rax = HFilter        rdx = VFilter
;     rsi = ref row        rdi = src row        rcx = rows remaining
;     mm0 = zero (byte -> word widening)
;     mm5 = previous row after the horizontal pass (4 words)
;     mm6 = running sum of differences (4 x 16-bit lanes)
;     mm7 = running sum of squared differences (2 x 32-bit lanes)
global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                     ; scratch area; not referenced below
    ; end prolog

    pxor        mm6, mm6                    ; sum accumulator = 0
    pxor        mm7, mm7                    ; sum-of-squares accumulator = 0

    mov         rax, arg(4)                 ;HFilter
    mov         rdx, arg(5)                 ;VFilter

    mov         rsi, arg(0)                 ;ref_ptr
    mov         rdi, arg(2)                 ;src_ptr

    mov         rcx, 4                      ; four output rows
    pxor        mm0, mm0

    ; Priming pass: horizontally filter reference row 0 into mm5.
    movd        mm1, [rsi]                  ; pixels x .. x+3
    movd        mm3, [rsi+1]                ; pixels x+1 .. x+4

    punpcklbw   mm1, mm0                    ; bytes -> words
    pmullw      mm1, [rax]                  ; * first horizontal tap

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]                ; * second horizontal tap

    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; + 64, the rounding term

    psraw       mm1, mmx_filter_shift       ; >> mmx_filter_shift
    movq        mm5, mm1                    ; keep as "previous row"

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    add         rsi, r8
%endif

.bil4x4_next_row:

    ; Horizontal pass on the current reference row.
    movd        mm1, [rsi]
    movd        mm3, [rsi+1]

    punpcklbw   mm1, mm0
    pmullw      mm1, [rax]

    punpcklbw   mm3, mm0
    pmullw      mm3, [rax+8]

    paddw       mm1, mm3
    paddw       mm1, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    movq        mm3, mm5                    ; mm3 = previous filtered row

    movq        mm5, mm1                    ; current row becomes "previous"
    pmullw      mm3, [rdx]                  ; previous * first vertical tap

    ; Vertical pass: combine previous and current rows.
    pmullw      mm1, [rdx+8]                ; current * second vertical tap
    paddw       mm1, mm3

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift       ; mm1 = filtered prediction

    movd        mm3, [rdi]                  ; four source pixels
    punpcklbw   mm3, mm0

    psubw       mm1, mm3                    ; diff = prediction - source
    paddw       mm6, mm1                    ; sum += diff

    pmaddwd     mm1, mm1                    ; diff^2, adjacent pairs added
    paddd       mm7, mm1                    ; sumsquared += diff^2

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)       ;src_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .bil4x4_next_row

    ; Collapse the four 16-bit sum lanes. Each word is placed in the upper
    ; half of a dword, the dwords are added, and the arithmetic shift by 16
    ; sign-extends the total; it is therefore exact only while the total
    ; fits in a signed 16-bit value.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                     ; low dword of mm2 = sum
    movq        mm4, mm7

    ; Add the two 32-bit halves of the squared-difference accumulator.
    psrlq       mm4, 32
    paddd       mm4, mm7                    ; low dword of mm4 = sumsquared

    mov         rdi, arg(6)                 ;sum
    mov         rsi, arg(7)                 ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_filter_block2d_bil_var_mmx
;(
;    unsigned char  *ref_ptr,
;    int             ref_pixels_per_line,
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    unsigned int    Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int            *sum,
;    unsigned int   *sumsquared
;)
;
; Eight-pixel-wide counterpart of the 4x4 routine: 2-tap horizontal then
; 2-tap vertical filtering of the reference block, difference against the
; source block, and accumulation of
;     *sum        = sum of all differences
;     *sumsquared = sum of all squared differences
; over Height rows.
;
; Memory touched: Height + 1 reference rows are read, 9 bytes each ([rsi]
; and [rsi+1], 8 bytes per load); Height source rows are read, 8 bytes each.
;
; Height must be at least 1: the row loop tests its counter only after the
; first pass. The sum lanes are 16 bits wide and the final reduction keeps
; 16 significant bits, so *sum is exact only while it fits in a signed
; 16-bit value.
;
; Register roles:
;     rax = HFilter        rdx = VFilter
;     rsi = ref row        rdi = src row        rcx = rows remaining
;     mm0 = zero (byte -> word widening)
;     mm5 = previous row after the horizontal pass, packed to 8 bytes
;     mm6 = running sum of differences (4 x 16-bit lanes)
;     mm7 = running sum of squared differences (2 x 32-bit lanes)
global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                     ; scratch area; not referenced below
    ; end prolog

    pxor        mm6, mm6                    ; sum accumulator = 0
    pxor        mm7, mm7                    ; sum-of-squares accumulator = 0
    mov         rax, arg(5)                 ;HFilter

    mov         rdx, arg(6)                 ;VFilter
    mov         rsi, arg(0)                 ;ref_ptr

    mov         rdi, arg(2)                 ;src_ptr
    movsxd      rcx, dword ptr arg(4)       ;Height

    ; Priming pass: horizontally filter reference row 0 (low and high
    ; four-pixel halves handled in mm1/mm3 and mm2/mm4 respectively).
    pxor        mm0, mm0
    movq        mm1, [rsi]                  ; pixels x .. x+7

    movq        mm3, [rsi+1]                ; pixels x+1 .. x+8
    movq        mm2, mm1

    movq        mm4, mm3
    punpcklbw   mm1, mm0                    ; low half, bytes -> words

    punpckhbw   mm2, mm0                    ; high half, bytes -> words
    pmullw      mm1, [rax]                  ; * first horizontal tap

    pmullw      mm2, [rax]
    punpcklbw   mm3, mm0

    punpckhbw   mm4, mm0
    pmullw      mm3, [rax+8]                ; * second horizontal tap

    pmullw      mm4, [rax+8]
    paddw       mm1, mm3

    paddw       mm2, mm4
    paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; + 64, the rounding term

    psraw       mm1, mmx_filter_shift
    paddw       mm2, [GLOBAL(mmx_bi_rd)]

    psraw       mm2, mmx_filter_shift
    movq        mm5, mm1

    packuswb    mm5, mm2                    ; previous row kept as 8 bytes
%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    add         rsi, r8
%endif

.bil_next_row:

    ; Horizontal pass on the current reference row.
    movq        mm1, [rsi]
    movq        mm3, [rsi+1]

    movq        mm2, mm1
    movq        mm4, mm3

    punpcklbw   mm1, mm0
    punpckhbw   mm2, mm0

    pmullw      mm1, [rax]
    pmullw      mm2, [rax]

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm3, [rax+8]
    pmullw      mm4, [rax+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    psraw       mm1, mmx_filter_shift

    paddw       mm2, [GLOBAL(mmx_bi_rd)]
    psraw       mm2, mmx_filter_shift

    ; Unpack the previous filtered row, then save the current one in its
    ; place for the next pass.
    movq        mm3, mm5
    movq        mm4, mm5

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    movq        mm5, mm1
    packuswb    mm5, mm2

    ; Vertical pass: previous * first tap + current * second tap.
    pmullw      mm3, [rdx]
    pmullw      mm4, [rdx]

    pmullw      mm1, [rdx+8]
    pmullw      mm2, [rdx+8]

    paddw       mm1, mm3
    paddw       mm2, mm4

    paddw       mm1, [GLOBAL(mmx_bi_rd)]
    paddw       mm2, [GLOBAL(mmx_bi_rd)]

    psraw       mm1, mmx_filter_shift
    psraw       mm2, mmx_filter_shift

    ; Difference against the eight source pixels of this row.
    movq        mm3, [rdi]
    movq        mm4, mm3

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm3
    psubw       mm2, mm4

    paddw       mm6, mm1                    ; sum += low-half diffs
    pmaddwd     mm1, mm1                    ; low-half diff^2, pairs added

    paddw       mm6, mm2                    ; sum += high-half diffs
    pmaddwd     mm2, mm2                    ; high-half diff^2, pairs added

    paddd       mm7, mm1
    paddd       mm7, mm2

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)       ;src_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .bil_next_row

    ; Collapse the four 16-bit sum lanes: words go to the upper half of
    ; dwords, are added, and the arithmetic shift sign-extends the total.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                     ; low dword of mm2 = sum
    movq        mm4, mm7

    ; Add the two 32-bit halves of the squared-difference accumulator.
    psrlq       mm4, 32
    paddd       mm4, mm7                    ; low dword of mm4 = sumsquared

    mov         rdi, arg(7)                 ;sum
    mov         rsi, arg(8)                 ;sumsquared

    movd        dword ptr [rdi], mm2
    movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Rounding term added before each shift by mmx_filter_shift.
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
    times 4 dw 64