;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of the squares of the 256 consecutive 16-bit values that
; start at src_ptr (16 loop iterations, 32 bytes = 16 words per iteration).
; The two 32-bit partial sums kept in mm4 are added together for the result.
;
; NOTE(review): the prologue shadows 7 arguments and sets up the GOT although
; the prototype above declares one argument and no GOT-relative access is
; visible here -- confirm against x86_abi_support.asm before trimming.
global sym(vp8_get_mb_ss_mmx)
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                          ; 8 bytes of scratch for mm4
    ; end prolog

        mov         rax, arg(0)                 ; src_ptr
        mov         rcx, 16                     ; iteration counter
        pxor        mm4, mm4                    ; mm4 = 2 x 32-bit partial sums

NEXTROW:
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0                    ; square words, add adjacent pairs
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; dec leaves CF untouched, so the loop must branch on ZF alone.
        ; (ja would also test the stale carry left by the add above.)
        jnz         NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks eight rows of eight bytes from src_ptr and ref_ptr (advancing by
; source_stride / recon_stride after each row) and stores
;   *Sum = sum of (src - ref)          over the 64 byte pairs
;   *SSE = sum of (src - ref)^2        over the 64 byte pairs
; Always returns 0.
;
; Register use:
;   mm5 - four 16-bit lanes accumulating the differences
;   mm6 - zero, used to widen bytes to words
;   mm7 - two 32-bit lanes accumulating the squared differences
; The ref row for the next iteration is loaded into mm1 at the end of the
; current row, so rows 2-8 only load the source row.
global sym(vp8_get8x8var_mmx)
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                         ; scratch: [rsp]=mm7, [rsp+8]=mm5
    ; end prolog


        pxor        mm5, mm5                    ; clear difference accumulator
        pxor        mm6, mm6                    ; zero for byte->word unpacking
        pxor        mm7, mm7                    ; clear squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]      ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Row 1
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies

        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2

        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the new data
        movq        mm1, [rbx]                  ; Preload next ref row into mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7


        ; Row 2
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies

        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2

        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the new data
        movq        mm1, [rbx]                  ; Preload next ref row into mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7

        ; Row 3
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies

        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2

        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the new data
        movq        mm1, [rbx]                  ; Preload next ref row into mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7

        ; Row 4
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies

        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2

        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the new data
        movq        mm1, [rbx]                  ; Preload next ref row into mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7

        ; Row 5
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies

        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2

        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the new data
        movq        mm1, [rbx]                  ; Preload next ref row into mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7

        ; Row 6
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies

        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2

        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the new data
        movq        mm1, [rbx]                  ; Preload next ref row into mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7

        ; Row 7
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies

        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2

        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        add         rbx,rdx                     ; Inc pointer into ref data
        add         rax,rcx                     ; Inc pointer into the new data
        movq        mm1, [rbx]                  ; Preload next ref row into mm1
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7

        ; Row 8 (last row: no further pointer advance or ref preload needed,
        ; rax and rbx are reloaded below before they are read again)
        movq        mm0, [rax]                  ; Copy eight bytes to mm0
        movq        mm2, mm0                    ; Take copies
        movq        mm3, mm1                    ; Take copies

        punpcklbw   mm0, mm6                    ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; unpack to higher precision
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; A-B (low order) to MM0
        psubsw      mm2, mm3                    ; A-B (high order) to MM2

        paddw       mm5, mm0                    ; accumulate differences in mm5
        paddw       mm5, mm2                    ; accumulate differences in mm5

        pmaddwd     mm0, mm0                    ; square and accumulate
        pmaddwd     mm2, mm2                    ; square and accumulate
        paddd       mm7, mm0                    ; accumulate in mm7
        paddd       mm7, mm2                    ; accumulate in mm7

        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5      ; spill the four 16-bit difference lanes
        movq        QWORD PTR [rsp], mm7        ; spill the two 32-bit squared lanes
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;unsigned int 31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;vp8_get4x4var_mmx 31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int source_stride, 31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *ref_ptr, 31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int recon_stride, 32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int *SSE, 32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int *Sum 32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_get4x4var_mmx) 32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_get4x4var_mmx): 32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 6 32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbx 33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 16 33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm5, mm5 ; Blank mmx6 33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor 
mm6, mm6 ; Blank mmx7 33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; Blank mmx7 33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rax, arg(0) ;[src_ptr] ; Load base addresses 34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbx, arg(2) ;[ref_ptr] 34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(1) ;[source_stride] 34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rdx, dword ptr arg(3) ;[recon_stride] 34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 1 34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rax] ; Copy eight bytes to mm0 34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rbx] ; Copy eight bytes to mm1 34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm5, mm0 ; accumulate differences in mm5 35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer into ref data 35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rbx] ; Copy eight bytes to mm1 35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 2 35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rax] 
; Copy eight bytes to mm0 36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm5, mm0 ; accumulate differences in mm5 36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer into ref data 36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rbx] ; Copy eight bytes to mm1 36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 3 37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rax] ; Copy eight bytes to mm0 37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm5, mm0 ; accumulate differences in mm5 37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer into ref data 38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rbx] ; Copy eight 
bytes to mm1 38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 4 38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, [rax] ; Copy eight bytes to mm0 38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm5, mm0 ; accumulate differences in mm5 39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Now accumulate the final results. 
39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory 39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory 40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsx rdx, WORD PTR [rsp+8] 40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsx rcx, WORD PTR [rsp+10] 40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsx rbx, WORD PTR [rsp+12] 40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsx rax, WORD PTR [rsp+14] 40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdx, rcx 40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx, rax 40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdx, rbx ;XSum 40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, DWORD PTR [rsp] 40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, DWORD PTR [rsp+4] 40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax, rcx ;XXSum 41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(4) ;SSE 41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(5) ;Sum 41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov dword ptr [rsi], eax 41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov dword ptr [rdi], edx 41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber xor rax, rax ; return 0 41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 16 41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbx 42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 
42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;unsigned int 42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;vp8_get4x4sse_cs_mmx 43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int source_stride, 43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *ref_ptr, 43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int recon_stride 43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_get4x4sse_cs_mmx) 43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_get4x4sse_cs_mmx): 43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 4 44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbx 44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, mm6 ; Blank mmx7 44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; Blank mmx7 44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rax, arg(0) ;[src_ptr] ; Load base addresses 
45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbx, arg(2) ;[ref_ptr] 45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(1) ;[source_stride] 45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rdx, dword ptr arg(3) ;[recon_stride] 45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 1 45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rax] ; Copy eight bytes to mm0 45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm1, [rbx] ; Copy eight bytes to mm1 45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer into ref data 46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm1, [rbx] ; Copy eight bytes to mm1 46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 2 46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rax] ; Copy eight bytes to mm0 46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer 
into ref data 47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm1, [rbx] ; Copy eight bytes to mm1 47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 3 47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rax] ; Copy eight bytes to mm0 47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rbx,rdx ; Inc pointer into ref data 48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rax,rcx ; Inc pointer into the new data 48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm1, [rbx] ; Copy eight bytes to mm1 48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm0 ; accumulate in mm7 48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; Row 4 49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm0, [rax] ; Copy eight bytes to mm0 49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm0, mm6 ; unpack to higher prrcision 49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm6 49390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm0, mm1 ; A-B (low order) to MM0 49490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm0, mm0 ; square and accumulate 49590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber paddd mm7, mm0 ; accumulate in mm7 49690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm0, mm7 ; 49890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm7, 32 49990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm0, mm7 501538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq rax, mm0 50290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 50590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbx 50690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 50790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 50890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 50990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 51090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%define mmx_filter_shift 7 51390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_filter_block2d_bil4x4_var_mmx 51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 51690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *ref_ptr, 51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int ref_pixels_per_line, 51890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 51990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixels_per_line, 52090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned short *HFilter, 52190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned short *VFilter, 52290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int *sum, 52390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; 
unsigned int *sumsquared 52490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 52590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_filter_block2d_bil4x4_var_mmx) 52690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_filter_block2d_bil4x4_var_mmx): 52790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 52890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 52990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 8 53090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 53190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 53290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 53390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 16 53490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 53590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 53690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 53790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, mm6 ; 53890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 53990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rax, arg(4) ;HFilter ; 54190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(5) ;VFilter ; 54290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;ref_ptr ; 54490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(2) ;src_ptr ; 54590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rcx, 4 ; 54790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 54890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 54990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm1, [rsi] ; 55090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm3, [rsi+1] ; 
55190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm0 ; 55390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm1, [rax] ; 55490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, mm0 ; 55690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm3, [rax+8] ; 55790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 55890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, mm3 ; 559538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(mmx_bi_rd)] ; 56090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, mmx_filter_shift ; 56290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 56390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 56490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT 56590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, dword ptr arg(1) ;ref_pixels_per_line ; 56690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else 56790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; 56890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, r8 56990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif 57090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilter_block2d_bil4x4_var_mmx_loop: 57290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm1, [rsi] ; 57490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm3, [rsi+1] ; 57590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm0 ; 57790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm1, [rax] ; 
57890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 57990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, mm0 ; 58090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm3, [rax+8] ; 58190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 58290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, mm3 ; 583538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(mmx_bi_rd)] ; 58490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 58590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, mmx_filter_shift ; 58690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 58790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 58890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 ; 58990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm3, [rdx] ; 59090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 59190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm1, [rdx+8] ; 59290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, mm3 ; 59390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 59490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 595538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(mmx_bi_rd)] ; 59690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, mmx_filter_shift ; 59790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 59890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm3, [rdi] ; 59990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, mm0 ; 60090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 60190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubw mm1, mm3 ; 60290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm6, mm1 ; 60390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 60490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm1, mm1 ; 60590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm1 ; 
60690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 60790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT 60890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, dword ptr arg(1) ;ref_pixels_per_line ; 60990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, dword ptr arg(3) ;src_pixels_per_line ; 61090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else 61190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r8, dword ptr arg(1) ;ref_pixels_per_line 61290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r9, dword ptr arg(3) ;src_pixels_per_line 61390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, r8 61490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, r9 61590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif 61690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rcx, 1 ; 61790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber jnz filter_block2d_bil4x4_var_mmx_loop ; 61890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 61990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, mm3 ; 62190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 ; 62290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; 62490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm3, mm6 ; 62590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm2, mm3 ; 62790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm2 ; 62890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 62990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm6, 32 ; 63090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm2, mm6 ; 63190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 63290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrad mm2, 16 ; 
63390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm7 ; 63490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 63590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm4, 32 ; 63690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm4, mm7 ; 63790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 63890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(6) ;sum 63990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(7) ;sumsquared 64090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 64190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd dword ptr [rdi], mm2 ; 64290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd dword ptr [rsi], mm4 ; 64390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 64490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 64590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 64690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 64790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 16 64890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 64990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 65090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 65190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 65290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 65390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 65490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 65890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_filter_block2d_bil_var_mmx 65990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 66090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *ref_ptr, 66190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int ref_pixels_per_line, 
66290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 66390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixels_per_line, 66490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int Height, 66590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned short *HFilter, 66690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned short *VFilter, 66790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int *sum, 66890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int *sumsquared 66990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 67090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_filter_block2d_bil_var_mmx) 67190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_filter_block2d_bil_var_mmx): 67290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 67390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 67490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 9 67590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 67690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 67790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 67890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 16 67990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 68090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 68190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm6, mm6 ; 68290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 ; 68390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rax, arg(5) ;HFilter ; 68490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 68590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdx, arg(6) ;VFilter ; 68690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;ref_ptr ; 68790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 68890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov 
rdi, arg(2) ;src_ptr ; 68990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(4) ;Height ; 69090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 69190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm0, mm0 ; 69290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rsi] ; 69390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 69490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rsi+1] ; 69590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; 69690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 69790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm3 ; 69890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm0 ; 69990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 70090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm0 ; 70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm1, [rax] ; 70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 70390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm2, [rax] ; 70490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, mm0 ; 70590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 70690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm4, mm0 ; 70790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm3, [rax+8] ; 70890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 70990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm4, [rax+8] ; 71090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, mm3 ; 71190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 71290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, mm4 ; 713538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(mmx_bi_rd)] ; 71490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 71590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, mmx_filter_shift ; 
716538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(mmx_bi_rd)] ; 71790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 71890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, mmx_filter_shift ; 71990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 72090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 72190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packuswb mm5, mm2 ; 72290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT 72390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, dword ptr arg(1) ;ref_pixels_per_line 72490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else 72590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r8, dword ptr arg(1) ;ref_pixels_per_line 72690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, r8 72790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif 72890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 72990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberfilter_block2d_bil_var_mmx_loop: 73090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 73190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm1, [rsi] ; 73290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rsi+1] ; 73390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 73490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm2, mm1 ; 73590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm3 ; 73690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 73790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm1, mm0 ; 73890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm2, mm0 ; 73990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm1, [rax] ; 74190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm2, [rax] ; 74290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
74390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, mm0 ; 74490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm4, mm0 ; 74590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm3, [rax+8] ; 74790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm4, [rax+8] ; 74890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 74990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, mm3 ; 75090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, mm4 ; 75190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 752538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(mmx_bi_rd)] ; 75390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, mmx_filter_shift ; 75490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 755538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(mmx_bi_rd)] ; 75690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, mmx_filter_shift ; 75790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 75890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, mm5 ; 75990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm5 ; 76090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 76190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, mm0 ; 76290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm4, mm0 ; 76390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 76490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, mm1 ; 76590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber packuswb mm5, mm2 ; 76690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 76790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm3, [rdx] ; 76890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm4, [rdx] ; 76990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 77090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw 
mm1, [rdx+8] ; 77190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm2, [rdx+8] ; 77290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 77390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm1, mm3 ; 77490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm2, mm4 ; 77590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 776538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm1, [GLOBAL(mmx_bi_rd)] ; 777538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber paddw mm2, [GLOBAL(mmx_bi_rd)] ; 77890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 77990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm1, mmx_filter_shift ; 78090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psraw mm2, mmx_filter_shift ; 78190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 78290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm3, [rdi] ; 78390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm3 ; 78490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 78590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, mm0 ; 78690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhbw mm4, mm0 ; 78790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 78890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubw mm1, mm3 ; 78990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubw mm2, mm4 ; 79090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 79190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm6, mm1 ; 79290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm1, mm1 ; 79390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 79490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm6, mm2 ; 79590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd mm2, mm2 ; 79690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 79790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm1 ; 79890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd 
mm7, mm2 ; 79990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 80090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT 80190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, dword ptr arg(1) ;ref_pixels_per_line ; 80290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, dword ptr arg(3) ;src_pixels_per_line ; 80390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else 80490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; 80590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; 80690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, r8 80790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, r9 80890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif 80990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rcx, 1 ; 81090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber jnz filter_block2d_bil_var_mmx_loop ; 81190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 81290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 81390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, mm3 ; 81490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 ; 81590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 81690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; 81790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm3, mm6 ; 81890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 81990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm2, mm3 ; 82090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm2 ; 82190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 82290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm6, 32 ; 82390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm2, mm6 ; 82490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 82590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrad 
mm2, 16 ; 82690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm7 ; 82790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 82890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm4, 32 ; 82990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm4, mm7 ; 83090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 83190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(7) ;sum 83290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(8) ;sumsquared 83390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 83490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd dword ptr [rdi], mm2 ; 83590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd dword ptr [rsi], mm4 ; 83690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 83790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 83890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 16 83990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 84090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 84190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 84290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 84390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 84490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 84590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 84690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;unsigned int vp8_get16x16pred_error_mmx 84790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 84890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 84990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_stride, 85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *ref_ptr, 85190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int ref_stride 85290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal 
sym(vp8_get16x16pred_error_mmx)     ; operand of the `global` directive that ends the preceding line

;-----------------------------------------------------------------------------
; vp8_get16x16pred_error_mmx
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr (two 8-byte loads per
; row, advancing by src_stride / ref_stride after each row) and returns
;
;       SSE - ((Sum * Sum) >> 8)
;
; where Sum is the sum of the 256 byte differences (src - ref) and SSE is the
; sum of their squares.
;
;   arg(0)  src_ptr     first byte of the source block
;   arg(1)  src_stride  byte distance between source rows (read as a dword)
;   arg(2)  ref_ptr     first byte of the reference block
;   arg(3)  ref_stride  byte distance between reference rows (read as a dword)
;
; Result is left in rax.  Uses mm0-mm7 and 16 bytes of stack scratch; no emms
; is issued here.
;-----------------------------------------------------------------------------
sym(vp8_get16x16pred_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                     ; scratch: [rsp] = Sum, [rsp+4] = SSE
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; ref_ptr

    movsxd      rax, DWORD PTR arg(1)       ; src_stride
    movsxd      rdx, DWORD PTR arg(3)       ; ref_stride

    pxor        mm0, mm0                    ; mm0 = 0, used to widen bytes to words
    pxor        mm7, mm7                    ; mm7 = four 16-bit running sums of differences

    pxor        mm6, mm6                    ; mm6 = two 32-bit running sums of squares
    mov         rcx, 16                     ; row counter

var16loop:

    ; ---- bytes 0..7 of the current row ----
    movq        mm1, [rsi]
    movq        mm2, [rdi]

    movq        mm3, mm1
    movq        mm4, mm2

    punpcklbw   mm1, mm0                    ; src bytes 0..3 -> words
    punpckhbw   mm3, mm0                    ; src bytes 4..7 -> words

    punpcklbw   mm2, mm0                    ; ref bytes 0..3 -> words
    punpckhbw   mm4, mm0                    ; ref bytes 4..7 -> words

    psubw       mm1, mm2                    ; word differences, each in [-255, 255]
    psubw       mm3, mm4

    paddw       mm7, mm1                    ; accumulate differences before they are squared
    pmaddwd     mm1, mm1                    ; square and pair-wise add -> two dwords

    paddw       mm7, mm3
    pmaddwd     mm3, mm3

    paddd       mm6, mm1
    paddd       mm6, mm3


    ; ---- bytes 8..15 of the current row ----
    movq        mm1, [rsi+8]
    movq        mm2, [rdi+8]

    movq        mm3, mm1
    movq        mm4, mm2

    punpcklbw   mm1, mm0
    punpckhbw   mm3, mm0

    punpcklbw   mm2, mm0
    punpckhbw   mm4, mm0

    psubw       mm1, mm2
    psubw       mm3, mm4

    paddw       mm7, mm1
    pmaddwd     mm1, mm1

    paddw       mm7, mm3
    pmaddwd     mm3, mm3

    paddd       mm6, mm1
    paddd       mm6, mm3

    add         rsi, rax                    ; next source row
    add         rdi, rdx                    ; next reference row

    sub         rcx, 1
    jnz         var16loop

    ; Each word lane of mm7 has received 4 differences per row for 16 rows,
    ; i.e. 64 values of magnitude <= 255, so |lane| <= 16320 and the 16-bit
    ; accumulation above cannot overflow.

    movq        mm1, mm6                    ; keep the SSE partial sums
    pxor        mm6, mm6

    pxor        mm5, mm5
    punpcklwd   mm6, mm7                    ; low two sums into the high halves of two dwords

    punpckhwd   mm5, mm7                    ; high two sums likewise
    psrad       mm5, 16                     ; arithmetic shift back down = sign extension

    psrad       mm6, 16
    paddd       mm6, mm5                    ; two signed 32-bit partial sums

    movq        mm2, mm1
    psrlq       mm1, 32

    paddd       mm2, mm1                    ; low dword of mm2 = SSE
    movq        mm7, mm6

    psrlq       mm6, 32
    paddd       mm6, mm7                    ; low dword of mm6 = Sum

    movd        DWORD PTR [rsp], mm6        ; Sum
    movd        DWORD PTR [rsp+4], mm2      ; SSE

    ; return (SSE-((Sum*Sum)>>8));
    movsxd      rdx, dword ptr [rsp]
    imul        rdx, rdx                    ; Sum*Sum, never negative, at most 65280^2 < 2^32
    ; Logical shift, not arithmetic: where rdx is a 32-bit register (the
    ; ABI_IS_32BIT configuration; presumably the ABI include aliases the
    ; r* names there), a square >= 2^31 has bit 31 set and `sar` would
    ; treat it as a sign bit.  For a non-negative value that fits the
    ; register `shr` and `sar` agree, so 64-bit results are unchanged.
    shr         rdx, 8
    movsxd      rax, dword ptr [rsp + 4]
    sub         rax, rdx


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
    times 4 dw 64