;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define xmm_filter_shift            7

;unsigned int vp8_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Sums the squares of the 256 signed 16-bit values starting at src_ptr
; (8 iterations x 64 bytes) and returns the total in the low 32 bits of rax.
; The loads use movdqa, so src_ptr must be 16-byte aligned.
; Register roles: rax = read pointer, rcx = iterations left,
;                 xmm4 = four dword partial sums.
global sym(vp8_get_mb_ss_sse2)
sym(vp8_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ; rax = src_ptr
    mov         rcx, 8                  ; 8 passes of 32 shorts each
    pxor        xmm4, xmm4              ; clear the partial sums

.next_row:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]

    ; pmaddwd x, x squares each word and adds adjacent pairs into dwords
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    ; jnz, not ja: dec leaves CF untouched, so ja would be testing the
    ; carry produced by the pointer add above rather than the counter.
    jnz         .next_row

    ; horizontal add of the four dwords in xmm4
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    ; low dword of rax = total; the upper dword carries a partial sum
    ; and is not part of the unsigned int result
    movq        rax, xmm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For 16 rows of 16 bytes, computes the per-pixel difference src - ref and
; stores the sum of differences to *Sum and the sum of squared differences
; to *SSE. Rows are read with unaligned loads (movdqu).
; Register roles: rsi/rdi = current src/ref row, rax/rdx = strides,
;                 rcx = rows left, xmm0 = zero, xmm7 = eight word sums of
;                 differences, xmm6 = four dword sums of squares.
; NOTE(review): xmm6/xmm7 are used without being saved; they are callee-saved
;               under the Microsoft x64 ABI - confirm for Win64 builds.
; NOTE(review): rax is left holding the Sum pointer at ret; no explicit
;               return value is set despite the declared unsigned int.
global sym(vp8_get16x16var_sse2)
sym(vp8_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch the first eight rows of the source block
    lea         rcx, [rax+rax*2]        ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]        ; rbx = row 4 of src
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; Prefetch the first eight rows of the reference block
    lea         rcx, [rdx+rdx*2]        ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]        ; rbx = row 4 of ref
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; row counter

.row_loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    ; request the rows eight strides ahead
    prefetcht0  [rsi+rax*8]
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    ; widen bytes to words: xmm1/xmm2 = low 8 pixels, xmm3/xmm4 = high 8
    punpcklbw   xmm1, xmm0
    punpckhbw   xmm3, xmm0

    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    psubw       xmm1, xmm2              ; src - ref, low half
    psubw       xmm3, xmm4              ; src - ref, high half

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .row_loop

    movdqa      xmm1, xmm6              ; xmm1 = sums of squares
    pxor        xmm6, xmm6

    ; sign-extend the eight word sums to dwords: place each word in the
    ; upper half of a dword, then shift right arithmetically by 16
    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16

    psrad       xmm6, 16
    paddd       xmm6, xmm5

    ; horizontal add of xmm1 (squares) and xmm6 (sums), interleaved
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; low dword of xmm7 = Sum
    paddd       xmm1, xmm2              ; low dword of xmm1 = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get16x16pred_error_sse2
;(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride
;)
global sym(vp8_get16x16pred_error_sse2)
sym(vp8_get16x16pred_error_sse2):
    ; Walks 16 rows of 16 bytes from src_ptr and ref_ptr, accumulating the
    ; sum of differences (Sum) and the sum of squared differences (SSE),
    ; then returns SSE - ((Sum * Sum) >> 8) in rax.
    ; Register roles: rsi/rdi = current src/ref row, rax/rdx = strides,
    ;                 rcx = rows left, xmm0 = zero, xmm7 = word sums of
    ;                 differences, xmm6 = dword sums of squares.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                 ; scratch for Sum at [rsp], SSE at [rsp+4]
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[src_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[ref_stride]

    pxor        xmm0, xmm0              ; zero, used for byte->word unpack
    pxor        xmm7, xmm7              ; running sums of differences
    pxor        xmm6, xmm6              ; running sums of squares
    mov         rcx, 16                 ; rows remaining

.row_loop:
    ; source row -> words (xmm1 = low 8 pixels, xmm3 = high 8 pixels)
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqa      xmm3, xmm1
    punpcklbw   xmm1, xmm0
    punpckhbw   xmm3, xmm0

    ; reference row -> words (xmm2 = low 8 pixels, xmm4 = high 8 pixels)
    movdqu      xmm2, XMMWORD PTR [rdi]
    movdqa      xmm4, xmm2
    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    ; low half: difference, accumulate, square-and-accumulate
    psubw       xmm1, xmm2
    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm6, xmm1

    ; high half: same steps
    psubw       xmm3, xmm4
    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .row_loop

    ; ---- reduce the four dword square sums to one (low dword of xmm1) ----
    movdqa      xmm1, xmm6
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    ; ---- sign-extend the eight word sums to dwords, then reduce them ----
    ; each word is placed in the upper half of a dword and shifted back
    ; down arithmetically so the sign is preserved
    pxor        xmm6, xmm6
    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7
    punpckhwd   xmm5, xmm7
    psrad       xmm6, 16
    psrad       xmm5, 16
    paddd       xmm6, xmm5

    movdqa      xmm7, xmm6
    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7
    movdqa      xmm7, xmm6
    psrldq      xmm6, 8
    paddd       xmm7, xmm6              ; low dword of xmm7 = Sum

    movd        DWORD PTR [rsp], xmm7   ;Sum
    movd        DWORD PTR [rsp+4], xmm1 ;SSE

    ; return (SSE-((Sum*Sum)>>8));
    movsxd      rdx, dword ptr [rsp]
    imul        rdx, rdx
    sar         rdx, 8
    movsxd      rax, dword ptr [rsp + 4]
    sub         rax, rdx

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int vp8_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char * ref_ptr, 33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int recon_stride, 33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int * SSE, 33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int * Sum 33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_get8x8var_sse2) 34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersym(vp8_get8x8var_sse2): 34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 6 34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rsp, 16 34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;[src_ptr] 35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(2) ;[ref_ptr] 35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, DWORD PTR arg(1) ;[source_stride] 35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rdx, DWORD PTR arg(3) ;[recon_stride] 35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm0, xmm0 ; clear xmm0 for unpack 35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs 35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm1, QWORD PTR [rsi] 
36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm2, QWORD PTR [rdi] 36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm1, xmm0 36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm2, xmm0 36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm1, xmm2 36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm1 36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm1, xmm1 36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm2, QWORD PTR[rsi + rax] 37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR[rdi + rdx] 37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm2, xmm0 37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm0 37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm2, xmm3 37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm2 37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm2, xmm2 38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm2, QWORD PTR[rsi + rax * 2] 38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR[rdi + rdx * 2] 38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm2, xmm0 38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw 
xmm3, xmm0 38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm2, xmm3 39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm2 39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm2, xmm2 39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax * 2] 39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rdi + rdx * 2] 39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm2, QWORD PTR[rsi + rax] 39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR[rdi + rdx] 40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm2, xmm0 40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm0 40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm2, xmm3 40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm2 40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm2, xmm2 40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm2, QWORD PTR[rsi + rax *2] 41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR[rdi + rdx *2] 41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm2, xmm0 41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm0 
41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm2, xmm3 41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm2 41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm2, xmm2 42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax * 2] 42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rdi + rdx * 2] 42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm2, QWORD PTR[rsi + rax] 42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR[rdi + rdx] 42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm2, xmm0 43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm0 43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm2, xmm3 43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm2 43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm2, xmm2 43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm2, QWORD PTR[rsi + rax *2] 44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR[rdi + rdx *2] 44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm2, xmm0 
44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm0 44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm2, xmm3 44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm2 44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm2, xmm2 44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rsi, [rsi + rax * 2] 45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber lea rdi, [rdi + rdx * 2] 45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm2, QWORD PTR[rsi + rax] 45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR[rdi + rdx] 45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm2, xmm0 45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm0 46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm2, xmm3 46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm2 46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm2, xmm2 46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm6, xmm7 46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd xmm6, xmm0 47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd xmm7, xmm0 47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm2, xmm1 47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm6, xmm7 47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm1, xmm0 47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq xmm2, xmm0 47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm7, xmm6 47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckldq xmm6, xmm0 48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhdq xmm7, xmm0 48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm6, xmm7 48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm2, xmm1 48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm7, xmm6 48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm1, 8 49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm6, 8 49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm7, xmm6 49390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm1, xmm2 49490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 49590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rax, arg(5) ;[Sum] 49690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(4) ;[SSE] 49790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 498538f6170b788de7408b06efc6613dc98579aa6a6Andreas Huber movq rdx, xmm7 49990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber movsx rcx, dx 50090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov dword ptr [rax], ecx 50290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd DWORD PTR [rdi], xmm1 50390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 50490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 50590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsp, 16 50690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rdi 50790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 50890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 50990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 51090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 51290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 51390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;void vp8_filter_block2d_bil_var_sse2 51490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *ref_ptr, 51690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int ref_pixels_per_line, 51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 51890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixels_per_line, 51990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int Height, 52079f15823c34ae1e423108295e416213200bb280fAndreas Huber; int xoffset, 52179f15823c34ae1e423108295e416213200bb280fAndreas Huber; int yoffset, 52290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int *sum, 52390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int *sumsquared;; 52490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; 52590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 52690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberglobal sym(vp8_filter_block2d_bil_var_sse2) 
sym(vp8_filter_block2d_bil_var_sse2):
    ; Bilinear-filters an 8-pixel-wide column of Height rows read from
    ; ref_ptr, subtracts the matching src_ptr rows, and writes the sum of
    ; the differences to *sum and the sum of squared differences to
    ; *sumsquared.
    ;
    ; Four code paths, selected by xoffset (arg 5) and yoffset (arg 6):
    ;   both non-zero : horizontal pass followed by vertical pass
    ;   xoffset == 0  : vertical pass only     (..._sp_only)
    ;   yoffset == 0  : horizontal pass only   (..._fp_only)
    ;   both zero     : plain difference       (..._full_pixel)
    ;
    ; Register roles shared by every path:
    ;   xmm0 = zero (byte -> word unpack)
    ;   xmm4 = rounding vector loaded from xmm_bi_rd
    ;   xmm6 = 8 x 16-bit running sums of differences
    ;   xmm7 = 4 x 32-bit running sums of squared differences
    ;   rsi  = current ref row, rdi = current src row, rcx = rows left
    ;
    ; NOTE: the final reduction runs in MMX registers and this routine
    ; does not execute emms.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        xmm7, xmm7                      ; squared-difference accumulator
    pxor        xmm6, xmm6                      ; difference accumulator

    lea         rsi, [GLOBAL(xmm_bi_rd)]
    movdqa      xmm4, XMMWORD PTR [rsi]         ; rounding term

    lea         rcx, [GLOBAL(vp8_bilinear_filters_sse2)]    ; filter table base
    movsxd      rax, dword ptr arg(5)           ; xoffset

    test        rax, rax
    jz          filter_block2d_bil_var_sse2_sp_only         ; no horizontal pass

    shl         rax, 5                          ; 32 bytes of coefficients per offset
    add         rax, rcx                        ; rax -> HFilter ([rax], [rax+16])

    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx
    jz          filter_block2d_bil_var_sse2_fp_only         ; no vertical pass

    shl         rdx, 5
    add         rdx, rcx                        ; rdx -> VFilter ([rdx], [rdx+16])

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height

    ; Horizontally filter the first ref row; it seeds xmm5, the
    ; "previous row" input of the vertical pass.
    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]
    punpcklbw   xmm1, xmm0
    punpcklbw   xmm3, xmm0
    pmullw      xmm1, [rax]
    pmullw      xmm3, [rax+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift
    movdqa      xmm5, xmm1

    movsxd      rbx, dword ptr arg(1)           ; ref_pixels_per_line
    add         rsi, rbx
%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)            ; src_pixels_per_line
%endif

filter_block2d_bil_var_sse2_loop:
    ; horizontal pass on the current ref row
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]
    punpcklbw   xmm1, xmm0
    punpcklbw   xmm3, xmm0
    pmullw      xmm1, [rax]
    pmullw      xmm3, [rax+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movdqa      xmm3, xmm5                      ; previous filtered row
    movdqa      xmm5, xmm1                      ; current row becomes "previous"

    ; vertical pass across the two filtered rows
    pmullw      xmm3, [rdx]
    pmullw      xmm1, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3                      ; filtered ref - src
    paddw       xmm6, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    add         rsi, rbx                        ; next ref row
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)           ; next src row
%else
    add         rdi, r9                         ; next src row
%endif

    dec         rcx
    jnz         filter_block2d_bil_var_sse2_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_sp_only:
    ; Reached with xoffset == 0; rcx still holds the filter table base.
    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx
    jz          filter_block2d_bil_var_sse2_full_pixel      ; both offsets are zero

    shl         rdx, 5
    add         rdx, rcx                        ; rdx -> VFilter

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line

    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0                      ; xmm1 = first ref row as words

    movsxd      rbx, dword ptr arg(3)           ; src_pixels_per_line
    add         rsi, rax

filter_block2d_bil_sp_only_loop:
    movq        xmm3, QWORD PTR [rsi]
    punpcklbw   xmm3, xmm0
    movdqa      xmm5, xmm3                      ; keep the unfiltered row for the next pass

    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    movdqa      xmm1, xmm5                      ; this row is the upper row next time
    add         rsi, rax                        ; next ref row
    add         rdi, rbx                        ; next src row

    dec         rcx
    jnz         filter_block2d_bil_sp_only_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_full_pixel:
    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rax, dword ptr arg(1)           ; ref_pixels_per_line
    movsxd      rbx, dword ptr arg(3)           ; src_pixels_per_line
    pxor        xmm0, xmm0

filter_block2d_bil_full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubw       xmm1, xmm2                      ; ref - src, no filtering
    paddw       xmm6, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    add         rsi, rax                        ; next ref row
    add         rdi, rbx                        ; next src row

    dec         rcx
    jnz         filter_block2d_bil_full_pixel_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_fp_only:
    ; Reached with yoffset == 0; rax already points at HFilter.
    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height
    movsxd      rdx, dword ptr arg(1)           ; ref_pixels_per_line

    pxor        xmm0, xmm0
    movsxd      rbx, dword ptr arg(3)           ; src_pixels_per_line

filter_block2d_bil_fp_only_loop:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]
    punpcklbw   xmm1, xmm0
    punpcklbw   xmm3, xmm0
    pmullw      xmm1, [rax]
    pmullw      xmm3, [rax+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    add         rsi, rdx                        ; next ref row
    add         rdi, rbx                        ; next src row

    dec         rcx
    jnz         filter_block2d_bil_fp_only_loop
    ; falls through into the reduction below

filter_block2d_bil_variance:
    ; Squared differences: fold the four dwords of xmm7 into mm4's low dword.
    movdq2q     mm7, xmm7
    psrldq      xmm7, 8
    movdq2q     mm3, xmm7
    paddd       mm7, mm3
    movq        mm4, mm7
    psrlq       mm4, 32
    paddd       mm4, mm7

    ; Differences: fold the eight words of xmm6 into four words in mm6.
    movdq2q     mm6, xmm6
    psrldq      xmm6, 8
    movdq2q     mm2, xmm6
    paddw       mm6, mm2

    ; Place each word in the upper half of a dword, add them all up, then
    ; shift right arithmetically by 16 to get a sign-extended 32-bit sum.
    pxor        mm2, mm2
    pxor        mm3, mm3
    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6
    paddd       mm2, mm3
    movq        mm6, mm2
    psrlq       mm6, 32
    paddd       mm2, mm6
    psrad       mm2, 16

    mov         rsi, arg(7)                     ; sum
    mov         rdi, arg(8)                     ; sumsquared

    movd        [rsi], mm2                      ; xsum
    movd        [rdi], mm4                      ; xxsum

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, averages (pavgb) an 8-pixel ref row with the
; same row shifted one pixel right, averages that with the equally
; averaged next row, subtracts the 8 src pixels, and accumulates the
; differences and their squares. The totals are stored to *sum and
; *sumsquared.
;
; Register roles:
;   xmm0 = zero (byte -> word unpack)
;   xmm5 = horizontal average of the previous ref row
;   xmm6 = 8 x 16-bit running sums of differences
;   xmm7 = 4 x 32-bit running sums of squared differences
;   rsi  = current ref row, rdi = current src row, rcx = rows left
;   r8/r9 (64-bit builds only) = ref / src stride
;
; xmm6 and xmm7 are callee-saved under the Microsoft x64 ABI, so they are
; bracketed with SAVE_XMM / RESTORE_XMM, placed the same way as in the
; other routines of this file that use those registers.
;
; NOTE: the final reduction runs in MMX registers and this routine does
; not execute emms.
global sym(vp8_half_horiz_vert_variance8x_h_sse2)
sym(vp8_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)            ; ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)            ; src_pixels_per_line
%endif

    pxor        xmm6, xmm6                      ; difference accumulator
    pxor        xmm7, xmm7                      ; squared-difference accumulator
    pxor        xmm0, xmm0

    mov         rsi, arg(0)                     ; ref_ptr
    mov         rdi, arg(2)                     ; src_ptr
    movsxd      rcx, dword ptr arg(4)           ; Height

    movq        xmm5, QWORD PTR [rsi]           ; xmm5 = s0..s7
    movq        xmm3, QWORD PTR [rsi+1]         ; xmm3 = s1..s8
    pavgb       xmm5, xmm3                      ; horizontal average of row 0

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)           ; next ref row
%else
    add         rsi, r8                         ; next ref row
%endif

vp8_half_horiz_vert_variance8x_h_1:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2                      ; horizontal average of row i+1

    pavgb       xmm5, xmm1                      ; vertical average of rows i and i+1
    punpcklbw   xmm5, xmm0                      ; widen to words

    movq        xmm3, QWORD PTR [rdi]           ; xmm3 = d0..d7
    punpcklbw   xmm3, xmm0                      ; widen to words

    psubw       xmm5, xmm3                      ; averaged ref - src
    paddw       xmm6, xmm5                      ; accumulate differences
    pmaddwd     xmm5, xmm5                      ; square and pair-add
    paddd       xmm7, xmm5                      ; accumulate squared differences

    movdqa      xmm5, xmm1                      ; row i+1 is the upper row next time

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)           ; next ref row
    add         rdi, dword ptr arg(3)           ; next src row
%else
    add         rsi, r8                         ; next ref row
    add         rdi, r9                         ; next src row
%endif

    sub         rcx, 1
    jnz         vp8_half_horiz_vert_variance8x_h_1

    ; Fold the upper halves of both accumulators onto the lower halves.
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2                        ; 4 words of difference sums
    paddd       mm7, mm3                        ; 2 dwords of squared sums

    ; Place each word in the upper half of a dword, add them all up, then
    ; shift right arithmetically by 16 to get a sign-extended 32-bit sum.
    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                        ; low dword = total of squares

    mov         rsi, arg(5)                     ; sum
    mov         rdi, arg(6)                     ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned
int *sumsquared 92279f15823c34ae1e423108295e416213200bb280fAndreas Huber;) 92379f15823c34ae1e423108295e416213200bb280fAndreas Huberglobal sym(vp8_half_horiz_vert_variance16x_h_sse2) 92479f15823c34ae1e423108295e416213200bb280fAndreas Hubersym(vp8_half_horiz_vert_variance16x_h_sse2): 92579f15823c34ae1e423108295e416213200bb280fAndreas Huber push rbp 92679f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rbp, rsp 92779f15823c34ae1e423108295e416213200bb280fAndreas Huber SHADOW_ARGS_TO_STACK 7 92879f15823c34ae1e423108295e416213200bb280fAndreas Huber SAVE_XMM 92979f15823c34ae1e423108295e416213200bb280fAndreas Huber GET_GOT rbx 93079f15823c34ae1e423108295e416213200bb280fAndreas Huber push rsi 93179f15823c34ae1e423108295e416213200bb280fAndreas Huber push rdi 93279f15823c34ae1e423108295e416213200bb280fAndreas Huber ; end prolog 93379f15823c34ae1e423108295e416213200bb280fAndreas Huber 93479f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm6, xmm6 ; error accumulator 93579f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm7, xmm7 ; sse eaccumulator 93679f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rsi, arg(0) ;ref_ptr ; 93779f15823c34ae1e423108295e416213200bb280fAndreas Huber 93879f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rdi, arg(2) ;src_ptr ; 93979f15823c34ae1e423108295e416213200bb280fAndreas Huber movsxd rcx, dword ptr arg(4) ;Height ; 94079f15823c34ae1e423108295e416213200bb280fAndreas Huber movsxd rax, dword ptr arg(1) ;ref_pixels_per_line 94179f15823c34ae1e423108295e416213200bb280fAndreas Huber movsxd rdx, dword ptr arg(3) ;src_pixels_per_line 94290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 94379f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm0, xmm0 ; 94479f15823c34ae1e423108295e416213200bb280fAndreas Huber 94579f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqu xmm5, XMMWORD PTR [rsi] 94679f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqu xmm3, XMMWORD PTR [rsi+1] 
94779f15823c34ae1e423108295e416213200bb280fAndreas Huber pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 94879f15823c34ae1e423108295e416213200bb280fAndreas Huber 94979f15823c34ae1e423108295e416213200bb280fAndreas Huber lea rsi, [rsi + rax] 95079f15823c34ae1e423108295e416213200bb280fAndreas Huber 95179f15823c34ae1e423108295e416213200bb280fAndreas Hubervp8_half_horiz_vert_variance16x_h_1: 95279f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqu xmm1, XMMWORD PTR [rsi] ; 95379f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqu xmm2, XMMWORD PTR [rsi+1] ; 95479f15823c34ae1e423108295e416213200bb280fAndreas Huber pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 95579f15823c34ae1e423108295e416213200bb280fAndreas Huber 95679f15823c34ae1e423108295e416213200bb280fAndreas Huber pavgb xmm5, xmm1 ; xmm = vertical average of the above 95779f15823c34ae1e423108295e416213200bb280fAndreas Huber 95879f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm4, xmm5 95979f15823c34ae1e423108295e416213200bb280fAndreas Huber punpcklbw xmm5, xmm0 ; xmm5 = words of above 96079f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckhbw xmm4, xmm0 96179f15823c34ae1e423108295e416213200bb280fAndreas Huber 96279f15823c34ae1e423108295e416213200bb280fAndreas Huber movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 96379f15823c34ae1e423108295e416213200bb280fAndreas Huber punpcklbw xmm3, xmm0 ; xmm3 = words of above 96479f15823c34ae1e423108295e416213200bb280fAndreas Huber psubw xmm5, xmm3 ; xmm5 -= xmm3 96579f15823c34ae1e423108295e416213200bb280fAndreas Huber 96679f15823c34ae1e423108295e416213200bb280fAndreas Huber movq xmm3, QWORD PTR [rdi+8] 96779f15823c34ae1e423108295e416213200bb280fAndreas Huber punpcklbw xmm3, xmm0 96879f15823c34ae1e423108295e416213200bb280fAndreas Huber psubw xmm4, xmm3 96979f15823c34ae1e423108295e416213200bb280fAndreas Huber 97079f15823c34ae1e423108295e416213200bb280fAndreas Huber paddw xmm6, xmm5 ; xmm6 += accumulated column 
differences 97179f15823c34ae1e423108295e416213200bb280fAndreas Huber paddw xmm6, xmm4 97279f15823c34ae1e423108295e416213200bb280fAndreas Huber pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 97379f15823c34ae1e423108295e416213200bb280fAndreas Huber pmaddwd xmm4, xmm4 97479f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm7, xmm5 ; xmm7 += accumulated square column differences 97579f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm7, xmm4 97679f15823c34ae1e423108295e416213200bb280fAndreas Huber 97779f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm5, xmm1 ; save xmm1 for use on the next row 97879f15823c34ae1e423108295e416213200bb280fAndreas Huber 97979f15823c34ae1e423108295e416213200bb280fAndreas Huber lea rsi, [rsi + rax] 98079f15823c34ae1e423108295e416213200bb280fAndreas Huber lea rdi, [rdi + rdx] 98179f15823c34ae1e423108295e416213200bb280fAndreas Huber 98279f15823c34ae1e423108295e416213200bb280fAndreas Huber sub rcx, 1 ; 98379f15823c34ae1e423108295e416213200bb280fAndreas Huber jnz vp8_half_horiz_vert_variance16x_h_1 ; 98479f15823c34ae1e423108295e416213200bb280fAndreas Huber 98579f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm1, xmm1 98679f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm5, xmm5 98779f15823c34ae1e423108295e416213200bb280fAndreas Huber 98879f15823c34ae1e423108295e416213200bb280fAndreas Huber punpcklwd xmm0, xmm6 98979f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckhwd xmm1, xmm6 99079f15823c34ae1e423108295e416213200bb280fAndreas Huber psrad xmm0, 16 99179f15823c34ae1e423108295e416213200bb280fAndreas Huber psrad xmm1, 16 99279f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm0, xmm1 99379f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm1, xmm0 99479f15823c34ae1e423108295e416213200bb280fAndreas Huber 99579f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm6, xmm7 99679f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckldq xmm6, xmm5 
99779f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckhdq xmm7, xmm5 99879f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm6, xmm7 99979f15823c34ae1e423108295e416213200bb280fAndreas Huber 100079f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckldq xmm0, xmm5 100179f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckhdq xmm1, xmm5 100279f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm0, xmm1 100379f15823c34ae1e423108295e416213200bb280fAndreas Huber 100479f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm7, xmm6 100579f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm1, xmm0 100679f15823c34ae1e423108295e416213200bb280fAndreas Huber 100779f15823c34ae1e423108295e416213200bb280fAndreas Huber psrldq xmm7, 8 100879f15823c34ae1e423108295e416213200bb280fAndreas Huber psrldq xmm1, 8 100979f15823c34ae1e423108295e416213200bb280fAndreas Huber 101079f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm6, xmm7 101179f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm0, xmm1 101279f15823c34ae1e423108295e416213200bb280fAndreas Huber 101379f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rsi, arg(5) ;[Sum] 101479f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rdi, arg(6) ;[SSE] 101579f15823c34ae1e423108295e416213200bb280fAndreas Huber 101679f15823c34ae1e423108295e416213200bb280fAndreas Huber movd [rsi], xmm0 101779f15823c34ae1e423108295e416213200bb280fAndreas Huber movd [rdi], xmm6 101879f15823c34ae1e423108295e416213200bb280fAndreas Huber 101979f15823c34ae1e423108295e416213200bb280fAndreas Huber ; begin epilog 102079f15823c34ae1e423108295e416213200bb280fAndreas Huber pop rdi 102179f15823c34ae1e423108295e416213200bb280fAndreas Huber pop rsi 102279f15823c34ae1e423108295e416213200bb280fAndreas Huber RESTORE_GOT 102379f15823c34ae1e423108295e416213200bb280fAndreas Huber RESTORE_XMM 102479f15823c34ae1e423108295e416213200bb280fAndreas Huber UNSHADOW_ARGS 
102579f15823c34ae1e423108295e416213200bb280fAndreas Huber pop rbp 102679f15823c34ae1e423108295e416213200bb280fAndreas Huber ret 102779f15823c34ae1e423108295e416213200bb280fAndreas Huber 102879f15823c34ae1e423108295e416213200bb280fAndreas Huber 102979f15823c34ae1e423108295e416213200bb280fAndreas Huber;void vp8_half_vert_variance8x_h_sse2 103090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 103190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *ref_ptr, 103290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int ref_pixels_per_line, 103390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 103490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixels_per_line, 103590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int Height, 103690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int *sum, 103790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int *sumsquared 103890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 103979f15823c34ae1e423108295e416213200bb280fAndreas Huberglobal sym(vp8_half_vert_variance8x_h_sse2) 104079f15823c34ae1e423108295e416213200bb280fAndreas Hubersym(vp8_half_vert_variance8x_h_sse2): 104190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 104290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 104390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 7 104490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 104590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 104690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 104790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 104890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 104990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT=0 105090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r8, dword ptr arg(1) ;ref_pixels_per_line 
105190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r9, dword ptr arg(3) ;src_pixels_per_line 105290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif 105390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm6, xmm6 ; error accumulator 105590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm7, xmm7 ; sse eaccumulator 105690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;ref_ptr ; 105790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 105890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(2) ;src_ptr ; 105990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(4) ;Height ; 106090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rax, dword ptr arg(1) ;ref_pixels_per_line 106190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 106290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm0, xmm0 ; 106379f15823c34ae1e423108295e416213200bb280fAndreas Hubervp8_half_vert_variance8x_h_1: 106490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 106590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 106690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 106790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) 106890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm5, xmm0 ; xmm5 = words of above 106990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 107190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm0 ; xmm3 = words of above 107290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubw xmm5, xmm3 ; xmm5 -= xmm3 
107490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm6, xmm5 ; xmm6 += accumulated column differences 107590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 107690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm7, xmm5 ; xmm7 += accumulated square column differences 107790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 107890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT 107990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source 108090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination 108190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else 108290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, r8 108390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, r9 108490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif 108590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rcx, 1 ; 108779f15823c34ae1e423108295e416213200bb280fAndreas Huber jnz vp8_half_vert_variance8x_h_1 ; 108890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 108990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdq2q mm6, xmm6 ; 109090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdq2q mm7, xmm7 ; 109190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm6, 8 109390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm7, 8 109490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdq2q mm2, xmm6 109690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdq2q mm3, xmm7 109790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 109890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm6, mm2 
109990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm3 110090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 110190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, mm3 ; 110290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 ; 110390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 110490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; 110590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm3, mm6 ; 110690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 110790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm2, mm3 ; 110890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm2 ; 110990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm6, 32 ; 111190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm2, mm6 ; 111290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrad mm2, 16 ; 111490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm7 ; 111590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm4, 32 ; 111790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm4, mm7 ; 111890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 111990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(5) ; sum 112090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(6) ; sumsquared 112190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 112290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi], mm2 ; 112390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi], mm4 ; 112490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 112590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 112690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; begin epilog 112790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
Huber pop rdi 112890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rsi 112990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber RESTORE_GOT 113090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber UNSHADOW_ARGS 113190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pop rbp 113290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ret 113390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 113479f15823c34ae1e423108295e416213200bb280fAndreas Huber;void vp8_half_vert_variance16x_h_sse2 113579f15823c34ae1e423108295e416213200bb280fAndreas Huber;( 113679f15823c34ae1e423108295e416213200bb280fAndreas Huber; unsigned char *ref_ptr, 113779f15823c34ae1e423108295e416213200bb280fAndreas Huber; int ref_pixels_per_line, 113879f15823c34ae1e423108295e416213200bb280fAndreas Huber; unsigned char *src_ptr, 113979f15823c34ae1e423108295e416213200bb280fAndreas Huber; int src_pixels_per_line, 114079f15823c34ae1e423108295e416213200bb280fAndreas Huber; unsigned int Height, 114179f15823c34ae1e423108295e416213200bb280fAndreas Huber; int *sum, 114279f15823c34ae1e423108295e416213200bb280fAndreas Huber; unsigned int *sumsquared 114379f15823c34ae1e423108295e416213200bb280fAndreas Huber;) 114479f15823c34ae1e423108295e416213200bb280fAndreas Huberglobal sym(vp8_half_vert_variance16x_h_sse2) 114579f15823c34ae1e423108295e416213200bb280fAndreas Hubersym(vp8_half_vert_variance16x_h_sse2): 114679f15823c34ae1e423108295e416213200bb280fAndreas Huber push rbp 114779f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rbp, rsp 114879f15823c34ae1e423108295e416213200bb280fAndreas Huber SHADOW_ARGS_TO_STACK 7 114979f15823c34ae1e423108295e416213200bb280fAndreas Huber SAVE_XMM 115079f15823c34ae1e423108295e416213200bb280fAndreas Huber GET_GOT rbx 115179f15823c34ae1e423108295e416213200bb280fAndreas Huber push rsi 115279f15823c34ae1e423108295e416213200bb280fAndreas Huber push rdi 115379f15823c34ae1e423108295e416213200bb280fAndreas Huber ; end prolog 
115479f15823c34ae1e423108295e416213200bb280fAndreas Huber 115579f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm6, xmm6 ; error accumulator 115679f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm7, xmm7 ; sse eaccumulator 115779f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rsi, arg(0) ;ref_ptr 115879f15823c34ae1e423108295e416213200bb280fAndreas Huber 115979f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rdi, arg(2) ;src_ptr 116079f15823c34ae1e423108295e416213200bb280fAndreas Huber movsxd rcx, dword ptr arg(4) ;Height 116179f15823c34ae1e423108295e416213200bb280fAndreas Huber movsxd rax, dword ptr arg(1) ;ref_pixels_per_line 116279f15823c34ae1e423108295e416213200bb280fAndreas Huber movsxd rdx, dword ptr arg(3) ;src_pixels_per_line 116390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 116479f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqu xmm5, XMMWORD PTR [rsi] 116579f15823c34ae1e423108295e416213200bb280fAndreas Huber lea rsi, [rsi + rax ] 116679f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm0, xmm0 116779f15823c34ae1e423108295e416213200bb280fAndreas Huber 116879f15823c34ae1e423108295e416213200bb280fAndreas Hubervp8_half_vert_variance16x_h_1: 116979f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqu xmm3, XMMWORD PTR [rsi] 117079f15823c34ae1e423108295e416213200bb280fAndreas Huber 117179f15823c34ae1e423108295e416213200bb280fAndreas Huber pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) 117279f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm4, xmm5 117379f15823c34ae1e423108295e416213200bb280fAndreas Huber punpcklbw xmm5, xmm0 117479f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckhbw xmm4, xmm0 117579f15823c34ae1e423108295e416213200bb280fAndreas Huber 117679f15823c34ae1e423108295e416213200bb280fAndreas Huber movq xmm2, QWORD PTR [rdi] 117779f15823c34ae1e423108295e416213200bb280fAndreas Huber punpcklbw xmm2, xmm0 117879f15823c34ae1e423108295e416213200bb280fAndreas Huber psubw xmm5, 
xmm2 117979f15823c34ae1e423108295e416213200bb280fAndreas Huber movq xmm2, QWORD PTR [rdi+8] 118079f15823c34ae1e423108295e416213200bb280fAndreas Huber punpcklbw xmm2, xmm0 118179f15823c34ae1e423108295e416213200bb280fAndreas Huber psubw xmm4, xmm2 118279f15823c34ae1e423108295e416213200bb280fAndreas Huber 118379f15823c34ae1e423108295e416213200bb280fAndreas Huber paddw xmm6, xmm5 ; xmm6 += accumulated column differences 118479f15823c34ae1e423108295e416213200bb280fAndreas Huber paddw xmm6, xmm4 118579f15823c34ae1e423108295e416213200bb280fAndreas Huber pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 118679f15823c34ae1e423108295e416213200bb280fAndreas Huber pmaddwd xmm4, xmm4 118779f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm7, xmm5 ; xmm7 += accumulated square column differences 118879f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm7, xmm4 118979f15823c34ae1e423108295e416213200bb280fAndreas Huber 119079f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm5, xmm3 119179f15823c34ae1e423108295e416213200bb280fAndreas Huber 119279f15823c34ae1e423108295e416213200bb280fAndreas Huber lea rsi, [rsi + rax] 119379f15823c34ae1e423108295e416213200bb280fAndreas Huber lea rdi, [rdi + rdx] 119479f15823c34ae1e423108295e416213200bb280fAndreas Huber 119579f15823c34ae1e423108295e416213200bb280fAndreas Huber sub rcx, 1 119679f15823c34ae1e423108295e416213200bb280fAndreas Huber jnz vp8_half_vert_variance16x_h_1 119779f15823c34ae1e423108295e416213200bb280fAndreas Huber 119879f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm1, xmm1 119979f15823c34ae1e423108295e416213200bb280fAndreas Huber pxor xmm5, xmm5 120079f15823c34ae1e423108295e416213200bb280fAndreas Huber 120179f15823c34ae1e423108295e416213200bb280fAndreas Huber punpcklwd xmm0, xmm6 120279f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckhwd xmm1, xmm6 120379f15823c34ae1e423108295e416213200bb280fAndreas Huber psrad xmm0, 16 120479f15823c34ae1e423108295e416213200bb280fAndreas Huber psrad xmm1, 16 
120579f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm0, xmm1 120679f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm1, xmm0 120779f15823c34ae1e423108295e416213200bb280fAndreas Huber 120879f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm6, xmm7 120979f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckldq xmm6, xmm5 121079f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckhdq xmm7, xmm5 121179f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm6, xmm7 121279f15823c34ae1e423108295e416213200bb280fAndreas Huber 121379f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckldq xmm0, xmm5 121479f15823c34ae1e423108295e416213200bb280fAndreas Huber punpckhdq xmm1, xmm5 121579f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm0, xmm1 121679f15823c34ae1e423108295e416213200bb280fAndreas Huber 121779f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm7, xmm6 121879f15823c34ae1e423108295e416213200bb280fAndreas Huber movdqa xmm1, xmm0 121979f15823c34ae1e423108295e416213200bb280fAndreas Huber 122079f15823c34ae1e423108295e416213200bb280fAndreas Huber psrldq xmm7, 8 122179f15823c34ae1e423108295e416213200bb280fAndreas Huber psrldq xmm1, 8 122279f15823c34ae1e423108295e416213200bb280fAndreas Huber 122379f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm6, xmm7 122479f15823c34ae1e423108295e416213200bb280fAndreas Huber paddd xmm0, xmm1 122579f15823c34ae1e423108295e416213200bb280fAndreas Huber 122679f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rsi, arg(5) ;[Sum] 122779f15823c34ae1e423108295e416213200bb280fAndreas Huber mov rdi, arg(6) ;[SSE] 122879f15823c34ae1e423108295e416213200bb280fAndreas Huber 122979f15823c34ae1e423108295e416213200bb280fAndreas Huber movd [rsi], xmm0 123079f15823c34ae1e423108295e416213200bb280fAndreas Huber movd [rdi], xmm6 123179f15823c34ae1e423108295e416213200bb280fAndreas Huber 123279f15823c34ae1e423108295e416213200bb280fAndreas Huber ; begin epilog 
123379f15823c34ae1e423108295e416213200bb280fAndreas Huber pop rdi 123479f15823c34ae1e423108295e416213200bb280fAndreas Huber pop rsi 123579f15823c34ae1e423108295e416213200bb280fAndreas Huber RESTORE_GOT 123679f15823c34ae1e423108295e416213200bb280fAndreas Huber RESTORE_XMM 123779f15823c34ae1e423108295e416213200bb280fAndreas Huber UNSHADOW_ARGS 123879f15823c34ae1e423108295e416213200bb280fAndreas Huber pop rbp 123979f15823c34ae1e423108295e416213200bb280fAndreas Huber ret 124079f15823c34ae1e423108295e416213200bb280fAndreas Huber 124179f15823c34ae1e423108295e416213200bb280fAndreas Huber 124279f15823c34ae1e423108295e416213200bb280fAndreas Huber;void vp8_half_horiz_variance8x_h_sse2 124390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;( 124490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *ref_ptr, 124590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int ref_pixels_per_line, 124690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned char *src_ptr, 124790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int src_pixels_per_line, 124890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int Height, 124990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; int *sum, 125090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber; unsigned int *sumsquared 125190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;) 125279f15823c34ae1e423108295e416213200bb280fAndreas Huberglobal sym(vp8_half_horiz_variance8x_h_sse2) 125379f15823c34ae1e423108295e416213200bb280fAndreas Hubersym(vp8_half_horiz_variance8x_h_sse2): 125490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rbp 125590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rbp, rsp 125690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber SHADOW_ARGS_TO_STACK 7 125790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber GET_GOT rbx 125890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rsi 125990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber push rdi 
126090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ; end prolog 126190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 126290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT=0 126390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r8, dword ptr arg(1) ;ref_pixels_per_line 126490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd r9, dword ptr arg(3) ;src_pixels_per_line 126590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif 126690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 126790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm6, xmm6 ; error accumulator 126890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm7, xmm7 ; sse eaccumulator 126990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(0) ;ref_ptr ; 127090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 127190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(2) ;src_ptr ; 127290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movsxd rcx, dword ptr arg(4) ;Height ; 127390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 127490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm0, xmm0 ; 127579f15823c34ae1e423108295e416213200bb280fAndreas Hubervp8_half_horiz_variance8x_h_1: 127690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 127790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 127890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 127990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) 128090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm5, xmm0 ; xmm5 = words of above 128190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 128290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 128390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm0 ; 
xmm3 = words of above 128490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 128590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubw xmm5, xmm3 ; xmm5 -= xmm3 128690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw xmm6, xmm5 ; xmm6 += accumulated column differences 128790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 128890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd xmm7, xmm5 ; xmm7 += accumulated square column differences 128990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 129090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%if ABI_IS_32BIT 129190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source 129290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination 129390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%else 129490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rsi, r8 129590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add rdi, r9 129690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber%endif 129790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber sub rcx, 1 ; 129879f15823c34ae1e423108295e416213200bb280fAndreas Huber jnz vp8_half_horiz_variance8x_h_1 ; 129990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdq2q mm6, xmm6 ; 130190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdq2q mm7, xmm7 ; 130290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm6, 8 130490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrldq xmm7, 8 130590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 130690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdq2q mm2, xmm6 130790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdq2q mm3, xmm7 130890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 
130990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddw mm6, mm2 131090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm7, mm3 131190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm3, mm3 ; 131390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 ; 131490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklwd mm2, mm6 ; 131690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpckhwd mm3, mm6 ; 131790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 131890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm2, mm3 ; 131990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm2 ; 132090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm6, 32 ; 132290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm2, mm6 ; 132390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrad mm2, 16 ; 132590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm7 ; 132690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 132790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlq mm4, 32 ; 132890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddd mm4, mm7 ; 132990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rsi, arg(5) ; sum 133190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov rdi, arg(6) ; sumsquared 133290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rsi], mm2 ; 133490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd [rdi], mm4 ; 133590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 133790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas 
; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows of a 16-pixel-wide block:
;   pred[i] = (ref[i] + ref[i+1] + 1) >> 1      (pavgb, i = 0..15)
;   diff[i] = pred[i] - src[i]
; and accumulates
;   *sum        = sum of diff[i]
;   *sumsquared = sum of diff[i] * diff[i]
;
; Notes:
;   - 17 bytes are read from every ref row (ref[0] .. ref[16]).
;   - Height must be non-zero: the row counter is decremented before it
;     is tested, so a zero count would not terminate the loop promptly.
;   - The running sum is kept in eight signed 16-bit lanes; every row adds
;     two differences in [-255, 255] to each lane.
;     NOTE(review): fine for 16 rows (|lane| <= 8160); confirm callers
;     never pass a Height large enough to wrap a lane.
;
; Register use inside the loop:
;   rsi = ref row pointer        rax = ref stride
;   rdi = src row pointer        rdx = src stride
;   rcx = rows remaining
;   xmm0 = zero (unpack helper)  xmm6 = sum lanes   xmm7 = sse lanes
global sym(vp8_half_horiz_variance16x_h_sse2)
sym(vp8_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; ref_ptr
    movsxd      rax, dword ptr arg(1)       ; ref_pixels_per_line
    mov         rdi, arg(2)                 ; src_ptr
    movsxd      rdx, dword ptr arg(3)       ; src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ; Height

    pxor        xmm0, xmm0                  ; constant zero for byte->word unpacking
    pxor        xmm6, xmm6                  ; sum accumulator (8 x int16)
    pxor        xmm7, xmm7                  ; sse accumulator (4 x int32)

.next_row:
    ; horizontal half-pel prediction for 16 pixels
    movdqu      xmm5, XMMWORD PTR [rsi]     ; xmm5 = s0,s1,s2..s15
    movdqu      xmm3, XMMWORD PTR [rsi+1]   ; xmm3 = s1,s2,s3..s16
    pavgb       xmm5, xmm3                  ; xmm5 = avg(xmm5, xmm3), rounding up
    movdqa      xmm1, xmm5                  ; keep a copy for the upper 8 pixels

    ; pixels 0..7
    movq        xmm3, QWORD PTR [rdi]       ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm5, xmm0                  ; prediction 0..7 as words
    punpcklbw   xmm3, xmm0                  ; source 0..7 as words
    psubw       xmm5, xmm3                  ; diff = prediction - source
    paddw       xmm6, xmm5                  ; sum += diff
    pmaddwd     xmm5, xmm5                  ; pairwise diff*diff, added into dwords
    paddd       xmm7, xmm5                  ; sse += diff^2

    ; pixels 8..15
    movq        xmm2, QWORD PTR [rdi+8]     ; xmm2 = d8,d9..d15
    punpckhbw   xmm1, xmm0                  ; prediction 8..15 as words
    punpcklbw   xmm2, xmm0                  ; source 8..15 as words
    psubw       xmm1, xmm2
    paddw       xmm6, xmm1
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    add         rsi, rax                    ; next ref row
    add         rdi, rdx                    ; next src row

    dec         rcx
    jnz         .next_row

    pxor        xmm1, xmm1
    pxor        xmm5, xmm5                  ; zero for the dword unpacks below

    ; ---- reduce the eight word sums to a single signed dword in xmm0 ----
    ; Unpacking behind a zero word puts each sum in the top half of a dword;
    ; the arithmetic shift then brings it back down sign-extended.
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1                  ; four partial sums
    movdqa      xmm1, xmm0

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1                  ; two partial sums (dwords 0 and 2)

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddd       xmm0, xmm1                  ; low dword = total sum

    ; ---- reduce the four dword squares to a single dword in xmm6 ----
    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7                  ; two partial sums (dwords 0 and 2)

    movdqa      xmm7, xmm6
    psrldq      xmm7, 8
    paddd       xmm6, xmm7                  ; low dword = total sse

    mov         rsi, arg(5)                 ; [Sum]
    mov         rdi, arg(6)                 ; [SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
; 64 == 1 << (xmm_filter_shift - 1); presumably the rounding term added
; before shifting by xmm_filter_shift (its users are outside this section).
align 16
xmm_bi_rd:
    times 8 dw 64

; Eight entries of 16 words each: eight copies of the first tap followed by
; eight copies of the second tap. The two taps of every entry add up to 128.
align 16
vp8_bilinear_filters_sse2:
    times 8 dw 128
    times 8 dw 0

    times 8 dw 112
    times 8 dw 16

    times 8 dw 96
    times 8 dw 32

    times 8 dw 80
    times 8 dw 48

    times 8 dw 64
    times 8 dw 64

    times 8 dw 48
    times 8 dw 80

    times 8 dw 32
    times 8 dw 96

    times 8 dw 16
    times 8 dw 112