;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of the 256 consecutive 16-bit values starting at
; src_ptr (8 iterations x 64 bytes per iteration).
;
; src_ptr is read with movdqa, so it must be 16-byte aligned.
; The total is produced in the low dword of xmm4 and copied to rax; the bits
; above the low dword hold a partial sum and are not part of the result.
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog


    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 iterations of 32 coefficients each
    pxor        xmm4, xmm4              ; xmm4 = four dword partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square and add adjacent word pairs
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    ; Loop on ZF of the counter update only (same sub/jnz form as the other
    ; loops in this file); the exit must not depend on the carry flag.
    sub         rcx, 1
    jnz         .NEXTROW

    ; Horizontal add of the four dword lanes into the low dword.
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For a 16x16 block of bytes, computes over all 256 positions
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; Rows are read with unaligned loads (movdqu) and advanced by the two strides.
;
; NOTE(review): rax holds the Sum pointer when the function returns, so the
; "unsigned int" return value carries no result -- callers presumably ignore
; it; confirm before relying on it.
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch data: the first eight rows of src, then of ref.
    lea         rcx, [rax+rax*2]        ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    lea         rcx, [rdx+rdx*2]        ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; row counter

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]             ; prefetch eight rows ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2


    punpcklbw   xmm1, xmm0              ; src bytes 0..7  -> words
    punpckhbw   xmm3, xmm0              ; src bytes 8..15 -> words

    punpcklbw   xmm2, xmm0              ; ref bytes 0..7  -> words
    punpckhbw   xmm4, xmm0              ; ref bytes 8..15 -> words


    psubw       xmm1, xmm2
    psubw       xmm3, xmm4

    ; Each word lane of xmm7 receives 2 differences per row (32 in total,
    ; each of magnitude <= 255), so the 16-bit lanes cannot overflow.
    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop


    movdqa      xmm1, xmm6              ; xmm1 = four dword SSE partial sums
    pxor        xmm6, xmm6

    ; Sign-extend the eight word sums to dwords: place each word in the high
    ; half of a dword, then shift right arithmetically by 16.
    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16

    psrad       xmm6, 16
    paddd       xmm6, xmm5

    ; Horizontal adds: xmm1 -> SSE total, xmm6/xmm7 -> Sum total (low dword).
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6
    paddd       xmm1, xmm2

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret




;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For an 8x8 block of bytes, computes over all 64 positions
;   *Sum = sum of (src - ref)
;   *SSE = sum of (src - ref)^2
; The Sum total is reduced in 16-bit lanes and sign-extended from 16 bits
; before it is stored.
;
; NOTE(review): rax holds the Sum pointer when the function returns, so the
; "unsigned int" return value carries no result -- callers presumably ignore
; it; confirm before relying on it.

; Accumulate one 8-pixel row into the running totals.
;   %1 = memory operand of the src row, %2 = memory operand of the ref row
;   xmm0 must be zero; xmm7 collects word differences, xmm1 dword squares.
;   Clobbers xmm2 and xmm3.
%macro VP9_GET8X8VAR_ROW 2
    movq        xmm2, QWORD PTR %1
    movq        xmm3, QWORD PTR %2

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2
%endmacro

global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ; src_ptr
    mov         rdi, arg(2)             ; ref_ptr

    movsxd      rax, DWORD PTR arg(1)   ; source_stride
    movsxd      rdx, DWORD PTR arg(3)   ; recon_stride

    pxor        xmm0, xmm0              ; zero, used to widen bytes to words
    pxor        xmm7, xmm7              ; word accumulator of differences

    ; Row 0 initialises the squared-difference accumulator xmm1 directly.
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    ; Rows 1 and 2
    VP9_GET8X8VAR_ROW [rsi + rax], [rdi + rdx]
    VP9_GET8X8VAR_ROW [rsi + rax * 2], [rdi + rdx * 2]

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    ; Rows 3 and 4
    VP9_GET8X8VAR_ROW [rsi + rax], [rdi + rdx]
    VP9_GET8X8VAR_ROW [rsi + rax * 2], [rdi + rdx * 2]

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    ; Rows 5 and 6
    VP9_GET8X8VAR_ROW [rsi + rax], [rdi + rdx]
    VP9_GET8X8VAR_ROW [rsi + rax * 2], [rdi + rdx * 2]

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    ; Row 7
    VP9_GET8X8VAR_ROW [rsi + rax], [rdi + rdx]

    ; Fold the eight word differences in xmm7 down to the low word of xmm7
    ; (word adds throughout; the zero words interleaved from xmm0 are inert).
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0
    punpckhwd   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm7, xmm6
    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm7, xmm6
    psrldq      xmm6, 8
    paddw       xmm7, xmm6

    ; Fold the four dword squares in xmm1 down to the low dword of xmm1.
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0
    paddd       xmm1, xmm2

    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    mov         rax, arg(5)             ; Sum
    mov         rdi, arg(4)             ; SSE

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit total

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    ; 8-pixel-wide sum / sum-of-squares of (prediction - src) over Height rows.
    ; The prediction for row i is
    ;   pavgb( pavgb(ref[i][x], ref[i][x+1]), pavgb(ref[i+1][x], ref[i+1][x+1]) )
    ; so Height+1 rows of ref and 9 bytes of each ref row are read.
    ; Results are stored to *sum (arg 5) and *sumsquared (arg 6).
    ;
    ; The row loop tests its counter after the body, so Height must be >= 1.
    ; The sum is reduced in 16-bit arithmetic and then sign-extended, so the
    ; total has to fit in a signed 16-bit value.
    ; The final reduction uses MMX registers and no emms is executed here.
    ; NOTE(review): callers presumably clear the MMX state before any x87
    ; code runs -- confirm.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ; ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ; src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; sum accumulator (8 word lanes)
    pxor        xmm7, xmm7              ; sse accumulator (4 dword lanes)
    mov         rsi, arg(0)             ; ref_ptr

    mov         rdi, arg(2)             ; src_ptr
    movsxd      rcx, dword ptr arg(4)   ; Height

    pxor        xmm0, xmm0              ; zero, used to widen bytes to words

    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ; ref_pixels_per_line ; next source
%else
    add         rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:

    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1              ; xmm5 = vertical average of lines i and i+1
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += column differences
    pmaddwd     xmm5, xmm5              ; xmm5 = pairwise sums of squared differences
    paddd       xmm7, xmm5              ; xmm7 += squared column differences

    movdqa      xmm5, xmm1              ; save xmm1 for use on the next row

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ; ref_pixels_per_line ; next source
    add         rdi, dword ptr arg(3)   ; src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_horiz_vert_variance8x_h_1

    ; Fold the high 64 bits of each accumulator onto the low 64 bits (MMX).
    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2                ; mm6 = four word sums
    paddd       mm7, mm3                ; mm7 = two dword sse sums

    pxor        mm3, mm3
    pxor        mm2, mm2

    ; Put each word sum in the high half of a dword so that the later
    ; psrad by 16 sign-extends the total.
    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16                 ; low dword of mm2 = signed sum
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7                ; low dword of mm4 = sse total

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
; Half-pel vertical variance helper for an 8-pixel-wide column.
; Per row: pavgb of 8 ref bytes with the 8 bytes one ref stride further on,
; minus 8 src bytes; the differences go to *sum and their squares to
; *sumsquared. The row loop tests the count after the body has run.
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)        ; r8  = ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ; r9  = src_pixels_per_line
%endif

    mov         rsi, arg(0)                 ; rsi = ref_ptr
    mov         rdi, arg(2)                 ; rdi = src_ptr
    movsxd      rcx, dword ptr arg(4)       ; rcx = Height (row counter)
    movsxd      rax, dword ptr arg(1)       ; rax = ref stride, offset to the row below

    pxor        xmm0, xmm0                  ; zero, used to widen bytes to words
    pxor        xmm6, xmm6                  ; eight 16-bit running sums of differences
    pxor        xmm7, xmm7                  ; four 32-bit running sums of squares

.vert_row_loop:
    movq        xmm5, QWORD PTR [rsi]       ; 8 ref bytes of this row
    movq        xmm3, QWORD PTR [rsi+rax]   ; 8 ref bytes of the following row
    pavgb       xmm5, xmm3                  ; rounded byte average of the two rows

    movq        xmm3, QWORD PTR [rdi]       ; 8 src bytes
    punpcklbw   xmm5, xmm0                  ; averaged ref -> 8 words
    punpcklbw   xmm3, xmm0                  ; src          -> 8 words

    psubw       xmm5, xmm3                  ; per-pixel difference
    paddw       xmm6, xmm5                  ; accumulate differences
    pmaddwd     xmm5, xmm5                  ; square and add adjacent pairs
    paddd       xmm7, xmm5                  ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)       ; advance ref by ref_pixels_per_line
    add         edi, dword ptr arg(3)       ; advance src by src_pixels_per_line
%else
    add         rsi, r8                     ; advance ref by ref_pixels_per_line
    add         rdi, r9                     ; advance src by src_pixels_per_line
%endif

    sub         rcx, 1
    jnz         .vert_row_loop

    ; --- fold the four dword square sums into one dword (mm4) ---
    movdq2q     mm7, xmm7                   ; low two dwords
    psrldq      xmm7, 8
    movdq2q     mm3, xmm7                   ; high two dwords
    paddd       mm7, mm3
    movq        mm4, mm7
    psrlq       mm4, 32
    paddd       mm4, mm7                    ; low dword of mm4 = total of squares

    ; --- fold the eight word sums into one signed dword (mm2) ---
    movdq2q     mm6, xmm6                   ; low four words
    psrldq      xmm6, 8
    movdq2q     mm2, xmm6                   ; high four words
    paddw       mm6, mm2

    pxor        mm3, mm3
    pxor        mm2, mm2
    punpcklwd   mm2, mm6                    ; words 0,1 placed in the upper half of each dword
    punpckhwd   mm3, mm6                    ; words 2,3 placed in the upper half of each dword
    paddd       mm2, mm3

    movq        mm6, mm2
    psrlq       mm6, 32
    paddd       mm2, mm6
    psrad       mm2, 16                     ; arithmetic shift brings the sum back down, sign-extended

    mov         rsi, arg(5)                 ; rsi = sum
    mov         rdi, arg(6)                 ; rdi = sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
; Half-pel horizontal variance helper for an 8-pixel-wide column.
; Per row: pavgb of 8 ref bytes with the 8 bytes starting one byte to the
; right, minus 8 src bytes; the differences go to *sum and their squares to
; *sumsquared. The row loop tests the count after the body has run.
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)        ; r8  = ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ; r9  = src_pixels_per_line
%endif

    mov         rsi, arg(0)                 ; rsi = ref_ptr
    mov         rdi, arg(2)                 ; rdi = src_ptr
    movsxd      rcx, dword ptr arg(4)       ; rcx = Height (row counter)

    pxor        xmm0, xmm0                  ; zero, used to widen bytes to words
    pxor        xmm6, xmm6                  ; eight 16-bit running sums of differences
    pxor        xmm7, xmm7                  ; four 32-bit running sums of squares

.horiz_row_loop:
    movq        xmm5, QWORD PTR [rsi]       ; ref bytes 0..7 of this row
    movq        xmm3, QWORD PTR [rsi+1]     ; ref bytes 1..8 of this row
    pavgb       xmm5, xmm3                  ; rounded byte average of neighbours

    movq        xmm3, QWORD PTR [rdi]       ; 8 src bytes
    punpcklbw   xmm5, xmm0                  ; averaged ref -> 8 words
    punpcklbw   xmm3, xmm0                  ; src          -> 8 words

    psubw       xmm5, xmm3                  ; per-pixel difference
    paddw       xmm6, xmm5                  ; accumulate differences
    pmaddwd     xmm5, xmm5                  ; square and add adjacent pairs
    paddd       xmm7, xmm5                  ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)       ; advance ref by ref_pixels_per_line
    add         edi, dword ptr arg(3)       ; advance src by src_pixels_per_line
%else
    add         rsi, r8                     ; advance ref by ref_pixels_per_line
    add         rdi, r9                     ; advance src by src_pixels_per_line
%endif

    sub         rcx, 1
    jnz         .horiz_row_loop

    ; --- fold the four dword square sums into one dword (mm4) ---
    movdq2q     mm7, xmm7                   ; low two dwords
    psrldq      xmm7, 8
    movdq2q     mm3, xmm7                   ; high two dwords
    paddd       mm7, mm3
    movq        mm4, mm7
    psrlq       mm4, 32
    paddd       mm4, mm7                    ; low dword of mm4 = total of squares

    ; --- fold the eight word sums into one signed dword (mm2) ---
    movdq2q     mm6, xmm6                   ; low four words
    psrldq      xmm6, 8
    movdq2q     mm2, xmm6                   ; high four words
    paddw       mm6, mm2

    pxor        mm3, mm3
    pxor        mm2, mm2
    punpcklwd   mm2, mm6                    ; words 0,1 placed in the upper half of each dword
    punpckhwd   mm3, mm6                    ; words 2,3 placed in the upper half of each dword
    paddd       mm2, mm3

    movq        mm6, mm2
    psrlq       mm6, 32
    paddd       mm2, mm6
    psrad       mm2, 16                     ; arithmetic shift brings the sum back down, sign-extended

    mov         rsi, arg(5)                 ; rsi = sum
    mov         rdi, arg(6)                 ; rdi = sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret