;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Sum of squares of 256 signed 16-bit values.
;
; The loop runs 8 times and consumes 64 bytes (32 words) per pass.
; src_ptr must be 16-byte aligned because the loads use movdqa.
;
; Out:   low 32 bits of rax = sum of squares.  The upper half of rax
;        holds a partial lane sum left over from the movq and carries
;        no meaning.
; Uses:  rax, rcx, xmm0-xmm4, flags
;-----------------------------------------------------------------------
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog


    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 passes x 32 words = 256 words
    pxor        xmm4, xmm4              ; xmm4 = four dword partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square each word, add adjacent pairs
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    ; jnz, not ja: dec does not write CF, so a CF-dependent branch would
    ; test the stale carry produced by the pointer add above.
    jnz         .NEXTROW

    ; Fold the four dword lanes of xmm4 into lane 0.
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; Sum of differences and sum of squared differences over a 16x16 block.
;
; For each of 16 rows, 16 bytes are read from src_ptr and ref_ptr
; (unaligned loads), widened to words and subtracted (src - ref).
; Differences accumulate in eight 16-bit lanes (xmm7); squared
; differences accumulate in four 32-bit lanes (xmm6).
;
; Out:   *Sum = signed sum of differences
;        *SSE = sum of squared differences
;        rax is left holding the Sum pointer; no separate return value
;        is computed.
; Uses:  rax, rcx, rdx, rsi, rdi, rbx (saved/restored), xmm0-xmm7, flags
;-----------------------------------------------------------------------
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch data: rows 0..7 of the source block.
    lea         rcx, [rax+rax*2]        ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]        ; rbx = row 4 of source
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; Rows 0..7 of the reference block.
    lea         rcx, [rdx+rdx*2]        ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]        ; rbx = row 4 of reference
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; row counter

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    ; Hint the rows eight strides ahead of the current one.
    prefetcht0  [rsi+rax*8]
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2


    punpcklbw   xmm1, xmm0              ; src pixels 0..7  -> words
    punpckhbw   xmm3, xmm0              ; src pixels 8..15 -> words

    punpcklbw   xmm2, xmm0              ; ref pixels 0..7  -> words
    punpckhbw   xmm4, xmm0              ; ref pixels 8..15 -> words


    psubw       xmm1, xmm2              ; diff = src - ref
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop


    movdqa      xmm1, xmm6              ; xmm1 = sse lanes
    pxor        xmm6, xmm6

    ; Sign-extend the eight word sums to dwords: place each word in the
    ; high half of a dword, then shift right arithmetically by 16.
    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16

    psrad       xmm6, 16
    paddd       xmm6, xmm5              ; xmm6 = four dword diff sums

    ; Fold both accumulators down to their low dword.
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; low dword of xmm7 = Sum
    paddd       xmm1, xmm2              ; low dword of xmm1 = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd DWORD PTR [rax], xmm7
    movd DWORD PTR [rdi], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret




;-----------------------------------------------------------------------
; One row step of vp9_get8x8var_sse2.
;   %1 = memory operand of the 8 source pixels
;   %2 = memory operand of the 8 reference pixels
; Expects xmm0 = 0.  Adds the word differences (src - ref) into xmm7
; and the pairwise squared differences into xmm1.  Clobbers xmm2, xmm3.
;-----------------------------------------------------------------------
%macro GET8X8VAR_ACCUM_ROW 2
    movq        xmm2, QWORD PTR %1
    movq        xmm3, QWORD PTR %2

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2
%endmacro

;-----------------------------------------------------------------------
; Sum of differences and sum of squared differences over an 8x8 block.
;
; Eight rows of eight bytes are read from src_ptr and ref_ptr.  The
; differences (src - ref) accumulate in the word lanes of xmm7 and the
; squared differences in the dword lanes of xmm1.
;
; Out:   *Sum = low 16 bits of the folded difference sum, sign-extended
;        *SSE = sum of squared differences
;        rax is left holding the Sum pointer.
; Uses:  rax, rcx, rdx, rsi, rdi, xmm0-xmm3, xmm6, xmm7, flags
;-----------------------------------------------------------------------
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; zero, used for byte/word unpacking
    pxor        xmm7, xmm7              ; running word sums of the differences

    ; Row 0 is written out because it initialises xmm1 (the sse
    ; accumulator) instead of adding into it.
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    ; Rows 1 and 2.
    GET8X8VAR_ACCUM_ROW [rsi + rax], [rdi + rdx]
    GET8X8VAR_ACCUM_ROW [rsi + rax * 2], [rdi + rdx * 2]

    ; Step two rows; rows 3 and 4.
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    GET8X8VAR_ACCUM_ROW [rsi + rax], [rdi + rdx]
    GET8X8VAR_ACCUM_ROW [rsi + rax * 2], [rdi + rdx * 2]

    ; Step two rows; rows 5 and 6.
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    GET8X8VAR_ACCUM_ROW [rsi + rax], [rdi + rdx]
    GET8X8VAR_ACCUM_ROW [rsi + rax * 2], [rdi + rdx * 2]

    ; Step two rows; row 7.
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    GET8X8VAR_ACCUM_ROW [rsi + rax], [rdi + rdx]

    ; Fold the eight word sums.  The adds stay 16-bit (paddw); the low
    ; word of xmm7 ends up holding the total, sign-extended on store.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0
    punpckhwd   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm7, xmm6
    punpckldq   xmm6, xmm0
    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm7, xmm6
    psrldq      xmm6, 8
    paddw       xmm7, xmm6

    ; Fold the four dword squared-difference sums into the low dword.
    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0
    punpckhdq   xmm2, xmm0
    paddd       xmm1, xmm2

    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit total

    mov  dword ptr [rax], ecx
    movd DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

401ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;void vp9_half_horiz_vert_variance8x_h_sse2 402ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;( 403ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; unsigned char *ref_ptr, 404ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; int ref_pixels_per_line, 405ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; unsigned char *src_ptr, 406ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; int src_pixels_per_line, 407ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; unsigned int Height, 408ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; int *sum, 409ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; unsigned int *sumsquared 410ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;) 411ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangglobal sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE 412ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangsym(vp9_half_horiz_vert_variance8x_h_sse2): 413ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang push rbp 414ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang mov rbp, rsp 415ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SHADOW_ARGS_TO_STACK 7 416ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SAVE_XMM 7 417ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang GET_GOT rbx 418ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang push rsi 419ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang push rdi 420ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ; end prolog 421ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 422ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%if ABI_IS_32BIT=0 423ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movsxd r8, dword ptr arg(1) ;ref_pixels_per_line 424ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movsxd r9, dword ptr arg(3) ;src_pixels_per_line 425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif 426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pxor xmm6, xmm6 ; error accumulator 428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pxor xmm7, xmm7 ; sse eaccumulator 
429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang mov rsi, arg(0) ;ref_ptr ; 430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang mov rdi, arg(2) ;src_ptr ; 432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movsxd rcx, dword ptr arg(4) ;Height ; 433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movsxd rax, dword ptr arg(1) ;ref_pixels_per_line 434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pxor xmm0, xmm0 ; 436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%if ABI_IS_32BIT 442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source 443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%else 444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rsi, r8 445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif 446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang.half_horiz_vert_variance8x_h_1: 448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq xmm1, QWORD PTR [rsi] ; 450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq xmm2, QWORD PTR [rsi+1] ; 451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pavgb xmm5, xmm1 ; xmm = vertical average of the above 454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw xmm5, xmm0 ; xmm5 = words of above 
455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw xmm3, xmm0 ; xmm3 = words of above 458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psubw xmm5, xmm3 ; xmm5 -= xmm3 460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddw xmm6, xmm5 ; xmm6 += accumulated column differences 461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd xmm7, xmm5 ; xmm7 += accumulated square column differences 463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movdqa xmm5, xmm1 ; save xmm1 for use on the next row 465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%if ABI_IS_32BIT 467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source 468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination 469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%else 470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rsi, r8 471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rdi, r9 472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang%endif 473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang sub rcx, 1 ; 475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang jnz .half_horiz_vert_variance8x_h_1 ; 476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movdq2q mm6, xmm6 ; 478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movdq2q mm7, xmm7 ; 479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psrldq xmm6, 8 481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psrldq xmm7, 8 
482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movdq2q mm2, xmm6 484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movdq2q mm3, xmm7 485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddw mm6, mm2 487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm7, mm3 488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pxor mm3, mm3 ; 490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pxor mm2, mm2 ; 491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklwd mm2, mm6 ; 493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpckhwd mm3, mm6 ; 494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm2, mm3 ; 496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq mm6, mm2 ; 497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psrlq mm6, 32 ; 499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm2, mm6 ; 500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psrad mm2, 16 ; 502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq mm4, mm7 ; 503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psrlq mm4, 32 ; 505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm4, mm7 ; 506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang mov rsi, arg(5) ; sum 508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang mov rdi, arg(6) ; sumsquared 509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd [rsi], mm2 ; 511ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd [rdi], mm4 ; 512ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 513ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 
; begin epilog  (tail of the routine that starts above; instructions unchanged)
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, 8 bytes at ref_ptr are averaged (pavgb, i.e. with
; rounding) with the 8 bytes one ref row below, and the 8 bytes at src_ptr are
; subtracted from that average.  The differences are summed into *sum and
; their squares into *sumsquared.
;
; Register roles inside the loop:
;   rsi  = current ref row           rdi  = current src row
;   rax  = ref_pixels_per_line       rcx  = rows remaining
;   r8/r9 (64-bit ABIs only) = ref / src stride used to advance the pointers
;   xmm0 = zero (for byte->word unpack)
;   xmm6 = 8 x 16-bit running difference sums
;   xmm7 = 4 x 32-bit running sums of squared differences
;
; The difference total is reduced in 16-bit lanes and only sign-extended at
; the very end, so it is a 16-bit signed quantity.
;
; NOTE: mm2-mm7 are written during the final reduction and no emms is issued
; in this routine.
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

    pxor            xmm6,           xmm6                 ; difference accumulator
    pxor            xmm7,           xmm7                 ; squared-difference accumulator
    mov             rsi,            arg(0)               ; ref_ptr

    mov             rdi,            arg(2)               ; src_ptr
    movsxd          rcx,            dword ptr arg(4)     ; Height
    movsxd          rax,            dword ptr arg(1)     ; ref_pixels_per_line

    pxor            xmm0,           xmm0                 ; zero, used to widen bytes to words

    ; The loop below only tests the counter after a full iteration, so a
    ; Height of 0 would wrap the counter and run far past the buffers.  Skip
    ; straight to the reduction instead; both results are then stored as 0.
    test            rcx,            rcx
    jz              .half_vert_variance8x_h_reduce

.half_vert_variance8x_h_1:
    movq            xmm5,           QWORD PTR [rsi]      ; xmm5 = 8 ref bytes, current row
    movq            xmm3,           QWORD PTR [rsi+rax]  ; xmm3 = 8 ref bytes, next row

    pavgb           xmm5,           xmm3                 ; xmm5 = avg(xmm5, xmm3), rounded
    punpcklbw       xmm5,           xmm0                 ; xmm5 = words of above

    movq            xmm3,           QWORD PTR [rdi]      ; xmm3 = 8 src bytes
    punpcklbw       xmm3,           xmm0                 ; xmm3 = words of above

    psubw           xmm5,           xmm3                 ; xmm5 = avg(ref) - src
    paddw           xmm6,           xmm5                 ; accumulate column differences
    pmaddwd         xmm5,           xmm5                 ; square and add adjacent pairs
    paddd           xmm7,           xmm5                 ; accumulate squared differences

%if ABI_IS_32BIT
    add             esi,            dword ptr arg(1)     ; next ref row
    add             edi,            dword ptr arg(3)     ; next src row
%else
    add             rsi, r8                              ; next ref row
    add             rdi, r9                              ; next src row
%endif

    sub             rcx,            1
    jnz             .half_vert_variance8x_h_1

.half_vert_variance8x_h_reduce:
    ; Fold the high 64 bits of each accumulator onto the low 64 bits.
    movdq2q         mm6,            xmm6
    movdq2q         mm7,            xmm7

    psrldq          xmm6,           8
    psrldq          xmm7,           8

    movdq2q         mm2,            xmm6
    movdq2q         mm3,            xmm7

    paddw           mm6,            mm2                  ; 4 word difference sums
    paddd           mm7,            mm3                  ; 2 dword square sums

    pxor            mm3,            mm3
    pxor            mm2,            mm2

    ; Place each word sum in the upper half of a dword so that the final
    ; arithmetic shift sign-extends the 16-bit total.
    punpcklwd       mm2,            mm6
    punpckhwd       mm3,            mm6

    paddd           mm2,            mm3
    movq            mm6,            mm2

    psrlq           mm6,            32
    paddd           mm2,            mm6                  ; low dword = total << 16

    psrad           mm2,            16                   ; low dword = signed total
    movq            mm4,            mm7

    psrlq           mm4,            32
    paddd           mm4,            mm7                  ; low dword = sum of squares

    mov             rsi,            arg(5)               ; sum
    mov             rdi,            arg(6)               ; sumsquared

    movd            [rsi],          mm2
    movd            [rdi],          mm4


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void
; vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, 8 bytes at ref_ptr are averaged (pavgb, i.e. with
; rounding) with the 8 bytes starting one byte to the right, and the 8 bytes
; at src_ptr are subtracted from that average.  The differences are summed
; into *sum and their squares into *sumsquared.
;
; Register roles inside the loop:
;   rsi  = current ref row           rdi  = current src row
;   rcx  = rows remaining
;   r8/r9 (64-bit ABIs only) = ref / src stride used to advance the pointers
;   xmm0 = zero (for byte->word unpack)
;   xmm6 = 8 x 16-bit running difference sums
;   xmm7 = 4 x 32-bit running sums of squared differences
;
; The difference total is reduced in 16-bit lanes and only sign-extended at
; the very end, so it is a 16-bit signed quantity.
;
; NOTE: mm2-mm7 are written during the final reduction and no emms is issued
; in this routine.
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
%endif

    pxor            xmm6,           xmm6                 ; difference accumulator
    pxor            xmm7,           xmm7                 ; squared-difference accumulator
    mov             rsi,            arg(0)               ; ref_ptr

    mov             rdi,            arg(2)               ; src_ptr
    movsxd          rcx,            dword ptr arg(4)     ; Height

    pxor            xmm0,           xmm0                 ; zero, used to widen bytes to words

    ; The loop below only tests the counter after a full iteration, so a
    ; Height of 0 would wrap the counter and run far past the buffers.  Skip
    ; straight to the reduction instead; both results are then stored as 0.
    test            rcx,            rcx
    jz              .half_horiz_variance8x_h_reduce

.half_horiz_variance8x_h_1:
    movq            xmm5,           QWORD PTR [rsi]      ; xmm5 = ref bytes 0..7
    movq            xmm3,           QWORD PTR [rsi+1]    ; xmm3 = ref bytes 1..8

    pavgb           xmm5,           xmm3                 ; xmm5 = avg(xmm5, xmm3), rounded
    punpcklbw       xmm5,           xmm0                 ; xmm5 = words of above

    movq            xmm3,           QWORD PTR [rdi]      ; xmm3 = 8 src bytes
    punpcklbw       xmm3,           xmm0                 ; xmm3 = words of above

    psubw           xmm5,           xmm3                 ; xmm5 = avg(ref) - src
    paddw           xmm6,           xmm5                 ; accumulate column differences
    pmaddwd         xmm5,           xmm5                 ; square and add adjacent pairs
    paddd           xmm7,           xmm5                 ; accumulate squared differences

%if ABI_IS_32BIT
    add             esi,            dword ptr arg(1)     ; next ref row
    add             edi,            dword ptr arg(3)     ; next src row
%else
    add             rsi, r8                              ; next ref row
    add             rdi, r9                              ; next src row
%endif
    sub             rcx,            1
    jnz             .half_horiz_variance8x_h_1

.half_horiz_variance8x_h_reduce:
    ; Fold the high 64 bits of each accumulator onto the low 64 bits.
    movdq2q         mm6,            xmm6
    movdq2q         mm7,            xmm7

    psrldq          xmm6,           8
    psrldq          xmm7,           8

    movdq2q         mm2,            xmm6
    movdq2q         mm3,            xmm7

    paddw           mm6,            mm2                  ; 4 word difference sums
    paddd           mm7,            mm3                  ; 2 dword square sums

    pxor            mm3,            mm3
    pxor            mm2,            mm2

    ; Place each word sum in the upper half of a dword so that the final
    ; arithmetic shift sign-extends the 16-bit total.
    punpcklwd       mm2,            mm6
    punpckhwd       mm3,            mm6

    paddd           mm2,            mm3
    movq            mm6,            mm2

    psrlq           mm6,            32
    paddd           mm2,            mm6                  ; low dword = total << 16

    psrad           mm2,            16                   ; low dword = signed total
    movq            mm4,            mm7

    psrlq           mm4,            32
    paddd           mm4,            mm7                  ; low dword = sum of squares

    mov             rsi,            arg(5)               ; sum
    mov             rdi,            arg(6)               ; sumsquared

    movd            [rsi],          mm2
    movd            [rdi],          mm4


    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret