;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of the squares of the 256 signed 16-bit values starting at
; src_ptr (16 loop passes, 32 bytes per pass).
;
; Register roles:
;   rax = read pointer              rcx = passes remaining
;   mm0-mm3 = per-pass squared pairs
;   mm4 = two 32-bit partial sums, folded together for the return value
;
; NOTE(review): no emms is executed here, so MMX state is still live on
; return - presumably the caller clears it; confirm.
global sym(vp9_get_mb_ss_mmx) PRIVATE
sym(vp9_get_mb_ss_mmx):
    push            rbp
    mov             rbp, rsp
    ; NOTE(review): the prototype has one argument but seven are shadowed,
    ; and no GOT-relative data is referenced below. Both macro uses are kept
    ; as found - confirm against x86_abi_support.asm before trimming them.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT         rbx
    push            rsi
    push            rdi
    sub             rsp, 8                  ; 8-byte scratch slot for mm4
    ; end prolog

    mov             rax, arg(0)             ; src_ptr
    mov             rcx, 16                 ; 16 passes x 16 values
    pxor            mm4, mm4                ; clear both partial sums

.NEXTROW:
    movq            mm0, [rax]              ; 4 values each
    movq            mm1, [rax+8]
    movq            mm2, [rax+16]
    movq            mm3, [rax+24]
    pmaddwd         mm0, mm0                ; square, add adjacent pairs
    pmaddwd         mm1, mm1
    pmaddwd         mm2, mm2
    pmaddwd         mm3, mm3

    paddd           mm4, mm0
    paddd           mm4, mm1
    paddd           mm4, mm2
    paddd           mm4, mm3

    add             rax, 32
    dec             rcx
    ; dec leaves CF untouched, so the loop must test ZF only. The previous
    ; "ja" also required CF == 0, i.e. it depended on the carry produced by
    ; the pointer add above.
    jnz             .NEXTROW
    movq            QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd          rax, dword ptr [rsp]
    movsxd          rcx, dword ptr [rsp+4]
    add             rax, rcx


    ; begin epilog
    add             rsp, 8
    pop             rdi
    pop             rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop             rbp
    ret


;unsigned int vp9_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For an 8x8 block of bytes, computes d = src - ref per pixel and stores
;   *Sum = sum of d      *SSE = sum of d*d
; Always returns 0 in rax.
;
; Register roles:
;   rax = src row pointer           rbx = ref row pointer
;   rcx = source_stride             rdx = recon_stride (both sign-extended)
;   mm5 = four 16-bit running sums of d (16 terms of magnitude <= 255 per
;         lane, so a lane cannot overflow)
;   mm6 = zero, used to widen bytes to words
;   mm7 = two 32-bit running sums of d*d
;
; NOTE(review): no emms is executed here, so MMX state is still live on
; return - presumably the caller clears it; confirm.
global sym(vp9_get8x8var_mmx) PRIVATE
sym(vp9_get8x8var_mmx):
    push            rbp
    mov             rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push            rsi
    push            rdi
    push            rbx
    sub             rsp, 16                 ; scratch: [rsp]=mm7, [rsp+8]=mm5
    ; end prolog


    pxor            mm5, mm5                ; clear difference sums
    pxor            mm6, mm6                ; zero for byte->word unpacking
    pxor            mm7, mm7                ; clear squared-difference sums

    mov             rax, arg(0)             ;[src_ptr]  ; Load base addresses
    mov             rbx, arg(2)             ;[ref_ptr]
    movsxd          rcx, dword ptr arg(1)   ;[source_stride]
    movsxd          rdx, dword ptr arg(3)   ;[recon_stride]

    ; Rows 1..8: one identical body per row.
%rep 8
    movq            mm0, [rax]              ; eight src bytes
    movq            mm1, [rbx]              ; eight ref bytes
    movq            mm2, mm0                ; copies for the high halves
    movq            mm3, mm1

    punpcklbw       mm0, mm6                ; low 4 bytes  -> words
    punpcklbw       mm1, mm6
    punpckhbw       mm2, mm6                ; high 4 bytes -> words
    punpckhbw       mm3, mm6
    psubsw          mm0, mm1                ; src - ref (low half)
    psubsw          mm2, mm3                ; src - ref (high half)

    paddw           mm5, mm0                ; accumulate differences
    paddw           mm5, mm2

    pmaddwd         mm0, mm0                ; square, add adjacent pairs
    pmaddwd         mm2, mm2
    add             rbx, rdx                ; next ref row
    add             rax, rcx                ; next src row
    paddd           mm7, mm0                ; accumulate squares
    paddd           mm7, mm2
%endrep

    ; Now accumulate the final results.
    movq            QWORD PTR [rsp+8], mm5  ; spill difference sums
    movq            QWORD PTR [rsp], mm7    ; spill squared-difference sums
    movsx           rdx, WORD PTR [rsp+8]
    movsx           rcx, WORD PTR [rsp+10]
    movsx           rbx, WORD PTR [rsp+12]
    movsx           rax, WORD PTR [rsp+14]
    add             rdx, rcx
    add             rbx, rax
    add             rdx, rbx                ;XSum
    movsxd          rax, DWORD PTR [rsp]
    movsxd          rcx, DWORD PTR [rsp+4]
    add             rax, rcx                ;XXSum
    mov             rsi, arg(4)             ;SSE
    mov             rdi, arg(5)             ;Sum
    mov             dword ptr [rsi], eax
    mov             dword ptr [rdi], edx
    xor             rax, rax                ; return 0


    ; begin epilog
    add             rsp, 16
    pop             rbx
    pop             rdi
    pop             rsi
    UNSHADOW_ARGS
    pop             rbp
    ret

;unsigned int
;vp9_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For a 4x4 block of bytes, computes d = src - ref per pixel and stores
;   *Sum = sum of d      *SSE = sum of d*d
; Always returns 0 in rax.
;
; Register roles:
;   rax = src row pointer           rbx = ref row pointer
;   rcx = source_stride             rdx = recon_stride (both sign-extended)
;   mm5 = four 16-bit running sums of d
;   mm6 = zero, used to widen bytes to words
;   mm7 = two 32-bit running sums of d*d
;
; NOTE(review): no emms is executed here, so MMX state is still live on
; return - presumably the caller clears it; confirm.
global sym(vp9_get4x4var_mmx) PRIVATE
sym(vp9_get4x4var_mmx):
    push            rbp
    mov             rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push            rsi
    push            rdi
    push            rbx
    sub             rsp, 16                 ; scratch: [rsp]=mm7, [rsp+8]=mm5
    ; end prolog


    pxor            mm5, mm5                ; clear difference sums
    pxor            mm6, mm6                ; zero for byte->word unpacking
    pxor            mm7, mm7                ; clear squared-difference sums

    mov             rax, arg(0)             ;[src_ptr]  ; Load base addresses
    mov             rbx, arg(2)             ;[ref_ptr]
    movsxd          rcx, dword ptr arg(1)   ;[source_stride]
    movsxd          rdx, dword ptr arg(3)   ;[recon_stride]

    ; Rows 1..4: one identical body per row.
%rep 4
    movd            mm0, [rax]              ; four src bytes
    movd            mm1, [rbx]              ; four ref bytes
    punpcklbw       mm0, mm6                ; bytes -> words
    punpcklbw       mm1, mm6
    psubsw          mm0, mm1                ; src - ref
    paddw           mm5, mm0                ; accumulate differences
    pmaddwd         mm0, mm0                ; square, add adjacent pairs
    add             rbx, rdx                ; next ref row
    add             rax, rcx                ; next src row
    paddd           mm7, mm0                ; accumulate squares
%endrep

    ; Now accumulate the final results.
    movq            QWORD PTR [rsp+8], mm5  ; spill difference sums
    movq            QWORD PTR [rsp], mm7    ; spill squared-difference sums
    movsx           rdx, WORD PTR [rsp+8]
    movsx           rcx, WORD PTR [rsp+10]
    movsx           rbx, WORD PTR [rsp+12]
    movsx           rax, WORD PTR [rsp+14]
    add             rdx, rcx
    add             rbx, rax
    add             rdx, rbx                ;XSum
    movsxd          rax, DWORD PTR [rsp]
    movsxd          rcx, DWORD PTR [rsp+4]
    add             rax, rcx                ;XXSum
    mov             rsi, arg(4)             ;SSE
    mov             rdi, arg(5)             ;Sum
    mov             dword ptr [rsi], eax
    mov             dword ptr [rdi], edx
    xor             rax, rax                ; return 0


    ; begin epilog
    add             rsp, 16
    pop             rbx
    pop             rdi
    pop             rsi
    UNSHADOW_ARGS
    pop             rbp
    ret

425ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 426ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 427ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 428ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;unsigned int 429ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;vp9_get4x4sse_cs_mmx 430ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;( 431ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; unsigned char *src_ptr, 432ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; int source_stride, 433ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; unsigned char *ref_ptr, 434ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang; int recon_stride 435ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang;) 436ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangglobal sym(vp9_get4x4sse_cs_mmx) PRIVATE 437ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuangsym(vp9_get4x4sse_cs_mmx): 438ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang push rbp 439ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang mov rbp, rsp 440ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang SHADOW_ARGS_TO_STACK 4 441ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang push rsi 442ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang push rdi 443ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang push rbx 444ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ; end prolog 445ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 446ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 447ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pxor mm6, mm6 ; Blank mmx7 448ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pxor mm7, mm7 ;
Blank mmx7 449ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 450ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang mov rax, arg(0) ;[src_ptr] ; Load base addresses 451ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang mov rbx, arg(2) ;[ref_ptr] 452ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movsxd rcx, dword ptr arg(1) ;[source_stride] 453ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movsxd rdx, dword ptr arg(3) ;[recon_stride] 454ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ; Row 1 455ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd mm0, [rax] ; Copy eight bytes to mm0 456ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd mm1, [rbx] ; Copy eight bytes to mm1 457ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw mm0, mm6 ; unpack to higher prrcision 458ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw mm1, mm6 459ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psubsw mm0, mm1 ; A-B (low order) to MM0 460ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pmaddwd mm0, mm0 ; square and accumulate 461ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rbx,rdx ; Inc pointer into ref data 462ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rax,rcx ; Inc pointer into the new data 463ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd mm1, [rbx] ; Copy eight bytes to mm1 464ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm7, mm0 ; accumulate in mm7 465ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 466ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ; Row 2 467ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd mm0, [rax] ; Copy eight bytes to mm0 468ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw mm0, mm6 ; unpack to higher prrcision 469ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw mm1, mm6 470ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psubsw mm0, mm1 ; A-B (low order) to MM0 471ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pmaddwd mm0, mm0 ; square and accumulate 472ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rbx,rdx ; Inc 
pointer into ref data 473ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rax,rcx ; Inc pointer into the new data 474ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd mm1, [rbx] ; Copy eight bytes to mm1 475ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm7, mm0 ; accumulate in mm7 476ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 477ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ; Row 3 478ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd mm0, [rax] ; Copy eight bytes to mm0 479ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw mm1, mm6 480ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw mm0, mm6 ; unpack to higher prrcision 481ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psubsw mm0, mm1 ; A-B (low order) to MM0 482ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 483ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pmaddwd mm0, mm0 ; square and accumulate 484ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rbx,rdx ; Inc pointer into ref data 485ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang add rax,rcx ; Inc pointer into the new data 486ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd mm1, [rbx] ; Copy eight bytes to mm1 487ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm7, mm0 ; accumulate in mm7 488ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 489ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ; Row 4 490ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movd mm0, [rax] ; Copy eight bytes to mm0 491ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw mm0, mm6 ; unpack to higher prrcision 492ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang punpcklbw mm1, mm6 493ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psubsw mm0, mm1 ; A-B (low order) to MM0 494ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pmaddwd mm0, mm0 ; square and accumulate 495ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm7, mm0 ; accumulate in mm7 496ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 497ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq mm0, 
mm7 ; 498ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang psrlq mm7, 32 499ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 500ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang paddd mm0, mm7 501ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang movq rax, mm0 502ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 503ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang 504ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ; begin epilog 505ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pop rbx 506ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pop rdi 507ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pop rsi 508ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang UNSHADOW_ARGS 509ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang pop rbp 510ba164dffc5a6795bce97fae02b51ccf3330e15e4hkuang ret 511