;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of the 16-bit values at src_ptr.
; The loop runs 16 times and consumes 32 bytes (16 shorts) per pass,
; i.e. 256 shorts in total.
;
; In:    arg(0) = src_ptr
; Out:   rax    = sum of squares
; Uses:  rax, rcx, mm0-mm4, 8 bytes of stack scratch
;-----------------------------------------------------------------------
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp,        rsp
    ; NOTE(review): this function takes one argument but shadows 7;
    ; left unchanged because the macro definition is not visible here.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp,        8                   ; scratch for the final reduction
    ; end prolog

        mov         rax,        arg(0)          ;src_ptr
        mov         rcx,        16              ; passes remaining
        pxor        mm4,        mm4             ; mm4 = 2 x int32 running sum of squares

.NEXTROW:
        movq        mm0,        [rax]
        movq        mm1,        [rax+8]
        movq        mm2,        [rax+16]
        movq        mm3,        [rax+24]
        pmaddwd     mm0,        mm0             ; square words, add adjacent pairs
        pmaddwd     mm1,        mm1
        pmaddwd     mm2,        mm2
        pmaddwd     mm3,        mm3

        paddd       mm4,        mm0
        paddd       mm4,        mm1
        paddd       mm4,        mm2
        paddd       mm4,        mm3

        add         rax,        32
        dec         rcx
        ; dec does not write CF, so test ZF only (the former ja also
        ; depended on the carry left behind by the add above).
        jnz         .NEXTROW
        movq        QWORD PTR [rsp],    mm4

        ;return sum[0]+sum[1];
        movsxd      rax,        dword ptr [rsp]
        movsxd      rcx,        dword ptr [rsp+4]
        add         rax,        rcx


    ; begin epilog
    add         rsp,        8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For an 8x8 block of bytes, computes over all 64 (src - ref) differences:
;   *Sum = sum of the differences
;   *SSE = sum of the squared differences
; Always returns 0 in rax.
;
; Register roles:
;   rax = src row pointer        rcx = source_stride (sign-extended)
;   rbx = ref row pointer        rdx = recon_stride  (sign-extended)
;   mm5 = 4 x int16 partial sums of differences
;   mm6 = zero (for byte -> word unpacking)
;   mm7 = 2 x int32 partial sums of squared differences
;-----------------------------------------------------------------------
global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp,        16                  ; [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ; end prolog


        pxor        mm5,        mm5             ; clear difference accumulator
        pxor        mm6,        mm6             ; zero register for unpacking
        pxor        mm7,        mm7             ; clear squared-difference accumulator

        mov         rax,        arg(0)          ;[src_ptr]  ; Load base addresses
        mov         rbx,        arg(2)          ;[ref_ptr]
        movsxd      rcx,        dword ptr arg(1) ;[source_stride]
        movsxd      rdx,        dword ptr arg(3) ;[recon_stride]

        ; One expansion per row of the 8x8 block.
%rep 8
        movq        mm0,        [rax]           ; eight src bytes
        movq        mm1,        [rbx]           ; eight ref bytes
        movq        mm2,        mm0             ; copies for the high halves
        movq        mm3,        mm1

        punpcklbw   mm0,        mm6             ; low 4 bytes -> words
        punpcklbw   mm1,        mm6
        punpckhbw   mm2,        mm6             ; high 4 bytes -> words
        punpckhbw   mm3,        mm6
        psubsw      mm0,        mm1             ; src - ref (low half)
        psubsw      mm2,        mm3             ; src - ref (high half)

        paddw       mm5,        mm0             ; accumulate differences
        paddw       mm5,        mm2

        pmaddwd     mm0,        mm0             ; square and pair-add
        pmaddwd     mm2,        mm2
        paddd       mm7,        mm0             ; accumulate squared differences
        paddd       mm7,        mm2

        add         rax,        rcx             ; next src row
        add         rbx,        rdx             ; next ref row
%endrep

        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8],  mm5     ; spill the four word sums
        movq        QWORD PTR [rsp],    mm7     ; spill the two dword sums
        movsx       rdx,        WORD PTR [rsp+8]
        movsx       rcx,        WORD PTR [rsp+10]
        movsx       rbx,        WORD PTR [rsp+12]
        movsx       rax,        WORD PTR [rsp+14]
        add         rdx,        rcx
        add         rbx,        rax
        add         rdx,        rbx             ;XSum
        movsxd      rax,        DWORD PTR [rsp]
        movsxd      rcx,        DWORD PTR [rsp+4]
        add         rax,        rcx             ;XXSum
        mov         rsi,        arg(4)          ;SSE
        mov         rdi,        arg(5)          ;Sum
        mov         dword ptr [rsi],    eax
        mov         dword ptr [rdi],    edx
        xor         rax,        rax             ; return 0


    ; begin epilog
    add         rsp,        16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;-----------------------------------------------------------------------
;unsigned int
;vp8_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For a 4x4 block of bytes, computes over all 16 (src - ref) differences:
;   *Sum = sum of the differences
;   *SSE = sum of the squared differences
; Always returns 0 in rax.
;
; Register roles:
;   rax = src row pointer        rcx = source_stride (sign-extended)
;   rbx = ref row pointer        rdx = recon_stride  (sign-extended)
;   mm5 = 4 x int16 partial sums of differences
;   mm6 = zero (for byte -> word unpacking)
;   mm7 = 2 x int32 partial sums of squared differences
;-----------------------------------------------------------------------
global sym(vp8_get4x4var_mmx) PRIVATE
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp,        16                  ; [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ; end prolog


        pxor        mm5,        mm5             ; clear difference accumulator
        pxor        mm6,        mm6             ; zero register for unpacking
        pxor        mm7,        mm7             ; clear squared-difference accumulator

        mov         rax,        arg(0)          ;[src_ptr]  ; Load base addresses
        mov         rbx,        arg(2)          ;[ref_ptr]
        movsxd      rcx,        dword ptr arg(1) ;[source_stride]
        movsxd      rdx,        dword ptr arg(3) ;[recon_stride]

        ; One expansion per row of the 4x4 block.
        ; Rows are loaded with movd: only the low four bytes are consumed
        ; by punpcklbw, so a movq would read four bytes past each row.
%rep 4
        movd        mm0,        [rax]           ; four src bytes
        movd        mm1,        [rbx]           ; four ref bytes
        punpcklbw   mm0,        mm6             ; bytes -> words
        punpcklbw   mm1,        mm6
        psubsw      mm0,        mm1             ; src - ref

        paddw       mm5,        mm0             ; accumulate differences

        pmaddwd     mm0,        mm0             ; square and pair-add
        paddd       mm7,        mm0             ; accumulate squared differences

        add         rax,        rcx             ; next src row
        add         rbx,        rdx             ; next ref row
%endrep


        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8],  mm5     ; spill the four word sums
        movq        QWORD PTR [rsp],    mm7     ; spill the two dword sums
        movsx       rdx,        WORD PTR [rsp+8]
        movsx       rcx,        WORD PTR [rsp+10]
        movsx       rbx,        WORD PTR [rsp+12]
        movsx       rax,        WORD PTR [rsp+14]
        add         rdx,        rcx
        add         rbx,        rax
        add         rdx,        rbx             ;XSum
        movsxd      rax,        DWORD PTR [rsp]
        movsxd      rcx,        DWORD PTR [rsp+4]
        add         rax,        rcx             ;XXSum
        mov         rsi,        arg(4)          ;SSE
        mov         rdi,        arg(5)          ;Sum
        mov         dword ptr [rsi],    eax
        mov         dword ptr [rdi],    edx
        xor         rax,        rax             ; return 0


    ; begin epilog
    add         rsp,        16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;-----------------------------------------------------------------------
;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Sum of squared differences between a 4x4 block at src_ptr and a 4x4
; block at ref_ptr.
;
; Registers: rax = src row pointer, rbx = ref row pointer,
;            rcx = source_stride,   rdx = recon_stride,
;            mm6 = zero (for byte->word unpacking),
;            mm7 = two 32-bit partial sums of squared differences.
; Returns:   the total in the low 32 bits of rax.  With a 64-bit rax the
;            upper half still holds the second partial sum, so callers
;            must read only the 32-bit result.
; Saves:     rbx (the only callee-saved register that is modified).
; NOTE(review): no emms is issued here - presumably the caller resets the
;            MMX/x87 state; confirm against the call sites.
;-----------------------------------------------------------------------
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rbx                             ; rsi/rdi are never touched, so they are not saved
    ; end prolog


        pxor        mm6, mm6                    ; mm6 = 0, unpack helper
        pxor        mm7, mm7                    ; mm7 = 0, SSE accumulator

        mov         rax, arg(0)                 ; src_ptr
        mov         rbx, arg(2)                 ; ref_ptr
        movsxd      rcx, dword ptr arg(1)       ; source_stride
        movsxd      rdx, dword ptr arg(3)       ; recon_stride

        ; Row 1
        movd        mm0, [rax]                  ; four src bytes
        movd        mm1, [rbx]                  ; four ref bytes
        punpcklbw   mm0, mm6                    ; zero-extend bytes to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref, four signed words
        pmaddwd     mm0, mm0                    ; square, add adjacent pairs -> two dwords
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload four ref bytes of the next row
        paddd       mm7, mm0                    ; accumulate

        ; Row 2
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm0, mm6                    ; zero-extend bytes to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload four ref bytes of the next row
        paddd       mm7, mm0                    ; accumulate

        ; Row 3
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm1, mm6
        punpcklbw   mm0, mm6                    ; zero-extend bytes to words
        psubsw      mm0, mm1                    ; src - ref

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload four ref bytes of the last row
        paddd       mm7, mm0                    ; accumulate

        ; Row 4
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm0, mm6                    ; zero-extend bytes to words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        paddd       mm7, mm0                    ; accumulate

        ; Fold the two dword partial sums into the low dword.
        movq        mm0, mm7
        psrlq       mm7, 32

        paddd       mm0, mm7
        movq        rax, mm0                    ; result = low 32 bits


    ; begin epilog
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret

; Right shift applied after each bilinear filter pass; the rounding term
; mmx_bi_rd (64) is half of 1 << mmx_filter_shift.
%define mmx_filter_shift            7

;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal then two-tap vertical filter to a 4-pixel
; wide region of ref_ptr and compares the four filtered rows against
; src_ptr, writing the sum of differences to *sum and the sum of squared
; differences to *sumsquared.
;
; Five ref rows are read (one primer row plus four loop rows), five bytes
; from each ([rsi] and [rsi+1], four bytes apiece).
; HFilter / VFilter: four words at +0 multiply the first sample, four
; words at +8 multiply the second sample.
; NOTE(review): presumably each group of four words holds one replicated
; tap - confirm against the filter tables.
;
; Registers: rax = HFilter, rdx = VFilter, rsi = ref row, rdi = src row,
;            rcx = rows remaining, mm0 = zero,
;            mm5 = horizontally filtered previous row (four words),
;            mm6 = four 16-bit lanes of running difference sum,
;            mm7 = two 32-bit lanes of running squared-difference sum,
;            r8/r9 (64-bit builds only) = ref / src stride.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog


        pxor            mm6,            mm6                 ; sum lanes = 0
        pxor            mm7,            mm7                 ; sum-of-squares lanes = 0

        mov             rax,            arg(4)              ; HFilter
        mov             rdx,            arg(5)              ; VFilter

        mov             rsi,            arg(0)              ; ref_ptr
        mov             rdi,            arg(2)              ; src_ptr

        mov             rcx,            4                   ; four output rows
        pxor            mm0,            mm0                 ; zero for unpacking

        ; Primer: horizontally filter ref row 0 into mm5.
        movd            mm1,            [rsi]               ; pixels x .. x+3
        movd            mm3,            [rsi+1]             ; pixels x+1 .. x+4

        punpcklbw       mm1,            mm0                 ; bytes -> words
        pmullw          mm1,            [rax]               ; * first horizontal tap

        punpcklbw       mm3,            mm0                 ; bytes -> words
        pmullw          mm3,            [rax+8]             ; * second horizontal tap

        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; + rounding

        psraw           mm1,            mmx_filter_shift
        movq            mm5,            mm1                 ; keep as "previous row"

        ; Strides are loop invariant: the 64-bit build loads them once.
%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)    ; ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3)    ; src_pixels_per_line
        add             rsi,            r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:

        ; Horizontal pass on the current ref row.
        movd            mm1,            [rsi]
        movd            mm3,            [rsi+1]

        punpcklbw       mm1,            mm0
        pmullw          mm1,            [rax]

        punpcklbw       mm3,            mm0
        pmullw          mm3,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        movq            mm3,            mm5                 ; mm3 = previous filtered row

        ; Vertical pass: previous row * VFilter[0..3] + current row * VFilter[4..7].
        movq            mm5,            mm1                 ; current row becomes "previous"
        pmullw          mm3,            [rdx]

        pmullw          mm1,            [rdx+8]
        paddw           mm1,            mm3


        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift

        ; Difference against the source row.
        movd            mm3,            [rdi]
        punpcklbw       mm3,            mm0

        psubw           mm1,            mm3                 ; filtered - src
        paddw           mm6,            mm1                 ; accumulate sum (16-bit lanes)

        pmaddwd         mm1,            mm1                 ; squares, pairwise added
        paddd           mm7,            mm1                 ; accumulate sum of squares

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)    ; ref_pixels_per_line
        add             rdi,            dword ptr arg(3)    ; src_pixels_per_line
%else
        add             rsi,            r8
        add             rdi,            r9
%endif
        sub             rcx,            1
        jnz             .filter_block2d_bil4x4_var_mmx_loop


        ; Horizontal add of the four 16-bit sum lanes with sign extension:
        ; place each word in the top half of a dword, add, then shift
        ; arithmetically back down by 16.
        pxor            mm3,            mm3
        pxor            mm2,            mm2

        punpcklwd       mm2,            mm6
        punpckhwd       mm3,            mm6

        paddd           mm2,            mm3
        movq            mm6,            mm2

        psrlq           mm6,            32
        paddd           mm2,            mm6

        psrad           mm2,            16                  ; low dword = signed sum
        movq            mm4,            mm7

        ; Fold the two sum-of-squares dwords into the low dword.
        psrlq           mm4,            32
        paddd           mm4,            mm7

        mov             rdi,            arg(6)              ; sum
        mov             rsi,            arg(7)              ; sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4



    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret




;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel wide version of the bilinear filter + variance accumulation:
; filters Height rows of ref_ptr (two-tap horizontal, then two-tap
; vertical), subtracts the matching src_ptr rows, and writes the sum of
; differences to *sum and the sum of squared differences to *sumsquared.
;
; Height + 1 ref rows are read, nine bytes from each ([rsi] and [rsi+1],
; eight bytes apiece).  The row loop tests its counter at the bottom, so
; Height must be at least 1.
; The horizontally filtered previous row is kept packed to unsigned
; bytes (packuswb, saturating) in mm5 and unpacked again on use.
;
; Registers: rax = HFilter, rdx = VFilter, rsi = ref row, rdi = src row,
;            rcx = rows remaining, mm0 = zero,
;            mm5 = previous horizontally filtered row (eight bytes),
;            mm6 = four 16-bit lanes of running difference sum,
;            mm7 = two 32-bit lanes of running squared-difference sum,
;            r8/r9 (64-bit builds only) = ref / src stride.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        pxor            mm6,            mm6                 ; sum lanes = 0
        pxor            mm7,            mm7                 ; sum-of-squares lanes = 0
        mov             rax,            arg(5)              ; HFilter

        mov             rdx,            arg(6)              ; VFilter
        mov             rsi,            arg(0)              ; ref_ptr

        mov             rdi,            arg(2)              ; src_ptr
        movsxd          rcx,            dword ptr arg(4)    ; Height

        ; Primer: horizontally filter ref row 0 (low four pixels in
        ; mm1/mm3, high four in mm2/mm4) and pack the result into mm5.
        pxor            mm0,            mm0                 ; zero for unpacking
        movq            mm1,            [rsi]               ; pixels x .. x+7

        movq            mm3,            [rsi+1]             ; pixels x+1 .. x+8
        movq            mm2,            mm1

        movq            mm4,            mm3
        punpcklbw       mm1,            mm0

        punpckhbw       mm2,            mm0
        pmullw          mm1,            [rax]               ; * first horizontal tap

        pmullw          mm2,            [rax]
        punpcklbw       mm3,            mm0

        punpckhbw       mm4,            mm0
        pmullw          mm3,            [rax+8]             ; * second horizontal tap

        pmullw          mm4,            [rax+8]
        paddw           mm1,            mm3

        paddw           mm2,            mm4
        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; + rounding

        psraw           mm1,            mmx_filter_shift
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm2,            mmx_filter_shift
        movq            mm5,            mm1

        packuswb        mm5,            mm2                 ; previous row as eight bytes

        ; Strides are loop invariant: the 64-bit build loads them once.
%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)    ; ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3)    ; src_pixels_per_line
        add             rsi,            r8
%endif

.filter_block2d_bil_var_mmx_loop:

        ; Horizontal pass on the current ref row.
        movq            mm1,            [rsi]
        movq            mm3,            [rsi+1]

        movq            mm2,            mm1
        movq            mm4,            mm3

        punpcklbw       mm1,            mm0
        punpckhbw       mm2,            mm0

        pmullw          mm1,            [rax]
        pmullw          mm2,            [rax]

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        pmullw          mm3,            [rax+8]
        pmullw          mm4,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift

        paddw           mm2,            [GLOBAL(mmx_bi_rd)]
        psraw           mm2,            mmx_filter_shift

        ; Unpack the previous filtered row, then store the current one
        ; (packed) as the new "previous" row.
        movq            mm3,            mm5
        movq            mm4,            mm5

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        movq            mm5,            mm1
        packuswb        mm5,            mm2

        ; Vertical pass: previous row * VFilter[0..3] + current row * VFilter[4..7].
        pmullw          mm3,            [rdx]
        pmullw          mm4,            [rdx]

        pmullw          mm1,            [rdx+8]
        pmullw          mm2,            [rdx+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        psraw           mm2,            mmx_filter_shift

        ; Difference against the source row.
        movq            mm3,            [rdi]
        movq            mm4,            mm3

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        psubw           mm1,            mm3                 ; filtered - src, low four
        psubw           mm2,            mm4                 ; filtered - src, high four

        paddw           mm6,            mm1                 ; accumulate sum (16-bit lanes)
        pmaddwd         mm1,            mm1                 ; squares, pairwise added

        paddw           mm6,            mm2
        pmaddwd         mm2,            mm2

        paddd           mm7,            mm1                 ; accumulate sum of squares
        paddd           mm7,            mm2

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)    ; ref_pixels_per_line
        add             rdi,            dword ptr arg(3)    ; src_pixels_per_line
%else
        add             rsi,            r8
        add             rdi,            r9
%endif
        sub             rcx,            1
        jnz             .filter_block2d_bil_var_mmx_loop


        ; Horizontal add of the four 16-bit sum lanes with sign extension:
        ; place each word in the top half of a dword, add, then shift
        ; arithmetically back down by 16.
        pxor            mm3,            mm3
        pxor            mm2,            mm2

        punpcklwd       mm2,            mm6
        punpckhwd       mm3,            mm6

        paddd           mm2,            mm3
        movq            mm6,            mm2

        psrlq           mm6,            32
        paddd           mm2,            mm6

        psrad           mm2,            16                  ; low dword = signed sum
        movq            mm4,            mm7

        ; Fold the two sum-of-squares dwords into the low dword.
        psrlq           mm4,            32
        paddd           mm4,            mm7

        mov             rdi,            arg(7)              ; sum
        mov             rsi,            arg(8)              ; sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Rounding term added before each shift by mmx_filter_shift.
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
    times 4 dw 64