;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;macro in deblock functions
;
; FIRST_2_ROWS - first half of the deblocking filter, 16 byte lanes at once.
;   in:       xmm0 = centre pixels
;             xmm1, xmm3 = two neighbouring rows (or columns) of pixels
;             flimit = 16-byte memory operand with one threshold per byte
;                      lane; the user of the macro must %define it
;   out:      xmm5 = pavgb(xmm1, xmm3)
;             xmm7 = byte mask, 0xFF in every lane where the absolute
;                    difference between xmm0 and either neighbour is
;                    greater than or equal to the threshold
;   clobbers: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm6
;   xmm0 is not modified.
%macro FIRST_2_ROWS 0
        movdqa      xmm4,       xmm0
        movdqa      xmm6,       xmm0
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3

        ;calculate absolute value
        ; |a - b| is built from two saturating subtractions: one of
        ; (a - b) and (b - a) saturates to zero, so their sum is |a - b|.
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm1
        paddusb     xmm6,       xmm3

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm7,       xmm2

        ;get mask
        ; (flimit - diff) saturates to zero exactly when diff >= flimit, so
        ; the compare against zero marks the lanes that must not be filtered.
        psubusb     xmm2,       xmm4
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm7,       xmm1
        por         xmm7,       xmm2
%endmacro

; SECOND_2_ROWS - second half of the deblocking filter; must follow
; FIRST_2_ROWS with xmm0, xmm5 and xmm7 still intact.
;   in:       xmm0 = centre pixels
;             xmm1, xmm3 = the remaining two neighbours
;             xmm5, xmm7 = average and mask produced by FIRST_2_ROWS
;             flimit = same threshold operand as in FIRST_2_ROWS
;   out:      xmm0 = per byte lane, the original pixel where any of the four
;                    neighbour differences reached the threshold, otherwise
;                    pavgb(pavgb(xmm5, pavgb(xmm1, xmm3)), xmm0)
;   clobbers: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
%macro SECOND_2_ROWS 0
        movdqa      xmm6,       xmm0
        movdqa      xmm4,       xmm0
        movdqa      xmm2,       xmm1            ; keep xmm1 before it becomes an average
        pavgb       xmm1,       xmm3

        ;calculate absolute value
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm2
        paddusb     xmm4,       xmm3

        pavgb       xmm5,       xmm1            ; average of the two pair averages

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm3,       xmm2

        ;get mask
        psubusb     xmm2,       xmm6
        psubusb     xmm3,       xmm4
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm3,       xmm1

        ; accumulate into the mask started by FIRST_2_ROWS
        por         xmm7,       xmm2
        por         xmm7,       xmm3

        pavgb       xmm5,       xmm0            ; blend neighbours with the centre pixel

        ;decide if or not to use filtered value
        ; The two operands occupy disjoint byte lanes after the pand/pandn,
        ; so the saturating add simply merges them.
        pand        xmm0,       xmm7
        pandn       xmm7,       xmm5
        paddusb     xmm0,       xmm7
%endmacro

; UPDATE_FLIMIT - copy the next 16 threshold bytes from [rbx] to the stack
; slot at [rsp] and advance rbx by 16.
;   Both accesses use movdqa, so [rbx] and [rsp] must be 16-byte aligned.
;   clobbers: xmm2
%macro UPDATE_FLIMIT 0
        movdqa      xmm2,       XMMWORD PTR [rbx]
        movdqa      [rsp],      xmm2
        add         rbx,        16
%endmacro

;void vp8_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows: a vertical ("down") pass filters src into dst, 16
; pixels per step, using the rows two above and two below the current one;
; then a horizontal ("across") pass filters the freshly written dst row in
; place using the pixels two to the left and two to the right.
;
; Memory touched outside [0, cols) on every row:
;   - src is read at -2, -1, +1 and +2 strides from the current row
;   - dst gets its first pixel replicated into dst[-8..-1] and its last
;     pixel replicated into dst[cols..cols+7]; dst[-16..-1] is read back
;   - both passes advance 16 pixels at a time, so when cols is not a
;     multiple of 16 the down pass stores past dst[cols-1]
; flimits is consumed 16 bytes per 16-pixel step and restarted on every
; pass and every row; it is read with movdqa and must be 16-byte aligned.
;
; NOTE(review): mm0/mm1 are used and no emms is issued here - presumably
; the caller clears the MMX state afterwards; confirm against the C caller.
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; put flimit on stack
    mov         rbx, arg(5)           ;flimits ptr
    UPDATE_FLIMIT

%define flimit [rsp]

    mov         rsi, arg(0)           ;src_ptr
    mov         rdi, arg(1)           ;dst_ptr

    movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
    xor         rdx, rdx              ;col
.nextcol:
    ;load current and next 2 rows
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + rax]
    movdqu      xmm3, XMMWORD PTR [rsi + 2*rax]

    FIRST_2_ROWS

    ;load above 2 rows
    neg         rax                   ; negative stride to reach the rows above
    movdqu      xmm1, XMMWORD PTR [rsi + 2*rax]
    movdqu      xmm3, XMMWORD PTR [rsi + rax]

    SECOND_2_ROWS

    movdqu      XMMWORD PTR [rdi], xmm0

    neg         rax                   ; positive stride
    add         rsi, 16
    add         rdi, 16

    add         rdx, 16
    cmp         edx, dword arg(4)     ;cols
    jge         .downdone
    UPDATE_FLIMIT
    jmp         .nextcol

.downdone:
    ; done with the all cols, start the across filtering in place
    ; rdx holds the number of bytes advanced; rewind both row pointers
    sub         rsi, rdx
    sub         rdi, rdx

    mov         rbx, arg(5) ; flimits
    UPDATE_FLIMIT

    ; dup the first byte into the left border 8 times
    movq        mm1,   [rdi]
    punpcklbw   mm1,   mm1
    punpcklwd   mm1,   mm1
    punpckldq   mm1,   mm1
    mov         rdx,    -8
    movq        [rdi+rdx], mm1

    ; dup the last byte into the right border
    movsxd      rdx,    dword arg(4)
    movq        mm1,   [rdi + rdx + -1]
    punpcklbw   mm1,   mm1
    punpcklwd   mm1,   mm1
    punpckldq   mm1,   mm1
    movq        [rdi+rdx], mm1

    xor         rdx, rdx
    ; Filtered output is written one 16-byte step late (through mm0/mm1) so
    ; that the next step still reads unfiltered left neighbours. Preload the
    ; delay registers with the bytes already at dst[-16..-1] so the first
    ; delayed store rewrites them unchanged.
    movq        mm0, QWORD PTR [rdi-16];
    movq        mm1, QWORD PTR [rdi-8];

.acrossnextcol:
    movdqu      xmm0, XMMWORD PTR [rdi + rdx]
    movdqu      xmm1, XMMWORD PTR [rdi + rdx -2]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx -1]

    FIRST_2_ROWS

    movdqu      xmm1, XMMWORD PTR [rdi + rdx +1]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx +2]

    SECOND_2_ROWS

    movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
    movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
    ; park the 16 new result bytes in mm0 (low half) and mm1 (high half)
    movdq2q     mm0, xmm0
    psrldq      xmm0, 8
    movdq2q     mm1, xmm0

    add         rdx, 16
    cmp         edx, dword arg(4)     ;cols
    jge         .acrossdone
    UPDATE_FLIMIT
    jmp         .acrossnextcol

.acrossdone:
    ; last 16 pixels
    movq        QWORD PTR [rdi+rdx-16], mm0

    ; the upper 8 results are only written when rdx ended exactly on cols
    cmp         edx, dword arg(4)
    jne         .throw_last_8
    movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
    ; done with this row
    add         rsi, rax              ;next src line
    ; Sign-extend both strides, matching the initial load of
    ; src_pixels_per_line; a plain 32-bit mov would zero-extend and turn a
    ; negative stride into a large positive offset on 64-bit targets.
    movsxd      rax, DWORD PTR arg(3) ;dst_pixels_per_line
    add         rdi, rax              ;next destination
    movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line

    ; restart the threshold stream for the next row
    mov         rbx, arg(5)           ;flimits
    UPDATE_FLIMIT

    dec         rcx                   ;decrement count
    jnz         .nextrow              ;next row

    add rsp, 16
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit

;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
;
; In-place vertical filter, processed in strips of 8 columns. For every pixel
; a running sum and sum of squares over a 15-row window (7 rows above to 7
; rows below) are maintained. Where 15*sumsq - sum*sum < flimit the pixel is
; replaced by (pixel + sum + vp8_rv[...]) >> 4, otherwise it is kept.
;
; Results are held in a 16-entry ring buffer on the stack and written back 8
; rows late, so the window always reads unfiltered pixels.
;
; Memory touched outside the image: the first row is replicated into the 8
; rows above dst and the last row into the 8 rows below it.
; The shadowed copies of dst, rows and cols are modified during the call.
;
; NOTE(review): mm0 is used and no emms is issued here - presumably the
; caller clears the MMX state afterwards; confirm against the C caller.
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    ; the extra 8 iterations flush the delayed write-back of the last rows
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
            mov         rsi,        arg(0)  ; s
            pxor        xmm0,       xmm0        ; zero, used for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch       ;

            ; this copies the last row down into the border 8 rows
            mov         rdi,        rsi
            ; rows is a 32-bit int: load only its low dword and sign-extend
            ; rather than trusting the upper half of the 64-bit shadow slot
            movsxd      rdx,        dword ptr arg(2)
            sub         rdx,        9           ; (rows + 8) - 9 = index of the last row
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        xmm1,       QWORD ptr[rdi]              ; last row
            mov         rcx,        8
.init_borderd:                                                   ; initialize borders
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_borderd

            neg         rax                                     ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        xmm1,       QWORD ptr[rdi]              ; first row
            mov         rcx,        8
.init_border:                                                    ; initialize borders
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_border



            lea         rsi,        [rsi + rax*8];              ; rsi = s[-pitch*8]
            neg         rax                                     ; rax = +pitch again

            pxor        xmm5,       xmm5        ; sum, 8 words
            pxor        xmm6,       xmm6        ; sumsq, columns 0-3 (dwords)

            pxor        xmm7,       xmm7        ; sumsq, columns 4-7 (dwords)
            mov         rdi,        rsi

            mov         rcx,        15          ;

            ; prime the window with the 15 rows starting at s[-pitch*8];
            ; rdi ends up pointing at s[pitch*7]
.loop_initvar:
            movq        xmm1,       QWORD PTR [rdi];
            punpcklbw   xmm1,       xmm0        ;

            paddw       xmm5,       xmm1        ;
            pmullw      xmm1,       xmm1        ;

            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ;

            punpckhwd   xmm2,       xmm0        ;
            paddd       xmm6,       xmm1        ;

            paddd       xmm7,       xmm2        ;
            lea         rdi,        [rdi+rax]   ;

            dec         rcx
            jne         .loop_initvar
            ;save the var and sum
            xor         rdx,        rdx         ; row counter
.loop_row:
            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            ; slide the window: add the entering row, drop the leaving row
            paddw       xmm5,       xmm2
            psubw       xmm5,       xmm1

            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            ; xmm3 = 15 * sumsq for columns 0-3 (x*16 - x)
            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6
            movdqa      xmm1,       xmm5

            ; sum*sum as 32-bit values: low and high product halves
            ; interleaved into xmm1 (columns 0-3) and xmm2 (columns 4-7)
            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1

            pmulhw      xmm4,       xmm4
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4
            punpckhwd   xmm2,       xmm4

            ; xmm4 = 15 * sumsq for columns 4-7
            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            ; the sign bit becomes an all-ones mask where
            ; 15*sumsq - sum*sum < flimit
            psrad       xmm3,       31
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0

            movq        xmm1,       QWORD PTR [rsi+rax*8]       ; centre row of the window

            movq        xmm2,       xmm1                        ; keep the original pixels
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5
            mov         rcx,        rdx

            and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp8_rv))]
            movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
            movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            ; select the filtered value where the mask is set, else the original
            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3

            pandn       xmm3,       xmm2
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]

            ; nothing to write back until the ring buffer holds 8 rows
            cmp         edx,        8
            jl          .skip_assignment

            mov         rcx,        rdx
            sub         rcx,        8
            and         rcx,        15
            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
            movq        [rsi],      mm0

.skip_assignment:
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row

        ; s += 8, done at full pointer width: a 32-bit add on the shadowed
        ; argument would lose the carry into the upper half of a 64-bit
        ; pointer. rsi is reloaded from arg(0) at .loop_col, so it is free.
        mov         rsi, arg(0)
        add         rsi, 8
        mov         arg(0), rsi
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;
; Horizontal (in-place, row by row) post-processing filter.
; For each row a 15-sample sliding window (s[i-8] .. s[i+6]) keeps a running
; sum and sum of squares.  Where  15*sumsq - sum*sum < flimit  the pixel is
; replaced by (s[i] + sum + 8) >> 4, otherwise it is left untouched.
;
;   arg(0) src    - pointer to the first pixel of the first row; advanced by
;                   pitch after every row (the shadowed argument is modified)
;   arg(1) pitch  - byte distance between rows
;   arg(2) rows   - number of rows; counted down in place
;   arg(3) cols   - number of pixels per row
;   arg(4) flimit - variance threshold, replicated into four dwords on the stack
;
; The routine writes 8 bytes before the row start and 8 bytes after the row
; end (border duplication), so the caller must provide that much padding.
; MMX registers mm0/mm1 are used and no emms is executed here.
; NOTE(review): callers presumably clear the FPU/MMX state afterwards - confirm.
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; reserve a 16-byte aligned scratch slot; the matching "pop rsp" below
    ; undoes ALIGN_STACK (macro from x86_abi_support.asm)
    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]: flimit replicated into 4 dwords for psubd
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

    xor         rdx,        rdx ;sumsq=0;
    xor         rcx,        rcx ;sum=0;
    mov         rsi,        arg(0); s


    ; dup the first byte into the left border 8 times
    movq        mm1,        [rsi]
    punpcklbw   mm1,        mm1
    punpcklwd   mm1,        mm1
    punpckldq   mm1,        mm1

    mov         rdi,        -8
    movq        [rsi+rdi],  mm1

    ; dup the last byte into the right border
    ; rax (not rdx) holds cols here: rdx is the sumsq accumulator and must
    ; still be zero when .ip_var_loop starts.  rax is reloaded by the movzx
    ; at the top of that loop, so it is free to use as a temporary.
    movsxd      rax,        dword arg(3)
    movq        mm1,        [rsi + rax + -1]
    punpcklbw   mm1,        mm1
    punpcklwd   mm1,        mm1
    punpckldq   mm1,        mm1
    movq        [rsi+rax],  mm1

.ip_var_loop:
    ;for(i=-8;i<=6;i++)
    ;{
    ;   sumsq += s[i]*s[i];
    ;   sum   += s[i];
    ;}
    movzx       eax, byte [rsi+rdi]
    add         ecx, eax                ; sum   += s[i]
    mul         al                      ; ax = al*al (edx is not written)
    add         edx, eax                ; sumsq += s[i]*s[i]
    add         rdi, 1
    cmp         rdi, 6
    jle         .ip_var_loop


    ;mov         rax,    sumsq
    ;movd        xmm7,   rax
    movd        xmm7,       edx         ; xmm7 lane 0 = sumsq

    ;mov         rax,    sum
    ;movd        xmm6,   rax
    movd        xmm6,       ecx         ; xmm6 lane 0 = sum

    mov         rsi,        arg(0) ;s
    xor         rcx,        rcx         ; rcx = column index

    movsxd      rdx,        dword arg(3) ;cols
    add         rdx,        8           ; run 8 columns past the end to flush
                                        ; the delayed stores below
    pxor        mm0,        mm0
    pxor        mm1,        mm1

    pxor        xmm0,       xmm0        ; zero, used for unpacking/packing
.nextcol4:

    movd        xmm1,       DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
    movd        xmm2,       DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

    punpcklbw   xmm1,       xmm0                    ; expanding
    punpcklbw   xmm2,       xmm0                    ; expanding

    punpcklwd   xmm1,       xmm0                    ; expanding to dwords
    punpcklwd   xmm2,       xmm0                    ; expanding to dwords

    psubd       xmm2,       xmm1                    ; 7--8   8--7   9--6   10--5
    paddd       xmm1,       xmm1                    ; -8*2   -7*2   -6*2   -5*2

    paddd       xmm1,       xmm2                    ; 7+-8   8+-7   9+-6   10+-5
    pmaddwd     xmm1,       xmm2                    ; (in+out)*(in-out): change in sumsq

    paddd       xmm6,       xmm2                    ; lane 0: sum for column 0
    paddd       xmm7,       xmm1                    ; lane 0: sumsq for column 0

    pshufd      xmm6,       xmm6,   0               ; broadcast lane 0 to all lanes
    pshufd      xmm7,       xmm7,   0               ; broadcast lane 0 to all lanes

    psrldq      xmm1,       4                       ; delta1 delta2 delta3 0 (sumsq)
    psrldq      xmm2,       4                       ; delta1 delta2 delta3 0 (sum)

    pshufd      xmm3,       xmm1,   3               ; 0 delta1 delta1 delta1
    pshufd      xmm4,       xmm2,   3               ; 0 delta1 delta1 delta1

    paddd       xmm6,       xmm4
    paddd       xmm7,       xmm3

    pshufd      xmm3,       xmm1,   01011111b       ; 0 0 delta2 delta2
    pshufd      xmm4,       xmm2,   01011111b       ; 0 0 delta2 delta2

    paddd       xmm7,       xmm3
    paddd       xmm6,       xmm4

    pshufd      xmm3,       xmm1,   10111111b       ; 0 0 0 delta3
    pshufd      xmm4,       xmm2,   10111111b       ; 0 0 0 delta3

    paddd       xmm7,       xmm3                    ; xmm7 = sumsq for 4 columns
    paddd       xmm6,       xmm4                    ; xmm6 = sum   for 4 columns

    movdqa      xmm3,       xmm6
    pmaddwd     xmm3,       xmm3                    ; sum*sum

    movdqa      xmm5,       xmm7
    pslld       xmm5,       4

    psubd       xmm5,       xmm7                    ; sumsq*15
    psubd       xmm5,       xmm3                    ; sumsq*15 - sum*sum

    psubd       xmm5,       flimit4
    psrad       xmm5,       31                      ; all ones where below flimit

    packssdw    xmm5,       xmm0
    packsswb    xmm5,       xmm0                    ; per-byte select mask

    movd        xmm1,       DWORD PTR [rsi+rcx]
    movq        xmm2,       xmm1                    ; keep the unfiltered pixels

    punpcklbw   xmm1,       xmm0
    punpcklwd   xmm1,       xmm0

    paddd       xmm1,       xmm6
    paddd       xmm1,       [GLOBAL(four8s)]        ; + 8 for rounding

    psrad       xmm1,       4                       ; (s[i] + sum + 8) >> 4
    packssdw    xmm1,       xmm0

    packuswb    xmm1,       xmm0
    pand        xmm1,       xmm5                    ; filtered where mask set

    pandn       xmm5,       xmm2                    ; original elsewhere
    por         xmm5,       xmm1

    ; stores are delayed by two iterations (8 bytes) so the window's
    ; trailing reads at [rsi+rcx-8] still see unfiltered data
    movd        [rsi+rcx-8],  mm0
    movq        mm0,        mm1

    movdq2q     mm1,        xmm5
    psrldq      xmm7,       12                      ; carry last column's sumsq to lane 0

    psrldq      xmm6,       12                      ; carry last column's sum to lane 0
    add         rcx,        4

    cmp         rcx,        rdx
    jl          .nextcol4

    ;s+=pitch;
    movsxd      rax,        dword arg(1)
    add         arg(0),     rax

    sub         dword arg(2), 1 ;rows-=1
    cmp         dword arg(2), 0
    jg          .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; Adds noise to a plane in place, one row at a time.  For every row a new
; offset (rand() & 0xff) into the noise buffer is chosen; pixels are clamped
; with the black/both/white vectors before the noise bytes are added.
;
; Only arg(2) (blackclamp) is dereferenced: whiteclamp and bothclamp are read
; at +16 and +32 from it, so the three vectors must be contiguous in that
; order.  They are used as direct SSE memory operands and therefore must be
; 16-byte aligned.  Each row is processed in 16-byte steps until the offset
; reaches Width, so up to 15 bytes past Width may be read and written.
; arg(0) and arg(6) (the shadowed Start and Height) are modified.
extern sym(rand)
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call sym(rand) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff                   ; random offset 0..255 into the noise
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx            ; rdi = noise for this row
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax         ; rax = byte offset within the row

.addnoise_nextset:
            movdqu      xmm1,[rsi+rax]         ; get the source

            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     xmm1, [rdx+32] ;bothclamp
            psubusb     xmm1, [rdx+16] ;whiteclamp

            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
            paddb       xmm1,xmm2              ; add it in
            movdqu      [rsi+rax],xmm1         ; store the result

            add         rax,16                 ; move to the next 16 bytes

            cmp         rax, rcx
            jl          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; rounding constant (8 in each dword lane) added before the >> 4 in
; vp8_mbpost_proc_across_ip_xmm
four8s:
    times 4 dd 8