;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows and each of 16 columns x:
;     h(r)  = pavgb(ref[r][x], ref[r][x+1])     ; horizontal half-pel
;     pred  = pavgb(h(r), h(r+1))               ; vertical half-pel
;     diff  = pred - src[r][x]
; and stores  *sum = SUM(diff),  *sumsquared = SUM(diff * diff).
;
; Memory read: Height+1 rows of 17 bytes from ref_ptr, Height rows of
; 16 bytes from src_ptr.  All loads are unaligned-safe.
;
; Register roles inside the loop:
;     rsi = current ref row      rax = ref stride (sign-extended)
;     rdi = current src row      rdx = src stride (sign-extended)
;     rcx = rows remaining (> 0 on every iteration)
;     xmm0 = zero                xmm5 = horizontal average of previous row
;     xmm6 = 8 x int16 partial sums of diff
;     xmm7 = 4 x int32 partial sums of diff^2
;
; Each int16 lane of xmm6 absorbs two differences in [-255, 255] per row,
; so it cannot wrap while 510 * Height <= 32767, i.e. Height <= 64.
;
; Height == 0 stores 0 to *sum and *sumsquared without reading
; ref_ptr or src_ptr.
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance16x_h_sse2):
        push            rbp
        mov             rbp,            rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        GET_GOT         rbx
        push            rsi
        push            rdi
        ; end prolog

        pxor            xmm6,           xmm6                ; sum accumulator (8 x int16)
        pxor            xmm7,           xmm7                ; sse accumulator (4 x int32)
        pxor            xmm0,           xmm0                ; zero, used to widen bytes to words

        mov             rsi,            arg(0)              ; ref_ptr
        mov             rdi,            arg(2)              ; src_ptr
        movsxd          rcx,            dword ptr arg(4)    ; Height
        movsxd          rax,            dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ; src_pixels_per_line

        ; The loop below counts down with sub/jnz, so a zero height would
        ; wrap the counter; skip straight to the reduction instead.
        test            rcx,            rcx
        jz              .half_horiz_vert_variance16x_h_reduce

        movdqu          xmm5,           XMMWORD PTR [rsi]
        movdqu          xmm3,           XMMWORD PTR [rsi+1]
        pavgb           xmm5,           xmm3                ; xmm5 = horizontal average of row 0

        lea             rsi,            [rsi + rax]

.half_horiz_vert_variance16x_h_1:
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        pavgb           xmm1,           xmm2                ; xmm1 = horizontal average of row i+1

        pavgb           xmm5,           xmm1                ; xmm5 = vertical average of rows i, i+1

        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0                ; xmm5 = prediction columns 0..7 as words
        punpckhbw       xmm4,           xmm0                ; xmm4 = prediction columns 8..15 as words

        movq            xmm3,           QWORD PTR [rdi]     ; xmm3 = d0,d1,d2..d7
        punpcklbw       xmm3,           xmm0                ; widen to words
        psubw           xmm5,           xmm3                ; xmm5 = diff, columns 0..7

        movq            xmm3,           QWORD PTR [rdi+8]   ; xmm3 = d8,d9..d15
        punpcklbw       xmm3,           xmm0
        psubw           xmm4,           xmm3                ; xmm4 = diff, columns 8..15

        paddw           xmm6,           xmm5                ; accumulate column differences
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ; square and add adjacent pairs
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5                ; accumulate squared differences
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm1                ; keep row i+1 average for the next row

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_horiz_vert_variance16x_h_1

.half_horiz_vert_variance16x_h_reduce:
        pxor            xmm1,           xmm1
        pxor            xmm5,           xmm5

        ; Sign-extend the eight int16 sums to int32: place each word in the
        ; high half of a dword, then arithmetic-shift it back down.
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ; 4 x int32 partial sums
        movdqa          xmm1,           xmm0

        movdqa          xmm6,           xmm7
        punpckldq       xmm6,           xmm5                ; sse lanes 0,1 spread to dwords 0,2
        punpckhdq       xmm7,           xmm5                ; sse lanes 2,3 spread to dwords 0,2
        paddd           xmm6,           xmm7

        punpckldq       xmm0,           xmm5                ; same spreading for the sum lanes
        punpckhdq       xmm1,           xmm5
        paddd           xmm0,           xmm1

        movdqa          xmm7,           xmm6
        movdqa          xmm1,           xmm0

        psrldq          xmm7,           8                   ; bring dword 2 down to dword 0
        psrldq          xmm1,           8

        paddd           xmm6,           xmm7                ; low dword = total sse
        paddd           xmm0,           xmm1                ; low dword = total sum

        mov             rsi,            arg(5)              ; [Sum]
        mov             rdi,            arg(6)              ; [SSE]

        movd            [rsi],          xmm0
        movd            [rdi],          xmm6

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret

;-----------------------------------------------------------------------
;void vp9_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows and each of 16 columns x:
;     pred  = pavgb(ref[r][x], ref[r+1][x])     ; vertical half-pel
;     diff  = pred - src[r][x]
; and stores  *sum = SUM(diff),  *sumsquared = SUM(diff * diff).
;
; Memory read: Height+1 rows of 16 bytes from ref_ptr, Height rows of
; 16 bytes from src_ptr.  All loads are unaligned-safe.
;
; Register roles inside the loop:
;     rsi = current ref row      rax = ref stride (sign-extended)
;     rdi = current src row      rdx = src stride (sign-extended)
;     rcx = rows remaining (> 0 on every iteration)
;     xmm0 = zero                xmm5 = previous ref row
;     xmm6 = 8 x int16 partial sums of diff
;     xmm7 = 4 x int32 partial sums of diff^2
;
; The int16 sum lanes cannot wrap while Height <= 64 (two differences in
; [-255, 255] per lane per row).
;
; Height == 0 stores 0 to *sum and *sumsquared without reading
; ref_ptr or src_ptr.
;-----------------------------------------------------------------------
global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_vert_variance16x_h_sse2):
        push            rbp
        mov             rbp,            rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        GET_GOT         rbx
        push            rsi
        push            rdi
        ; end prolog

        pxor            xmm6,           xmm6                ; sum accumulator (8 x int16)
        pxor            xmm7,           xmm7                ; sse accumulator (4 x int32)
        pxor            xmm0,           xmm0                ; zero, used to widen bytes to words

        mov             rsi,            arg(0)              ; ref_ptr
        mov             rdi,            arg(2)              ; src_ptr
        movsxd          rcx,            dword ptr arg(4)    ; Height
        movsxd          rax,            dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ; src_pixels_per_line

        ; The loop below counts down with sub/jnz, so a zero height would
        ; wrap the counter; skip straight to the reduction instead.
        test            rcx,            rcx
        jz              .half_vert_variance16x_h_reduce

        movdqu          xmm5,           XMMWORD PTR [rsi]   ; xmm5 = ref row 0
        lea             rsi,            [rsi + rax]

.half_vert_variance16x_h_1:
        movdqu          xmm3,           XMMWORD PTR [rsi]   ; xmm3 = ref row i+1

        pavgb           xmm5,           xmm3                ; xmm5 = avg(row i, row i+1)
        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0                ; prediction columns 0..7 as words
        punpckhbw       xmm4,           xmm0                ; prediction columns 8..15 as words

        movq            xmm2,           QWORD PTR [rdi]     ; xmm2 = d0,d1,d2..d7
        punpcklbw       xmm2,           xmm0
        psubw           xmm5,           xmm2                ; xmm5 = diff, columns 0..7
        movq            xmm2,           QWORD PTR [rdi+8]   ; xmm2 = d8,d9..d15
        punpcklbw       xmm2,           xmm0
        psubw           xmm4,           xmm2                ; xmm4 = diff, columns 8..15

        paddw           xmm6,           xmm5                ; accumulate column differences
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ; square and add adjacent pairs
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5                ; accumulate squared differences
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm3                ; row i+1 becomes the previous row

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_vert_variance16x_h_1

.half_vert_variance16x_h_reduce:
        pxor            xmm1,           xmm1
        pxor            xmm5,           xmm5

        ; Sign-extend the eight int16 sums to int32: place each word in the
        ; high half of a dword, then arithmetic-shift it back down.
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ; 4 x int32 partial sums
        movdqa          xmm1,           xmm0

        movdqa          xmm6,           xmm7
        punpckldq       xmm6,           xmm5                ; sse lanes 0,1 spread to dwords 0,2
        punpckhdq       xmm7,           xmm5                ; sse lanes 2,3 spread to dwords 0,2
        paddd           xmm6,           xmm7

        punpckldq       xmm0,           xmm5                ; same spreading for the sum lanes
        punpckhdq       xmm1,           xmm5
        paddd           xmm0,           xmm1

        movdqa          xmm7,           xmm6
        movdqa          xmm1,           xmm0

        psrldq          xmm7,           8                   ; bring dword 2 down to dword 0
        psrldq          xmm1,           8

        paddd           xmm6,           xmm7                ; low dword = total sse
        paddd           xmm0,           xmm1                ; low dword = total sum

        mov             rsi,            arg(5)              ; [Sum]
        mov             rdi,            arg(6)              ; [SSE]

        movd            [rsi],          xmm0
        movd            [rdi],          xmm6

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret

;-----------------------------------------------------------------------
;void vp9_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows and each of 16 columns x:
;     pred  = pavgb(ref[r][x], ref[r][x+1])     ; horizontal half-pel
;     diff  = pred - src[r][x]
; and stores  *sum = SUM(diff),  *sumsquared = SUM(diff * diff).
;
; Memory read: Height rows of 17 bytes from ref_ptr, Height rows of
; 16 bytes from src_ptr.  All loads are unaligned-safe.
;
; Register roles inside the loop:
;     rsi = current ref row      rax = ref stride (sign-extended)
;     rdi = current src row      rdx = src stride (sign-extended)
;     rcx = rows remaining (> 0 on every iteration)
;     xmm0 = zero
;     xmm6 = 8 x int16 partial sums of diff
;     xmm7 = 4 x int32 partial sums of diff^2
;
; The int16 sum lanes cannot wrap while Height <= 64 (two differences in
; [-255, 255] per lane per row).
;
; Height == 0 stores 0 to *sum and *sumsquared without reading
; ref_ptr or src_ptr.
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance16x_h_sse2):
        push            rbp
        mov             rbp,            rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        GET_GOT         rbx
        push            rsi
        push            rdi
        ; end prolog

        pxor            xmm6,           xmm6                ; sum accumulator (8 x int16)
        pxor            xmm7,           xmm7                ; sse accumulator (4 x int32)
        pxor            xmm0,           xmm0                ; zero, used to widen bytes to words

        mov             rsi,            arg(0)              ; ref_ptr
        mov             rdi,            arg(2)              ; src_ptr
        movsxd          rcx,            dword ptr arg(4)    ; Height
        movsxd          rax,            dword ptr arg(1)    ; ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)    ; src_pixels_per_line

        ; The loop below counts down with sub/jnz, so a zero height would
        ; wrap the counter; skip straight to the reduction instead.
        test            rcx,            rcx
        jz              .half_horiz_variance16x_h_reduce

.half_horiz_variance16x_h_1:
        movdqu          xmm5,           XMMWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s15
        movdqu          xmm3,           XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16

        pavgb           xmm5,           xmm3                ; xmm5 = avg(xmm5,xmm3)
        movdqa          xmm1,           xmm5
        punpcklbw       xmm5,           xmm0                ; prediction columns 0..7 as words
        punpckhbw       xmm1,           xmm0                ; prediction columns 8..15 as words

        movq            xmm3,           QWORD PTR [rdi]     ; xmm3 = d0,d1,d2..d7
        punpcklbw       xmm3,           xmm0                ; widen to words
        movq            xmm2,           QWORD PTR [rdi+8]   ; xmm2 = d8,d9..d15
        punpcklbw       xmm2,           xmm0

        psubw           xmm5,           xmm3                ; xmm5 = diff, columns 0..7
        psubw           xmm1,           xmm2                ; xmm1 = diff, columns 8..15
        paddw           xmm6,           xmm5                ; accumulate column differences
        paddw           xmm6,           xmm1
        pmaddwd         xmm5,           xmm5                ; square and add adjacent pairs
        pmaddwd         xmm1,           xmm1
        paddd           xmm7,           xmm5                ; accumulate squared differences
        paddd           xmm7,           xmm1

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_horiz_variance16x_h_1

.half_horiz_variance16x_h_reduce:
        pxor            xmm1,           xmm1
        pxor            xmm5,           xmm5

        ; Sign-extend the eight int16 sums to int32: place each word in the
        ; high half of a dword, then arithmetic-shift it back down.
        punpcklwd       xmm0,           xmm6
        punpckhwd       xmm1,           xmm6
        psrad           xmm0,           16
        psrad           xmm1,           16
        paddd           xmm0,           xmm1                ; 4 x int32 partial sums
        movdqa          xmm1,           xmm0

        movdqa          xmm6,           xmm7
        punpckldq       xmm6,           xmm5                ; sse lanes 0,1 spread to dwords 0,2
        punpckhdq       xmm7,           xmm5                ; sse lanes 2,3 spread to dwords 0,2
        paddd           xmm6,           xmm7

        punpckldq       xmm0,           xmm5                ; same spreading for the sum lanes
        punpckhdq       xmm1,           xmm5
        paddd           xmm0,           xmm1

        movdqa          xmm7,           xmm6
        movdqa          xmm1,           xmm0

        psrldq          xmm7,           8                   ; bring dword 2 down to dword 0
        psrldq          xmm1,           8

        paddd           xmm6,           xmm7                ; low dword = total sse
        paddd           xmm0,           xmm1                ; low dword = total sum

        mov             rsi,            arg(5)              ; [Sum]
        mov             rdi,            arg(6)              ; [SSE]

        movd            [rsi],          xmm0
        movd            [rdi],          xmm6

        ; begin epilog
        pop             rdi
        pop             rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret