;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Shared building blocks.
;
; Every macro below expects mm7 == 0 on entry (it is used as the zero source
; when widening unsigned bytes to 16-bit words) and leaves mm7 untouched.
;-----------------------------------------------------------------------------

; VP8_SUBTRACT_4PX src, pred, diff
;   Loads 4 bytes from src and from pred, widens both to words, and stores
;   the four 16-bit differences (src - pred) as 8 bytes at diff.
;   Clobbers mm0, mm1.
%macro VP8_SUBTRACT_4PX 3
        movd        mm0,        %1
        movd        mm1,        %2
        punpcklbw   mm0,        mm7
        punpcklbw   mm1,        mm7
        psubw       mm0,        mm1
        movq        %3,         mm0
%endmacro

; VP8_SUBTRACT_8PX src, pred, diff_lo, diff_hi
;   Loads 8 bytes from src and from pred, widens both to words, and stores
;   the eight 16-bit differences (src - pred): pixels 0-3 go to diff_lo,
;   pixels 4-7 go to diff_hi.
;   Clobbers mm0, mm1, mm3, mm4.
%macro VP8_SUBTRACT_8PX 4
        movq        mm0,        %1
        movq        mm1,        %2
        movq        mm3,        mm0
        movq        mm4,        mm1
        punpcklbw   mm0,        mm7
        punpcklbw   mm1,        mm7
        punpckhbw   mm3,        mm7
        punpckhbw   mm4,        mm7
        psubw       mm0,        mm1
        psubw       mm3,        mm4
        movq        %3,         mm0
        movq        %4,         mm3
%endmacro

; VP8_SUBTRACT_8X4
;   Four 8-pixel rows. Source rows are rdx bytes apart starting at rsi,
;   predictor rows are 8 bytes apart starting at rax, diff rows are 16 bytes
;   apart starting at rdi.
;   On exit rsi has advanced by two source rows; rax and rdi are unchanged.
%macro VP8_SUBTRACT_8X4 0
        VP8_SUBTRACT_8PX [rsi],       [rax],    [rdi],    [rdi+8]
        VP8_SUBTRACT_8PX [rsi+rdx],   [rax+8],  [rdi+16], [rdi+24]
        VP8_SUBTRACT_8PX [rsi+rdx*2], [rax+16], [rdi+32], [rdi+40]
        ; x86 addressing has no *3 scale: step the base two rows forward
        ; so that row 3 can be reached as base + stride.
        lea         rsi,        [rsi+rdx*2]
        VP8_SUBTRACT_8PX [rsi+rdx],   [rax+24], [rdi+48], [rdi+56]
%endmacro

; VP8_SUBTRACT_8X8
;   Eight 8-pixel rows (one 8x8 plane) with the same layout as
;   VP8_SUBTRACT_8X4. Modifies rsi, rax and rdi.
%macro VP8_SUBTRACT_8X8 0
        VP8_SUBTRACT_8X4
        ; 4 diff rows * 16 bytes
        add         rdi,        64
        ; 4 predictor rows * 8 bytes
        add         rax,        32
        ; rsi is at row 2 after the first half; move it to row 4
        lea         rsi,        [rsi+rdx*2]
        VP8_SUBTRACT_8X4
%endmacro


;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
;                             short *diff, unsigned char *Predictor,
;                             int pitch);
;
; Computes diff = z - Predictor for one 4x4 block.
;   z         : source pixels, rows src_stride bytes apart
;   Predictor : predicted pixels, rows pitch bytes apart
;   diff      : 16-bit output, rows pitch shorts (pitch*2 bytes) apart
; No emms is executed before returning.
; NOTE(review): callers presumably reset the MMX/x87 state elsewhere - confirm.
global sym(vp8_subtract_b_mmx_impl)
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov         rdi,        arg(2)              ;diff
        mov         rax,        arg(3)              ;Predictor
        mov         rsi,        arg(0)              ;z
        movsxd      rdx,        dword ptr arg(1)    ;src_stride (sign-extended)
        movsxd      rcx,        dword ptr arg(4)    ;pitch (sign-extended)
        pxor        mm7,        mm7                 ;zero for byte->word widening

        ; rows 0, 1 and 2
        VP8_SUBTRACT_4PX [rsi],       [rax],       [rdi]
        VP8_SUBTRACT_4PX [rsi+rdx],   [rax+rcx],   [rdi+rcx*2]
        VP8_SUBTRACT_4PX [rsi+rdx*2], [rax+rcx*2], [rdi+rcx*4]

        ; row 3: pre-compute the offsets that cannot be expressed with a
        ; scale factor: rsi = z + 2*src_stride, rcx = 3*pitch
        lea         rsi,        [rsi+rdx*2]
        lea         rcx,        [rcx+rcx*2]
        VP8_SUBTRACT_4PX [rsi+rdx],   [rax+rcx],   [rdi+rcx*2]

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_subtract_mby_mmx(short *diff, unsigned char *src,
;                          unsigned char *pred, int stride)
;
; Computes diff = src - pred for a 16x16 luma block.
;   src  : source pixels, rows stride bytes apart
;   pred : predicted pixels, read as 16 contiguous rows of 16 bytes
;   diff : 16-bit output, written as 16 contiguous rows of 16 shorts
; No emms is executed before returning.
; NOTE(review): callers presumably reset the MMX/x87 state elsewhere - confirm.
global sym(vp8_subtract_mby_mmx)
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(1)              ;src
        mov         rdi,        arg(0)              ;diff

        mov         rax,        arg(2)              ;pred
        movsxd      rdx,        dword ptr arg(3)    ;stride (sign-extended)

        mov         rcx,        16                  ;row counter
        pxor        mm7,        mm7                 ;zero for byte->word widening

; One iteration handles one 16-pixel row as two 8-pixel halves.
.submby_loop:
        VP8_SUBTRACT_8PX [rsi],   [rax],   [rdi],    [rdi+8]
        VP8_SUBTRACT_8PX [rsi+8], [rax+8], [rdi+16], [rdi+24]

        ; 16 shorts of diff written
        add         rdi,        32
        ; 16 bytes of pred consumed
        add         rax,        16
        ; next source row
        lea         rsi,        [rsi+rdx]

        sub         rcx,        1
        jnz         .submby_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc,
;                           unsigned char *vsrc, unsigned char *pred,
;                           int stride)
;
; Computes the residual of the two 8x8 chroma planes:
;   diff[256..319] = usrc - pred[256..319]
;   diff[320..383] = vsrc - pred[320..383]
; usrc and vsrc rows are stride bytes apart; the pred and diff regions are
; accessed as 8 contiguous rows of 8 elements each.
; No emms is executed before returning.
; NOTE(review): callers presumably reset the MMX/x87 state elsewhere - confirm.
global sym(vp8_subtract_mbuv_mmx)
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        ; Loaded once for both planes: nothing below writes rdx or mm7.
        movsxd      rdx,        dword ptr arg(4)    ;stride (sign-extended)
        pxor        mm7,        mm7                 ;zero for byte->word widening

        ;short *udiff = diff + 256;
        ;unsigned char *upred = pred + 256;
        mov         rdi,        arg(0)              ;diff
        mov         rax,        arg(3)              ;pred
        mov         rsi,        arg(1)              ;z = usrc
        add         rdi,        256*2               ;diff = diff + 256 (shorts)
        add         rax,        256                 ;Predictor = pred + 256
        VP8_SUBTRACT_8X8

        ;short *vdiff = diff + 320;
        ;unsigned char *vpred = pred + 320;
        mov         rdi,        arg(0)              ;diff
        mov         rax,        arg(3)              ;pred
        mov         rsi,        arg(2)              ;z = vsrc
        add         rdi,        320*2               ;diff = diff + 320 (shorts)
        add         rax,        320                 ;Predictor = pred + 320
        VP8_SUBTRACT_8X8

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret