1; 2; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3; 4; Use of this source code is governed by a BSD-style license 5; that can be found in the LICENSE file in the root of the source 6; tree. An additional intellectual property rights grant can be found 7; in the file PATENTS. All contributing project authors may 8; be found in the AUTHORS file in the root of the source tree. 9; 10 11 12%include "vpx_ports/x86_abi_support.asm" 13 14 15;void vp8_loop_filter_horizontal_edge_mmx 16;( 17; unsigned char *src_ptr, 18; int src_pixel_step, 19; const char *flimit, 20; const char *limit, 21; const char *thresh, 22; int count 23;) 24global sym(vp8_loop_filter_horizontal_edge_mmx) 25sym(vp8_loop_filter_horizontal_edge_mmx): 26 push rbp 27 mov rbp, rsp 28 SHADOW_ARGS_TO_STACK 6 29 GET_GOT rbx 30 push rsi 31 push rdi 32 ; end prolog 33 34 ALIGN_STACK 16, rax 35 sub rsp, 32 ; reserve 32 bytes 36 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 37 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 38 39 mov rsi, arg(0) ;src_ptr 40 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
41 42 movsxd rcx, dword ptr arg(5) ;count 43next8_h: 44 mov rdx, arg(3) ;limit 45 movq mm7, [rdx] 46 mov rdi, rsi ; rdi points to row +1 for indirect addressing 47 add rdi, rax 48 49 ; calculate breakout conditions 50 movq mm2, [rdi+2*rax] ; q3 51 movq mm1, [rsi+2*rax] ; q2 52 movq mm6, mm1 ; q2 53 psubusb mm1, mm2 ; q2-=q3 54 psubusb mm2, mm6 ; q3-=q2 55 por mm1, mm2 ; abs(q3-q2) 56 psubusb mm1, mm7 ; 57 58 59 movq mm4, [rsi+rax] ; q1 60 movq mm3, mm4 ; q1 61 psubusb mm4, mm6 ; q1-=q2 62 psubusb mm6, mm3 ; q2-=q1 63 por mm4, mm6 ; abs(q2-q1) 64 65 psubusb mm4, mm7 66 por mm1, mm4 67 68 movq mm4, [rsi] ; q0 69 movq mm0, mm4 ; q0 70 psubusb mm4, mm3 ; q0-=q1 71 psubusb mm3, mm0 ; q1-=q0 72 por mm4, mm3 ; abs(q0-q1) 73 movq t0, mm4 ; save to t0 74 psubusb mm4, mm7 75 por mm1, mm4 76 77 78 neg rax ; negate pitch to deal with above border 79 80 movq mm2, [rsi+4*rax] ; p3 81 movq mm4, [rdi+4*rax] ; p2 82 movq mm5, mm4 ; p2 83 psubusb mm4, mm2 ; p2-=p3 84 psubusb mm2, mm5 ; p3-=p2 85 por mm4, mm2 ; abs(p3 - p2) 86 psubusb mm4, mm7 87 por mm1, mm4 88 89 90 movq mm4, [rsi+2*rax] ; p1 91 movq mm3, mm4 ; p1 92 psubusb mm4, mm5 ; p1-=p2 93 psubusb mm5, mm3 ; p2-=p1 94 por mm4, mm5 ; abs(p2 - p1) 95 psubusb mm4, mm7 96 por mm1, mm4 97 98 movq mm2, mm3 ; p1 99 100 movq mm4, [rsi+rax] ; p0 101 movq mm5, mm4 ; p0 102 psubusb mm4, mm3 ; p0-=p1 103 psubusb mm3, mm5 ; p1-=p0 104 por mm4, mm3 ; abs(p1 - p0) 105 movq t1, mm4 ; save to t1 106 psubusb mm4, mm7 107 por mm1, mm4 108 109 movq mm3, [rdi] ; q1 110 movq mm4, mm3 ; q1 111 psubusb mm3, mm2 ; q1-=p1 112 psubusb mm2, mm4 ; p1-=q1 113 por mm2, mm3 ; abs(p1-q1) 114 pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero 115 psrlw mm2, 1 ; abs(p1-q1)/2 116 117 movq mm6, mm5 ; p0 118 movq mm3, [rsi] ; q0 119 psubusb mm5, mm3 ; p0-=q0 120 psubusb mm3, mm6 ; q0-=p0 121 por mm5, mm3 ; abs(p0 - q0) 122 paddusb mm5, mm5 ; abs(p0-q0)*2 123 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 124 125 mov rdx, arg(2) ;flimit ; get flimit 126 
movq mm2, [rdx] ; flimit mm2 127 paddb mm2, mm2 ; flimit*2 (less than 255) 128 paddb mm7, mm2 ; flimit * 2 + limit (less than 255) 129 130 psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 131 por mm1, mm5 132 pxor mm5, mm5 133 pcmpeqb mm1, mm5 ; mask mm1 134 135 ; calculate high edge variance 136 mov rdx, arg(4) ;thresh ; get thresh 137 movq mm7, [rdx] ; 138 movq mm4, t0 ; get abs (q1 - q0) 139 psubusb mm4, mm7 140 movq mm3, t1 ; get abs (p1 - p0) 141 psubusb mm3, mm7 142 paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 143 144 pcmpeqb mm4, mm5 145 146 pcmpeqb mm5, mm5 147 pxor mm4, mm5 148 149 150 ; start work on filters 151 movq mm2, [rsi+2*rax] ; p1 152 movq mm7, [rdi] ; q1 153 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values 154 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values 155 psubsb mm2, mm7 ; p1 - q1 156 pand mm2, mm4 ; high var mask (hvm)(p1 - q1) 157 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 158 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values 159 movq mm3, mm0 ; q0 160 psubsb mm0, mm6 ; q0 - p0 161 paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) 162 paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) 163 paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) 164 pand mm1, mm2 ; mask filter values we don't care about 165 movq mm2, mm1 166 paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 167 paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 168 169 pxor mm0, mm0 ; 170 pxor mm5, mm5 171 punpcklbw mm0, mm2 ; 172 punpckhbw mm5, mm2 ; 173 psraw mm0, 11 ; 174 psraw mm5, 11 175 packsswb mm0, mm5 176 movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; 177 178 pxor mm0, mm0 ; 0 179 movq mm5, mm1 ; abcdefgh 180 punpcklbw mm0, mm1 ; e0f0g0h0 181 psraw mm0, 11 ; sign extended shift right by 3 182 pxor mm1, mm1 ; 0 183 punpckhbw mm1, mm5 ; a0b0c0d0 184 psraw mm1, 11 ; sign extended shift right by 3 185 movq mm5, mm0 ; save results 186 187 packsswb mm0, mm1 ; 
(3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 188 paddsw mm5, [GLOBAL(ones)] 189 paddsw mm1, [GLOBAL(ones)] 190 psraw mm5, 1 ; partial shifted one more time for 2nd tap 191 psraw mm1, 1 ; partial shifted one more time for 2nd tap 192 packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 193 pandn mm4, mm5 ; high edge variance additive 194 195 paddsb mm6, mm2 ; p0+= p0 add 196 pxor mm6, [GLOBAL(t80)] ; unoffset 197 movq [rsi+rax], mm6 ; write back 198 199 movq mm6, [rsi+2*rax] ; p1 200 pxor mm6, [GLOBAL(t80)] ; reoffset 201 paddsb mm6, mm4 ; p1+= p1 add 202 pxor mm6, [GLOBAL(t80)] ; unoffset 203 movq [rsi+2*rax], mm6 ; write back 204 205 psubsb mm3, mm0 ; q0-= q0 add 206 pxor mm3, [GLOBAL(t80)] ; unoffset 207 movq [rsi], mm3 ; write back 208 209 psubsb mm7, mm4 ; q1-= q1 add 210 pxor mm7, [GLOBAL(t80)] ; unoffset 211 movq [rdi], mm7 ; write back 212 213 add rsi,8 214 neg rax 215 dec rcx 216 jnz next8_h 217 218 add rsp, 32 219 pop rsp 220 ; begin epilog 221 pop rdi 222 pop rsi 223 RESTORE_GOT 224 UNSHADOW_ARGS 225 pop rbp 226 ret 227 228 229;void vp8_loop_filter_vertical_edge_mmx 230;( 231; unsigned char *src_ptr, 232; int src_pixel_step, 233; const char *flimit, 234; const char *limit, 235; const char *thresh, 236; int count 237;) 238global sym(vp8_loop_filter_vertical_edge_mmx) 239sym(vp8_loop_filter_vertical_edge_mmx): 240 push rbp 241 mov rbp, rsp 242 SHADOW_ARGS_TO_STACK 6 243 GET_GOT rbx 244 push rsi 245 push rdi 246 ; end prolog 247 248 ALIGN_STACK 16, rax 249 sub rsp, 64 ; reserve 64 bytes 250 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 251 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 252 %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; 253 254 mov rsi, arg(0) ;src_ptr 255 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
256 257 lea rsi, [rsi + rax*4 - 4] 258 259 movsxd rcx, dword ptr arg(5) ;count 260next8_v: 261 mov rdi, rsi ; rdi points to row +1 for indirect addressing 262 add rdi, rax 263 264 265 ;transpose 266 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 267 movq mm7, mm6 ; 77 76 75 74 73 72 71 70 268 269 punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 270 punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 271 272 movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 273 movq mm5, mm4 ; 47 46 45 44 43 42 41 40 274 275 punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 276 punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 277 278 movq mm3, mm5 ; 57 47 56 46 55 45 54 44 279 punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 280 281 punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 282 movq mm2, mm4 ; 53 43 52 42 51 41 50 40 283 284 punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 285 punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 286 287 neg rax 288 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 289 290 movq mm1, mm6 ; 27 26 25 24 23 22 21 20 291 punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 292 293 punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 294 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 295 296 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 297 movq mm0, mm7 ; 17 07 16 06 15 05 14 04 298 299 punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 300 punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 301 302 movq mm6, mm7 ; 37 27 17 07 36 26 16 06 303 punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 304 305 punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 306 307 movq mm5, mm6 ; 76 66 56 46 36 26 16 06 308 psubusb mm5, mm7 ; q2-q3 309 310 psubusb mm7, mm6 ; q3-q2 311 por mm7, mm5; ; mm7=abs (q3-q2) 312 313 movq mm5, mm0 ; 35 25 15 05 34 24 14 04 314 punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 315 316 punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 317 movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 318 319 psubusb mm3, mm6 ; q1-q2 320 psubusb mm6, mm5 ; q2-q1 321 322 por mm6, 
mm3 ; mm6=abs(q2-q1) 323 lea rdx, srct 324 325 movq [rdx+24], mm5 ; save q1 326 movq [rdx+16], mm0 ; save q0 327 328 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 329 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 330 331 movq mm0, mm3 ; 13 03 12 02 11 01 10 00 332 punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 333 334 punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 335 movq mm1, mm0 ; 31 21 11 01 30 20 10 00 336 337 punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 338 punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 339 340 movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 341 psubusb mm2, mm0 ; p2-p3 342 343 psubusb mm0, mm1 ; p3-p2 344 por mm0, mm2 ; mm0=abs(p3-p2) 345 346 movq mm2, mm3 ; 33 23 13 03 32 22 12 02 347 punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 348 349 punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 350 movq [rdx+8], mm3 ; save p0 351 352 movq [rdx], mm2 ; save p1 353 movq mm5, mm2 ; mm5 = p1 354 355 psubusb mm2, mm1 ; p1-p2 356 psubusb mm1, mm5 ; p2-p1 357 358 por mm1, mm2 ; mm1=abs(p2-p1) 359 mov rdx, arg(3) ;limit 360 361 movq mm4, [rdx] ; mm4 = limit 362 psubusb mm7, mm4 363 364 psubusb mm0, mm4 365 psubusb mm1, mm4 366 367 psubusb mm6, mm4 368 por mm7, mm6 369 370 por mm0, mm1 371 por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit 372 373 movq mm1, mm5 ; p1 374 375 movq mm7, mm3 ; mm3=mm7=p0 376 psubusb mm7, mm5 ; p0 - p1 377 378 psubusb mm5, mm3 ; p1 - p0 379 por mm5, mm7 ; abs(p1-p0) 380 381 movq t0, mm5 ; save abs(p1-p0) 382 lea rdx, srct 383 384 psubusb mm5, mm4 385 por mm0, mm5 ; mm0=mask 386 387 movq mm5, [rdx+16] ; mm5=q0 388 movq mm7, [rdx+24] ; mm7=q1 389 390 movq mm6, mm5 ; mm6=q0 391 movq mm2, mm7 ; q1 392 psubusb mm5, mm7 ; q0-q1 393 394 psubusb mm7, mm6 ; q1-q0 395 por mm7, mm5 ; abs(q1-q0) 396 397 movq t1, mm7 ; save abs(q1-q0) 398 psubusb mm7, mm4 399 400 por mm0, mm7 ; mask 401 402 movq mm5, mm2 ; q1 403 psubusb mm5, mm1 ; q1-=p1 404 psubusb mm1, mm2 ; p1-=q1 405 por mm5, 
mm1 ; abs(p1-q1) 406 pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero 407 psrlw mm5, 1 ; abs(p1-q1)/2 408 409 mov rdx, arg(2) ;flimit ; 410 411 movq mm2, [rdx] ;flimit mm2 412 movq mm1, mm3 ; mm1=mm3=p0 413 414 movq mm7, mm6 ; mm7=mm6=q0 415 psubusb mm1, mm7 ; p0-q0 416 417 psubusb mm7, mm3 ; q0-p0 418 por mm1, mm7 ; abs(q0-p0) 419 paddusb mm1, mm1 ; abs(q0-p0)*2 420 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 421 422 paddb mm2, mm2 ; flimit*2 (less than 255) 423 paddb mm4, mm2 ; flimit * 2 + limit (less than 255) 424 425 psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 426 por mm1, mm0; ; mask 427 428 pxor mm0, mm0 429 pcmpeqb mm1, mm0 430 431 ; calculate high edge variance 432 mov rdx, arg(4) ;thresh ; get thresh 433 movq mm7, [rdx] 434 ; 435 movq mm4, t0 ; get abs (q1 - q0) 436 psubusb mm4, mm7 437 438 movq mm3, t1 ; get abs (p1 - p0) 439 psubusb mm3, mm7 440 441 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 442 pcmpeqb mm4, mm0 443 444 pcmpeqb mm0, mm0 445 pxor mm4, mm0 446 447 448 449 ; start work on filters 450 lea rdx, srct 451 452 movq mm2, [rdx] ; p1 453 movq mm7, [rdx+24] ; q1 454 455 movq mm6, [rdx+8] ; p0 456 movq mm0, [rdx+16] ; q0 457 458 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values 459 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values 460 461 psubsb mm2, mm7 ; p1 - q1 462 pand mm2, mm4 ; high var mask (hvm)(p1 - q1) 463 464 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 465 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values 466 467 movq mm3, mm0 ; q0 468 psubsb mm0, mm6 ; q0 - p0 469 470 paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) 471 paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) 472 473 paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) 474 pand mm1, mm2 ; mask filter values we don't care about 475 476 movq mm2, mm1 477 paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 478 479 paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 
3 480 pxor mm0, mm0 ; 481 482 pxor mm5, mm5 483 punpcklbw mm0, mm2 ; 484 485 punpckhbw mm5, mm2 ; 486 psraw mm0, 11 ; 487 488 psraw mm5, 11 489 packsswb mm0, mm5 490 491 movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; 492 493 pxor mm0, mm0 ; 0 494 movq mm5, mm1 ; abcdefgh 495 496 punpcklbw mm0, mm1 ; e0f0g0h0 497 psraw mm0, 11 ; sign extended shift right by 3 498 499 pxor mm1, mm1 ; 0 500 punpckhbw mm1, mm5 ; a0b0c0d0 501 502 psraw mm1, 11 ; sign extended shift right by 3 503 movq mm5, mm0 ; save results 504 505 packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 506 paddsw mm5, [GLOBAL(ones)] 507 508 paddsw mm1, [GLOBAL(ones)] 509 psraw mm5, 1 ; partial shifted one more time for 2nd tap 510 511 psraw mm1, 1 ; partial shifted one more time for 2nd tap 512 packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 513 514 pandn mm4, mm5 ; high edge variance additive 515 516 paddsb mm6, mm2 ; p0+= p0 add 517 pxor mm6, [GLOBAL(t80)] ; unoffset 518 519 ; mm6=p0 ; 520 movq mm1, [rdx] ; p1 521 pxor mm1, [GLOBAL(t80)] ; reoffset 522 523 paddsb mm1, mm4 ; p1+= p1 add 524 pxor mm1, [GLOBAL(t80)] ; unoffset 525 ; mm6 = p0 mm1 = p1 526 527 psubsb mm3, mm0 ; q0-= q0 add 528 pxor mm3, [GLOBAL(t80)] ; unoffset 529 530 ; mm3 = q0 531 psubsb mm7, mm4 ; q1-= q1 add 532 pxor mm7, [GLOBAL(t80)] ; unoffset 533 ; mm7 = q1 534 535 ; tranpose and write back 536 ; mm1 = 72 62 52 42 32 22 12 02 537 ; mm6 = 73 63 53 43 33 23 13 03 538 ; mm3 = 74 64 54 44 34 24 14 04 539 ; mm7 = 75 65 55 45 35 25 15 05 540 541 movq mm2, mm1 ; 72 62 52 42 32 22 12 02 542 punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 543 544 movq mm4, mm3 ; 74 64 54 44 34 24 14 04 545 punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 546 547 punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 548 punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 549 550 movq mm6, mm2 ; 33 32 23 22 13 12 03 02 551 punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 552 553 punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 554 movq mm5, mm1 ; 73 72 63 62 53 52 
43 42 555 556 punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 557 punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 558 559 560 ; mm2 = 15 14 13 12 05 04 03 02 561 ; mm6 = 35 34 33 32 25 24 23 22 562 ; mm5 = 55 54 53 52 45 44 43 42 563 ; mm1 = 75 74 73 72 65 64 63 62 564 565 566 567 movd [rsi+rax*4+2], mm2 568 psrlq mm2, 32 569 570 movd [rdi+rax*4+2], mm2 571 movd [rsi+rax*2+2], mm6 572 573 psrlq mm6, 32 574 movd [rsi+rax+2],mm6 575 576 movd [rsi+2], mm1 577 psrlq mm1, 32 578 579 movd [rdi+2], mm1 580 neg rax 581 582 movd [rdi+rax+2],mm5 583 psrlq mm5, 32 584 585 movd [rdi+rax*2+2], mm5 586 587 lea rsi, [rsi+rax*8] 588 dec rcx 589 jnz next8_v 590 591 add rsp, 64 592 pop rsp 593 ; begin epilog 594 pop rdi 595 pop rsi 596 RESTORE_GOT 597 UNSHADOW_ARGS 598 pop rbp 599 ret 600 601 602;void vp8_mbloop_filter_horizontal_edge_mmx 603;( 604; unsigned char *src_ptr, 605; int src_pixel_step, 606; const char *flimit, 607; const char *limit, 608; const char *thresh, 609; int count 610;) 611global sym(vp8_mbloop_filter_horizontal_edge_mmx) 612sym(vp8_mbloop_filter_horizontal_edge_mmx): 613 push rbp 614 mov rbp, rsp 615 SHADOW_ARGS_TO_STACK 6 616 GET_GOT rbx 617 push rsi 618 push rdi 619 ; end prolog 620 621 ALIGN_STACK 16, rax 622 sub rsp, 32 ; reserve 32 bytes 623 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 624 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 625 626 mov rsi, arg(0) ;src_ptr 627 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
628 629 movsxd rcx, dword ptr arg(5) ;count 630next8_mbh: 631 mov rdx, arg(3) ;limit 632 movq mm7, [rdx] 633 mov rdi, rsi ; rdi points to row +1 for indirect addressing 634 add rdi, rax 635 636 ; calculate breakout conditions 637 movq mm2, [rdi+2*rax] ; q3 638 639 movq mm1, [rsi+2*rax] ; q2 640 movq mm6, mm1 ; q2 641 psubusb mm1, mm2 ; q2-=q3 642 psubusb mm2, mm6 ; q3-=q2 643 por mm1, mm2 ; abs(q3-q2) 644 psubusb mm1, mm7 645 646 647 ; mm1 = abs(q3-q2), mm6 =q2, mm7 = limit 648 movq mm4, [rsi+rax] ; q1 649 movq mm3, mm4 ; q1 650 psubusb mm4, mm6 ; q1-=q2 651 psubusb mm6, mm3 ; q2-=q1 652 por mm4, mm6 ; abs(q2-q1) 653 psubusb mm4, mm7 654 por mm1, mm4 655 656 657 ; mm1 = mask, mm3=q1, mm7 = limit 658 659 movq mm4, [rsi] ; q0 660 movq mm0, mm4 ; q0 661 psubusb mm4, mm3 ; q0-=q1 662 psubusb mm3, mm0 ; q1-=q0 663 por mm4, mm3 ; abs(q0-q1) 664 movq t0, mm4 ; save to t0 665 psubusb mm4, mm7 666 por mm1, mm4 667 668 669 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 670 671 neg rax ; negate pitch to deal with above border 672 673 movq mm2, [rsi+4*rax] ; p3 674 movq mm4, [rdi+4*rax] ; p2 675 movq mm5, mm4 ; p2 676 psubusb mm4, mm2 ; p2-=p3 677 psubusb mm2, mm5 ; p3-=p2 678 por mm4, mm2 ; abs(p3 - p2) 679 psubusb mm4, mm7 680 por mm1, mm4 681 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 682 683 movq mm4, [rsi+2*rax] ; p1 684 movq mm3, mm4 ; p1 685 psubusb mm4, mm5 ; p1-=p2 686 psubusb mm5, mm3 ; p2-=p1 687 por mm4, mm5 ; abs(p2 - p1) 688 psubusb mm4, mm7 689 por mm1, mm4 690 691 movq mm2, mm3 ; p1 692 693 694 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) 695 696 movq mm4, [rsi+rax] ; p0 697 movq mm5, mm4 ; p0 698 psubusb mm4, mm3 ; p0-=p1 699 psubusb mm3, mm5 ; p1-=p0 700 por mm4, mm3 ; abs(p1 - p0) 701 movq t1, mm4 ; save to t1 702 psubusb mm4, mm7 703 por mm1, mm4 704 ; mm1 = mask, mm0=q0, mm7 = limit, t0 = abs(q0-q1) t1 = abs(p1-p0) 705 ; mm5 = p0 706 movq mm3, [rdi] ; q1 707 movq mm4, mm3 ; q1 708 psubusb mm3, mm2 ; q1-=p1 709 psubusb mm2, mm4 ; p1-=q1 
710 por mm2, mm3 ; abs(p1-q1) 711 pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero 712 psrlw mm2, 1 ; abs(p1-q1)/2 713 714 movq mm6, mm5 ; p0 715 movq mm3, mm0 ; q0 716 psubusb mm5, mm3 ; p0-=q0 717 psubusb mm3, mm6 ; q0-=p0 718 por mm5, mm3 ; abs(p0 - q0) 719 paddusb mm5, mm5 ; abs(p0-q0)*2 720 paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 721 722 mov rdx, arg(2) ;flimit ; get flimit 723 movq mm2, [rdx] ; flimit mm2 724 paddb mm2, mm2 ; flimit*2 (less than 255) 725 paddb mm7, mm2 ; flimit * 2 + limit (less than 255) 726 727 psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 728 por mm1, mm5 729 pxor mm5, mm5 730 pcmpeqb mm1, mm5 ; mask mm1 731 732 ; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0) 733 ; mm6 = p0, 734 735 ; calculate high edge variance 736 mov rdx, arg(4) ;thresh ; get thresh 737 movq mm7, [rdx] ; 738 movq mm4, t0 ; get abs (q1 - q0) 739 psubusb mm4, mm7 740 movq mm3, t1 ; get abs (p1 - p0) 741 psubusb mm3, mm7 742 paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 743 744 pcmpeqb mm4, mm5 745 746 pcmpeqb mm5, mm5 747 pxor mm4, mm5 748 749 750 751 ; mm1 = mask, mm0=q0, mm7 = thresh, t0 = abs(q0-q1) t1 = abs(p1-p0) 752 ; mm6 = p0, mm4=hev 753 ; start work on filters 754 movq mm2, [rsi+2*rax] ; p1 755 movq mm7, [rdi] ; q1 756 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values 757 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values 758 psubsb mm2, mm7 ; p1 - q1 759 760 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 761 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values 762 movq mm3, mm0 ; q0 763 psubsb mm0, mm6 ; q0 - p0 764 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) 765 paddsb mm2, mm0 ; 2 * (q0 - p0) 766 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) 767 pand mm1, mm2 ; mask filter values we don't care about 768 769 770 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 771 movq mm2, mm1 ; vp8_filter 772 pand mm2, mm4; ; Filter2 = vp8_filter & hev 
773 774 movq mm5, mm2 ; 775 paddsb mm5, [GLOBAL(t3)]; 776 777 pxor mm0, mm0 ; 0 778 pxor mm7, mm7 ; 0 779 780 punpcklbw mm0, mm5 ; e0f0g0h0 781 psraw mm0, 11 ; sign extended shift right by 3 782 punpckhbw mm7, mm5 ; a0b0c0d0 783 psraw mm7, 11 ; sign extended shift right by 3 784 packsswb mm0, mm7 ; Filter2 >>=3; 785 786 movq mm5, mm0 ; Filter2 787 788 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) 789 pxor mm0, mm0 ; 0 790 pxor mm7, mm7 ; 0 791 792 punpcklbw mm0, mm2 ; e0f0g0h0 793 psraw mm0, 11 ; sign extended shift right by 3 794 punpckhbw mm7, mm2 ; a0b0c0d0 795 psraw mm7, 11 ; sign extended shift right by 3 796 packsswb mm0, mm7 ; Filter2 >>=3; 797 798 ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 799 psubsb mm3, mm0 ; qs0 =qs0 - filter1 800 paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 801 802 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 803 ; vp8_filter &= ~hev; 804 ; Filter2 = vp8_filter; 805 pandn mm4, mm1 ; vp8_filter&=~hev 806 807 808 ; mm3=qs0, mm4=filter2, mm6=ps0 809 810 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); 811 ; s = vp8_signed_char_clamp(qs0 - u); 812 ; *oq0 = s^0x80; 813 ; s = vp8_signed_char_clamp(ps0 + u); 814 ; *op0 = s^0x80; 815 pxor mm0, mm0 816 817 pxor mm1, mm1 818 pxor mm2, mm2 819 punpcklbw mm1, mm4 820 punpckhbw mm2, mm4 821 pmulhw mm1, [GLOBAL(s27)] 822 pmulhw mm2, [GLOBAL(s27)] 823 paddw mm1, [GLOBAL(s63)] 824 paddw mm2, [GLOBAL(s63)] 825 psraw mm1, 7 826 psraw mm2, 7 827 packsswb mm1, mm2 828 829 psubsb mm3, mm1 830 paddsb mm6, mm1 831 832 pxor mm3, [GLOBAL(t80)] 833 pxor mm6, [GLOBAL(t80)] 834 movq [rsi+rax], mm6 835 movq [rsi], mm3 836 837 ; roughly 2/7th difference across boundary 838 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); 839 ; s = vp8_signed_char_clamp(qs1 - u); 840 ; *oq1 = s^0x80; 841 ; s = vp8_signed_char_clamp(ps1 + u); 842 ; *op1 = s^0x80; 843 pxor mm1, mm1 844 pxor mm2, mm2 845 punpcklbw mm1, mm4 846 punpckhbw mm2, mm4 847 pmulhw mm1, [GLOBAL(s18)] 848 pmulhw mm2, 
[GLOBAL(s18)] 849 paddw mm1, [GLOBAL(s63)] 850 paddw mm2, [GLOBAL(s63)] 851 psraw mm1, 7 852 psraw mm2, 7 853 packsswb mm1, mm2 854 855 movq mm3, [rdi] 856 movq mm6, [rsi+rax*2] ; p1 857 858 pxor mm3, [GLOBAL(t80)] 859 pxor mm6, [GLOBAL(t80)] 860 861 paddsb mm6, mm1 862 psubsb mm3, mm1 863 864 pxor mm6, [GLOBAL(t80)] 865 pxor mm3, [GLOBAL(t80)] 866 movq [rdi], mm3 867 movq [rsi+rax*2], mm6 868 869 ; roughly 1/7th difference across boundary 870 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); 871 ; s = vp8_signed_char_clamp(qs2 - u); 872 ; *oq2 = s^0x80; 873 ; s = vp8_signed_char_clamp(ps2 + u); 874 ; *op2 = s^0x80; 875 pxor mm1, mm1 876 pxor mm2, mm2 877 punpcklbw mm1, mm4 878 punpckhbw mm2, mm4 879 pmulhw mm1, [GLOBAL(s9)] 880 pmulhw mm2, [GLOBAL(s9)] 881 paddw mm1, [GLOBAL(s63)] 882 paddw mm2, [GLOBAL(s63)] 883 psraw mm1, 7 884 psraw mm2, 7 885 packsswb mm1, mm2 886 887 888 movq mm6, [rdi+rax*4] 889 neg rax 890 movq mm3, [rdi+rax ] 891 892 pxor mm6, [GLOBAL(t80)] 893 pxor mm3, [GLOBAL(t80)] 894 895 paddsb mm6, mm1 896 psubsb mm3, mm1 897 898 pxor mm6, [GLOBAL(t80)] 899 pxor mm3, [GLOBAL(t80)] 900 movq [rdi+rax ], mm3 901 neg rax 902 movq [rdi+rax*4], mm6 903 904;EARLY_BREAK_OUT: 905 neg rax 906 add rsi,8 907 dec rcx 908 jnz next8_mbh 909 910 add rsp, 32 911 pop rsp 912 ; begin epilog 913 pop rdi 914 pop rsi 915 RESTORE_GOT 916 UNSHADOW_ARGS 917 pop rbp 918 ret 919 920 921;void vp8_mbloop_filter_vertical_edge_mmx 922;( 923; unsigned char *src_ptr, 924; int src_pixel_step, 925; const char *flimit, 926; const char *limit, 927; const char *thresh, 928; int count 929;) 930global sym(vp8_mbloop_filter_vertical_edge_mmx) 931sym(vp8_mbloop_filter_vertical_edge_mmx): 932 push rbp 933 mov rbp, rsp 934 SHADOW_ARGS_TO_STACK 6 935 GET_GOT rbx 936 push rsi 937 push rdi 938 ; end prolog 939 940 ALIGN_STACK 16, rax 941 sub rsp, 96 ; reserve 96 bytes 942 %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; 943 %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; 944 
%define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; 945 946 mov rsi, arg(0) ;src_ptr 947 movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 948 949 lea rsi, [rsi + rax*4 - 4] 950 951 movsxd rcx, dword ptr arg(5) ;count 952next8_mbv: 953 lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing 954 955 ;transpose 956 movq mm0, [rdi+2*rax] ; 77 76 75 74 73 72 71 70 957 movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 958 959 movq mm7, mm6 ; 77 76 75 74 73 72 71 70 960 punpckhbw mm7, mm0 ; 77 67 76 66 75 65 74 64 961 962 punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 963 movq mm0, [rsi+rax] ; 57 56 55 54 53 52 51 50 964 965 movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 966 movq mm5, mm4 ; 47 46 45 44 43 42 41 40 967 968 punpckhbw mm5, mm0 ; 57 47 56 46 55 45 54 44 969 punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 970 971 movq mm3, mm5 ; 57 47 56 46 55 45 54 44 972 punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 973 974 punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 975 movq mm2, mm4 ; 53 43 52 42 51 41 50 40 976 977 punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 978 punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 979 980 neg rax 981 982 movq mm7, [rsi+rax] ; 37 36 35 34 33 32 31 30 983 movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 984 985 movq mm1, mm6 ; 27 26 25 24 23 22 21 20 986 punpckhbw mm6, mm7 ; 37 27 36 36 35 25 34 24 987 988 punpcklbw mm1, mm7 ; 33 23 32 22 31 21 30 20 989 990 movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 991 punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 992 993 movq mm0, mm7 ; 17 07 16 06 15 05 14 04 994 punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 995 996 punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 997 movq mm6, mm7 ; 37 27 17 07 36 26 16 06 998 999 punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 1000 punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 1001 1002 lea rdx, srct 1003 movq mm5, mm6 ; 76 66 56 46 36 26 16 06 1004 1005 movq [rdx+56], mm7 1006 psubusb mm5, mm7 ; q2-q3 1007 1008 1009 movq [rdx+48], mm6 
1010 psubusb mm7, mm6 ; q3-q2 1011 1012 por mm7, mm5; ; mm7=abs (q3-q2) 1013 movq mm5, mm0 ; 35 25 15 05 34 24 14 04 1014 1015 punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 1016 punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 1017 1018 movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 1019 psubusb mm3, mm6 ; q1-q2 1020 1021 psubusb mm6, mm5 ; q2-q1 1022 por mm6, mm3 ; mm6=abs(q2-q1) 1023 1024 movq [rdx+40], mm5 ; save q1 1025 movq [rdx+32], mm0 ; save q0 1026 1027 movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 1028 punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 1029 1030 movq mm0, mm3 ; 13 03 12 02 11 01 10 00 1031 punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 1032 1033 punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 1034 movq mm1, mm0 ; 31 21 11 01 30 20 10 00 1035 1036 punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 1037 punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 1038 1039 movq [rdx], mm0 ; save p3 1040 movq [rdx+8], mm1 ; save p2 1041 1042 movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 1043 psubusb mm2, mm0 ; p2-p3 1044 1045 psubusb mm0, mm1 ; p3-p2 1046 por mm0, mm2 ; mm0=abs(p3-p2) 1047 1048 movq mm2, mm3 ; 33 23 13 03 32 22 12 02 1049 punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 1050 1051 punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 1052 movq [rdx+24], mm3 ; save p0 1053 1054 movq [rdx+16], mm2 ; save p1 1055 movq mm5, mm2 ; mm5 = p1 1056 1057 psubusb mm2, mm1 ; p1-p2 1058 psubusb mm1, mm5 ; p2-p1 1059 1060 por mm1, mm2 ; mm1=abs(p2-p1) 1061 mov rdx, arg(3) ;limit 1062 1063 movq mm4, [rdx] ; mm4 = limit 1064 psubusb mm7, mm4 ; abs(q3-q2) > limit 1065 1066 psubusb mm0, mm4 ; abs(p3-p2) > limit 1067 psubusb mm1, mm4 ; abs(p2-p1) > limit 1068 1069 psubusb mm6, mm4 ; abs(q2-q1) > limit 1070 por mm7, mm6 ; or 1071 1072 por mm0, mm1 ; 1073 por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit 1074 1075 movq mm1, mm5 ; p1 1076 1077 movq mm7, mm3 ; mm3=mm7=p0 1078 psubusb mm7, mm5 ; p0 - p1 1079 1080 
psubusb mm5, mm3 ; p1 - p0 1081 por mm5, mm7 ; abs(p1-p0) 1082 1083 movq t0, mm5 ; save abs(p1-p0) 1084 lea rdx, srct 1085 1086 psubusb mm5, mm4 ; mm5 = abs(p1-p0) > limit 1087 por mm0, mm5 ; mm0=mask 1088 1089 movq mm5, [rdx+32] ; mm5=q0 1090 movq mm7, [rdx+40] ; mm7=q1 1091 1092 movq mm6, mm5 ; mm6=q0 1093 movq mm2, mm7 ; q1 1094 psubusb mm5, mm7 ; q0-q1 1095 1096 psubusb mm7, mm6 ; q1-q0 1097 por mm7, mm5 ; abs(q1-q0) 1098 1099 movq t1, mm7 ; save abs(q1-q0) 1100 psubusb mm7, mm4 ; mm7=abs(q1-q0)> limit 1101 1102 por mm0, mm7 ; mask 1103 1104 movq mm5, mm2 ; q1 1105 psubusb mm5, mm1 ; q1-=p1 1106 psubusb mm1, mm2 ; p1-=q1 1107 por mm5, mm1 ; abs(p1-q1) 1108 pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero 1109 psrlw mm5, 1 ; abs(p1-q1)/2 1110 1111 mov rdx, arg(2) ;flimit ; 1112 1113 movq mm2, [rdx] ;flimit mm2 1114 movq mm1, mm3 ; mm1=mm3=p0 1115 1116 movq mm7, mm6 ; mm7=mm6=q0 1117 psubusb mm1, mm7 ; p0-q0 1118 1119 psubusb mm7, mm3 ; q0-p0 1120 por mm1, mm7 ; abs(q0-p0) 1121 paddusb mm1, mm1 ; abs(q0-p0)*2 1122 paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 1123 1124 paddb mm2, mm2 ; flimit*2 (less than 255) 1125 paddb mm4, mm2 ; flimit * 2 + limit (less than 255) 1126 1127 psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit 1128 por mm1, mm0; ; mask 1129 1130 pxor mm0, mm0 1131 pcmpeqb mm1, mm0 1132 1133 ; calculate high edge variance 1134 mov rdx, arg(4) ;thresh ; get thresh 1135 movq mm7, [rdx] 1136 ; 1137 movq mm4, t0 ; get abs (q1 - q0) 1138 psubusb mm4, mm7 ; abs(q1 - q0) > thresh 1139 1140 movq mm3, t1 ; get abs (p1 - p0) 1141 psubusb mm3, mm7 ; abs(p1 - p0)> thresh 1142 1143 por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh 1144 pcmpeqb mm4, mm0 1145 1146 pcmpeqb mm0, mm0 1147 pxor mm4, mm0 1148 1149 1150 1151 1152 ; start work on filters 1153 lea rdx, srct 1154 1155 ; start work on filters 1156 movq mm2, [rdx+16] ; p1 1157 movq mm7, [rdx+40] ; q1 1158 pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed 
values 1159 pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values 1160 psubsb mm2, mm7 ; p1 - q1 1161 1162 movq mm6, [rdx+24] ; p0 1163 movq mm0, [rdx+32] ; q0 1164 pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values 1165 pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values 1166 1167 movq mm3, mm0 ; q0 1168 psubsb mm0, mm6 ; q0 - p0 1169 paddsb mm2, mm0 ; 1 * (q0 - p0) + (p1 - q1) 1170 paddsb mm2, mm0 ; 2 * (q0 - p0) 1171 paddsb mm2, mm0 ; 3 * (q0 - p0) + (p1 - q1) 1172 pand mm1, mm2 ; mask filter values we don't care about 1173 1174 ; mm1 = vp8_filter, mm4=hev, mm6=ps0, mm3=qs0 1175 movq mm2, mm1 ; vp8_filter 1176 pand mm2, mm4; ; Filter2 = vp8_filter & hev 1177 1178 movq mm5, mm2 ; 1179 paddsb mm5, [GLOBAL(t3)]; 1180 1181 pxor mm0, mm0 ; 0 1182 pxor mm7, mm7 ; 0 1183 1184 punpcklbw mm0, mm5 ; e0f0g0h0 1185 psraw mm0, 11 ; sign extended shift right by 3 1186 punpckhbw mm7, mm5 ; a0b0c0d0 1187 psraw mm7, 11 ; sign extended shift right by 3 1188 packsswb mm0, mm7 ; Filter2 >>=3; 1189 1190 movq mm5, mm0 ; Filter2 1191 1192 paddsb mm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4) 1193 pxor mm0, mm0 ; 0 1194 pxor mm7, mm7 ; 0 1195 1196 punpcklbw mm0, mm2 ; e0f0g0h0 1197 psraw mm0, 11 ; sign extended shift right by 3 1198 punpckhbw mm7, mm2 ; a0b0c0d0 1199 psraw mm7, 11 ; sign extended shift right by 3 1200 packsswb mm0, mm7 ; Filter2 >>=3; 1201 1202 ; mm0= filter2 mm1 = vp8_filter, mm3 =qs0 mm5=s mm4 =hev mm6=ps0 1203 psubsb mm3, mm0 ; qs0 =qs0 - filter1 1204 paddsb mm6, mm5 ; ps0 =ps0 + Fitler2 1205 1206 ; mm1=vp8_filter, mm3=qs0, mm4 =hev mm6=ps0 1207 ; vp8_filter &= ~hev; 1208 ; Filter2 = vp8_filter; 1209 pandn mm4, mm1 ; vp8_filter&=~hev 1210 1211 1212 ; mm3=qs0, mm4=filter2, mm6=ps0 1213 1214 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7); 1215 ; s = vp8_signed_char_clamp(qs0 - u); 1216 ; *oq0 = s^0x80; 1217 ; s = vp8_signed_char_clamp(ps0 + u); 1218 ; *op0 = s^0x80; 1219 pxor mm0, mm0 1220 1221 pxor mm1, mm1 1222 pxor mm2, mm2 
1223 punpcklbw mm1, mm4 1224 punpckhbw mm2, mm4 1225 pmulhw mm1, [GLOBAL(s27)] 1226 pmulhw mm2, [GLOBAL(s27)] 1227 paddw mm1, [GLOBAL(s63)] 1228 paddw mm2, [GLOBAL(s63)] 1229 psraw mm1, 7 1230 psraw mm2, 7 1231 packsswb mm1, mm2 1232 1233 psubsb mm3, mm1 1234 paddsb mm6, mm1 1235 1236 pxor mm3, [GLOBAL(t80)] 1237 pxor mm6, [GLOBAL(t80)] 1238 movq [rdx+24], mm6 1239 movq [rdx+32], mm3 1240 1241 ; roughly 2/7th difference across boundary 1242 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7); 1243 ; s = vp8_signed_char_clamp(qs1 - u); 1244 ; *oq1 = s^0x80; 1245 ; s = vp8_signed_char_clamp(ps1 + u); 1246 ; *op1 = s^0x80; 1247 pxor mm1, mm1 1248 pxor mm2, mm2 1249 punpcklbw mm1, mm4 1250 punpckhbw mm2, mm4 1251 pmulhw mm1, [GLOBAL(s18)] 1252 pmulhw mm2, [GLOBAL(s18)] 1253 paddw mm1, [GLOBAL(s63)] 1254 paddw mm2, [GLOBAL(s63)] 1255 psraw mm1, 7 1256 psraw mm2, 7 1257 packsswb mm1, mm2 1258 1259 movq mm3, [rdx + 40] 1260 movq mm6, [rdx + 16] ; p1 1261 pxor mm3, [GLOBAL(t80)] 1262 pxor mm6, [GLOBAL(t80)] 1263 1264 paddsb mm6, mm1 1265 psubsb mm3, mm1 1266 1267 pxor mm6, [GLOBAL(t80)] 1268 pxor mm3, [GLOBAL(t80)] 1269 movq [rdx + 40], mm3 1270 movq [rdx + 16], mm6 1271 1272 ; roughly 1/7th difference across boundary 1273 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7); 1274 ; s = vp8_signed_char_clamp(qs2 - u); 1275 ; *oq2 = s^0x80; 1276 ; s = vp8_signed_char_clamp(ps2 + u); 1277 ; *op2 = s^0x80; 1278 pxor mm1, mm1 1279 pxor mm2, mm2 1280 punpcklbw mm1, mm4 1281 punpckhbw mm2, mm4 1282 pmulhw mm1, [GLOBAL(s9)] 1283 pmulhw mm2, [GLOBAL(s9)] 1284 paddw mm1, [GLOBAL(s63)] 1285 paddw mm2, [GLOBAL(s63)] 1286 psraw mm1, 7 1287 psraw mm2, 7 1288 packsswb mm1, mm2 1289 1290 movq mm6, [rdx+ 8] 1291 movq mm3, [rdx+48] 1292 1293 pxor mm6, [GLOBAL(t80)] 1294 pxor mm3, [GLOBAL(t80)] 1295 1296 paddsb mm6, mm1 1297 psubsb mm3, mm1 1298 1299 pxor mm6, [GLOBAL(t80)] ; mm6 = 71 61 51 41 31 21 11 01 1300 pxor mm3, [GLOBAL(t80)] ; mm3 = 76 66 56 46 36 26 15 06 1301 1302 ; tranpose and 
write back 1303 movq mm0, [rdx] ; mm0 = 70 60 50 40 30 20 10 00 1304 movq mm1, mm0 ; mm0 = 70 60 50 40 30 20 10 00 1305 1306 punpcklbw mm0, mm6 ; mm0 = 31 30 21 20 11 10 01 00 1307 punpckhbw mm1, mm6 ; mm3 = 71 70 61 60 51 50 41 40 1308 1309 movq mm2, [rdx+16] ; mm2 = 72 62 52 42 32 22 12 02 1310 movq mm6, mm2 ; mm3 = 72 62 52 42 32 22 12 02 1311 1312 punpcklbw mm2, [rdx+24] ; mm2 = 33 32 23 22 13 12 03 02 1313 punpckhbw mm6, [rdx+24] ; mm3 = 73 72 63 62 53 52 43 42 1314 1315 movq mm5, mm0 ; mm5 = 31 30 21 20 11 10 01 00 1316 punpcklwd mm0, mm2 ; mm0 = 13 12 11 10 03 02 01 00 1317 1318 punpckhwd mm5, mm2 ; mm5 = 33 32 31 30 23 22 21 20 1319 movq mm4, mm1 ; mm4 = 71 70 61 60 51 50 41 40 1320 1321 punpcklwd mm1, mm6 ; mm1 = 53 52 51 50 43 42 41 40 1322 punpckhwd mm4, mm6 ; mm4 = 73 72 71 70 63 62 61 60 1323 1324 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 1325 punpcklbw mm2, [rdx+40] ; mm2 = 35 34 25 24 15 14 05 04 1326 1327 movq mm6, mm3 ; mm6 = 76 66 56 46 36 26 15 06 1328 punpcklbw mm6, [rdx+56] ; mm6 = 37 36 27 26 17 16 07 06 1329 1330 movq mm7, mm2 ; mm7 = 35 34 25 24 15 14 05 04 1331 punpcklwd mm2, mm6 ; mm2 = 17 16 15 14 07 06 05 04 1332 1333 punpckhwd mm7, mm6 ; mm7 = 37 36 35 34 27 26 25 24 1334 movq mm6, mm0 ; mm6 = 13 12 11 10 03 02 01 00 1335 1336 punpckldq mm0, mm2 ; mm0 = 07 06 05 04 03 02 01 00 1337 punpckhdq mm6, mm2 ; mm6 = 17 16 15 14 13 12 11 10 1338 1339 movq [rsi+rax*4], mm0 ; write out 1340 movq [rdi+rax*4], mm6 ; write out 1341 1342 movq mm0, mm5 ; mm0 = 33 32 31 30 23 22 21 20 1343 punpckldq mm0, mm7 ; mm0 = 27 26 25 24 23 22 20 20 1344 1345 punpckhdq mm5, mm7 ; mm5 = 37 36 35 34 33 32 31 30 1346 movq [rsi+rax*2], mm0 ; write out 1347 1348 movq [rdi+rax*2], mm5 ; write out 1349 movq mm2, [rdx+32] ; mm2 = 74 64 54 44 34 24 14 04 1350 1351 punpckhbw mm2, [rdx+40] ; mm2 = 75 74 65 64 54 54 45 44 1352 punpckhbw mm3, [rdx+56] ; mm3 = 77 76 67 66 57 56 47 46 1353 1354 movq mm5, mm2 ; mm5 = 75 74 65 64 54 54 45 44 1355 punpcklwd mm2, mm3 ; mm2 
        ; --- tail of the preceding macroblock vertical loop-filter routine
        ; (loop label next8_mbv; its prologue and the first half of its body
        ; appear earlier in the file).  Finish transposing the filtered 8x8
        ; tile back into rows and store it.  Byte comments use "RC" notation:
        ; digit1 = row, digit2 = column.
        punpckhwd   mm5,        mm3             ; mm5 = 77 76 75 74 67 66 65 64
        movq        mm0,        mm1             ; mm0 = 53 52 51 50 43 42 41 40

        movq        mm3,        mm4             ; mm3 = 73 72 71 70 63 62 61 60
        punpckldq   mm0,        mm2             ; mm0 = 47 46 45 44 43 42 41 40

        punpckhdq   mm1,        mm2             ; mm1 = 57 56 55 54 53 52 51 50
        movq        [rsi],      mm0             ; write out row-4 pixels

        movq        [rdi],      mm1             ; write out row-5 pixels
        neg         rax                         ; flip pitch sign for the last two stores

        punpckldq   mm3,        mm5             ; mm3 = 67 66 65 64 63 62 61 60
        punpckhdq   mm4,        mm5             ; mm4 = 77 76 75 74 73 72 71 70

        movq        [rsi+rax*2], mm3            ; write out row-6 pixels
        movq        [rdi+rax*2], mm4            ; write out row-7 pixels

        lea         rsi,        [rsi+rax*8]     ; advance to the next 8-row group
        dec         rcx                         ; rcx = remaining 8-pixel groups (count)

        jnz         next8_mbv

        add         rsp, 96                     ; release the 96-byte scratch area
        pop         rsp                         ; restore pre-alignment rsp saved by ALIGN_STACK
                                                ; NOTE(review): pairing assumed from x86_abi_support.asm -- confirm
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_loop_filter_simple_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; VP8 "simple" loop filter across a horizontal edge.  For each group of 8
; pixel columns it builds a per-byte mask from
;     abs(p0-q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit,
; computes filter = clamp(3*(q0-p0) + (p1-q1)) on the masked lanes, then
; updates q0 -= (filter+4)>>3 and p0 += (filter+3)>>3.
; `thresh` is not read by this routine.  Processes arg(5) = count groups.
global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixel_step (row stride in bytes)

    movsxd      rcx, dword ptr arg(5)   ;count
nexts8_h:
    mov         rdx, arg(3)             ;limit
    movq        mm7, [rdx]
    mov         rdx, arg(2)             ;flimit
    movq        mm3, [rdx]
    paddb       mm3, mm3                ; flimit*2 (less than 255)
    paddb       mm3, mm7                ; flimit * 2 + limit (less than 255)

    mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
    add         rdi, rax
    neg         rax                     ; rax = -pitch: rsi+rax steps upward

    ; calculate mask: all-ones bytes where the edge is flat enough to filter
    movq        mm1, [rsi+2*rax]        ; p1
    movq        mm0, [rdi]              ; q1
    movq        mm2, mm1                ; keep unsigned p1 for the filter stage
    movq        mm7, mm0                ; keep unsigned q1 for the filter stage
    movq        mm4, mm0
    psubusb     mm0, mm1                ; q1-=p1
    psubusb     mm1, mm4                ; p1-=q1
    por         mm1, mm0                ; abs(p1-q1)
    pand        mm1, [GLOBAL(tfe)]      ; clear lsb of each byte so psrlw cannot
                                        ; shift a bit into the neighbouring byte
    psrlw       mm1, 1                  ; abs(p1-q1)/2

    movq        mm5, [rsi+rax]          ; p0
    movq        mm4, [rsi]              ; q0
    movq        mm0, mm4                ; keep unsigned q0 for the filter stage
    movq        mm6, mm5                ; keep unsigned p0 for the filter stage
    psubusb     mm5, mm4                ; p0-=q0
    psubusb     mm4, mm6                ; q0-=p0
    por         mm5, mm4                ; abs(p0 - q0)
    paddusb     mm5, mm5                ; abs(p0-q0)*2
    paddusb     mm5, mm1                ; abs(p0 - q0)*2 + abs(p1-q1)/2

    psubusb     mm5, mm3                ; nonzero where the sum exceeds flimit*2 + limit
    pxor        mm3, mm3
    pcmpeqb     mm5, mm3                ; mm5 = mask: 0xff where the edge gets filtered

    ; start work on filters
    pxor        mm2, [GLOBAL(t80)]      ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]      ; q1 offset to convert to signed values
    psubsb      mm2, mm7                ; p1 - q1

    pxor        mm6, [GLOBAL(t80)]      ; p0: offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]      ; q0: offset to convert to signed values
    movq        mm3, mm0                ; keep signed q0 for the update below
    psubsb      mm0, mm6                ; q0 - p0
    paddsb      mm2, mm0                ; p1 - q1 + 1 * (q0 - p0)
    paddsb      mm2, mm0                ; p1 - q1 + 2 * (q0 - p0)
    paddsb      mm2, mm0                ; p1 - q1 + 3 * (q0 - p0)
    pand        mm5, mm2                ; mask filter values we don't care about

    ; do the +4 side: q0 -= (filter + 4) >> 3
    paddsb      mm5, [GLOBAL(t4)]       ; 3*(q0 - p0) + (p1 - q1) + 4

    ; MMX has no per-byte arithmetic shift, so the signed >>3 is built per
    ; 16-bit lane: low bytes via psllw 8 / psraw 3 / psrlw 8, high bytes via
    ; psraw 11 / psllw 8, then the halves are recombined with por.
    movq        mm0, mm5                ; get a copy of filters
    psllw       mm0, 8                  ; move low bytes to the high half
    psraw       mm0, 3                  ; arithmetic shift right 3 (signed)
    psrlw       mm0, 8                  ; back to the low half, high bytes cleared
    movq        mm1, mm5                ; get a copy of filters
    psraw       mm1, 11                 ; high byte >>3, sign-extended
    psllw       mm1, 8                  ; shift left 8 to put it back

    por         mm0, mm1                ; put the two halves together

    psubsb      mm3, mm0                ; q0 -= (filter+4)>>3
    pxor        mm3, [GLOBAL(t80)]      ; unoffset back to unsigned
    movq        [rsi], mm3              ; write back q0

    ; now do the +3 side: p0 += (filter + 3) >> 3
    psubsb      mm5, [GLOBAL(t1s)]      ; +3 instead of +4

    movq        mm0, mm5                ; get a copy of filters
    psllw       mm0, 8                  ; low bytes to high half
    psraw       mm0, 3                  ; arithmetic shift right 3 (signed)
    psrlw       mm0, 8
    psraw       mm5, 11                 ; high byte >>3, sign-extended
    psllw       mm5, 8                  ; shift left 8 to put it back
    por         mm0, mm5                ; put the two halves together

    paddsb      mm6, mm0                ; p0 += (filter+3)>>3
    pxor        mm6, [GLOBAL(t80)]      ; unoffset back to unsigned
    movq        [rsi+rax], mm6          ; write back p0

    add         rsi, 8                  ; next 8 pixel columns
    neg         rax                     ; restore positive pitch for the next pass
    dec         rcx
    jnz         nexts8_h

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_loop_filter_simple_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
;
; VP8 "simple" loop filter across a vertical edge: loads 8 rows of the four
; columns p1 p0 | q0 q1 around the edge, transposes them into registers,
; applies the same filter as the horizontal variant, and transposes back.
; `thresh` is not read by this routine.
global sym(vp8_loop_filter_simple_vertical_edge_mmx)
sym(vp8_loop_filter_simple_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                 ; reserve 32 bytes of aligned scratch
    %define t0  [rsp + 0]               ;__declspec(align(16)) char t0[8];
    %define t1  [rsp + 16]              ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixel_step (row stride in bytes)
    ; rsi -> row 4, two columns left of the edge (start of p1 in that row);
    ; rows 0..3 are reached with the negated pitch below.
    lea         rsi,        [rsi + rax*4 - 2]
    movsxd      rcx,        dword ptr arg(5)    ;count
nexts8_v:

    ; Gather 4 bytes (p1 p0 q0 q1) from each of 8 rows and transpose so that
    ; mm0..mm3 each hold one column for all 8 rows.  Byte comments use "RC"
    ; notation: digit1 = row, digit2 = column.
    lea         rdi,        [rsi + rax]         ; rdi -> row 5
    movd        mm0,        [rdi + rax * 2]     ; xx xx xx xx 73 72 71 70

    movd        mm6,        [rsi + rax * 2]     ; xx xx xx xx 63 62 61 60
    punpcklbw   mm6,        mm0                 ; 73 63 72 62 71 61 70 60

    movd        mm0,        [rsi + rax]         ; xx xx xx xx 53 52 51 50
    movd        mm4,        [rsi]               ; xx xx xx xx 43 42 41 40

    punpcklbw   mm4,        mm0                 ; 53 43 52 42 51 41 50 40
    movq        mm5,        mm4                 ; 53 43 52 42 51 41 50 40

    punpcklwd   mm4,        mm6                 ; 71 61 51 41 70 60 50 40
    punpckhwd   mm5,        mm6                 ; 73 63 53 43 72 62 52 42

    neg         rax                             ; step upward for rows 0..3

    movd        mm7,        [rsi + rax]         ; xx xx xx xx 33 32 31 30
    movd        mm6,        [rsi + rax * 2]     ; xx xx xx xx 23 22 21 20

    punpcklbw   mm6,        mm7                 ; 33 23 32 22 31 21 30 20
    movd        mm1,        [rdi + rax * 4]     ; xx xx xx xx 13 12 11 10

    movd        mm0,        [rsi + rax * 4]     ; xx xx xx xx 03 02 01 00
    punpcklbw   mm0,        mm1                 ; 13 03 12 02 11 01 10 00

    movq        mm2,        mm0                 ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0,        mm6                 ; 31 21 11 01 30 20 10 00

    punpckhwd   mm2,        mm6                 ; 33 23 13 03 32 22 12 02
    movq        mm1,        mm0                 ; 31 21 11 01 30 20 10 00

    punpckldq   mm0,        mm4                 ; 70 60 50 40 30 20 10 00 = p1
    movq        mm3,        mm2                 ; 33 23 13 03 32 22 12 02

    punpckhdq   mm1,        mm4                 ; 71 61 51 41 31 21 11 01 = p0
    punpckldq   mm2,        mm5                 ; 72 62 52 42 32 22 12 02 = q0

    punpckhdq   mm3,        mm5                 ; 73 63 53 43 33 23 13 03 = q1


    ; calculate mask: all-ones bytes where the edge is flat enough to filter
    movq        mm6,        mm0                 ; p1
    movq        mm7,        mm3                 ; q1
    psubusb     mm7,        mm6                 ; q1-=p1
    psubusb     mm6,        mm3                 ; p1-=q1
    por         mm6,        mm7                 ; abs(p1-q1)
    pand        mm6,        [GLOBAL(tfe)]       ; clear lsb of each byte so psrlw
                                                ; cannot shift into the next byte
    psrlw       mm6,        1                   ; abs(p1-q1)/2

    movq        mm5,        mm1                 ; p0
    movq        mm4,        mm2                 ; q0

    psubusb     mm5,        mm2                 ; p0-=q0
    psubusb     mm4,        mm1                 ; q0-=p0

    por         mm5,        mm4                 ; abs(p0 - q0)
    paddusb     mm5,        mm5                 ; abs(p0-q0)*2
    paddusb     mm5,        mm6                 ; abs(p0 - q0)*2 + abs(p1-q1)/2

    mov         rdx,        arg(2)              ;flimit
    movq        mm7,        [rdx]
    mov         rdx,        arg(3)              ;limit
    movq        mm6,        [rdx]
    paddb       mm7,        mm7                 ; flimit*2 (less than 255)
    paddb       mm7,        mm6                 ; flimit * 2 + limit (less than 255)

    psubusb     mm5,        mm7                 ; nonzero where the sum exceeds flimit*2 + limit
    pxor        mm7,        mm7
    pcmpeqb     mm5,        mm7                 ; mm5 = mask: 0xff where the edge gets filtered

    ; start work on filters; stash the unsigned p1/q1 columns for the
    ; write-back transpose (they are not modified by the simple filter)
    movq        t0,         mm0
    movq        t1,         mm3

    pxor        mm0,        [GLOBAL(t80)]       ; p1 offset to convert to signed values
    pxor        mm3,        [GLOBAL(t80)]       ; q1 offset to convert to signed values

    psubsb      mm0,        mm3                 ; p1 - q1
    movq        mm6,        mm1                 ; p0

    movq        mm7,        mm2                 ; q0
    pxor        mm6,        [GLOBAL(t80)]       ; p0: offset to convert to signed values

    pxor        mm7,        [GLOBAL(t80)]       ; q0: offset to convert to signed values
    movq        mm3,        mm7                 ; keep signed q0 for the update below

    psubsb      mm7,        mm6                 ; q0 - p0
    paddsb      mm0,        mm7                 ; p1 - q1 + 1 * (q0 - p0)

    paddsb      mm0,        mm7                 ; p1 - q1 + 2 * (q0 - p0)
    paddsb      mm0,        mm7                 ; p1 - q1 + 3 * (q0 - p0)

    pand        mm5,        mm0                 ; mask filter values we don't care about

    ; +4 side: q0 -= (filter + 4) >> 3
    paddsb      mm5,        [GLOBAL(t4)]        ; 3*(q0 - p0) + (p1 - q1) + 4

    ; MMX has no per-byte arithmetic shift; build the signed >>3 per 16-bit
    ; lane (low bytes: psllw 8 / psraw 3 / psrlw 8, high bytes: psraw 11 /
    ; psllw 8) and recombine with por.
    movq        mm0,        mm5                 ; get a copy of filters
    psllw       mm0,        8                   ; move low bytes to the high half
    psraw       mm0,        3                   ; arithmetic shift right 3 (signed)
    psrlw       mm0,        8                   ; back to the low half

    movq        mm7,        mm5                 ; get a copy of filters
    psraw       mm7,        11                  ; high byte >>3, sign-extended
    psllw       mm7,        8                   ; shift left 8 to put it back

    por         mm0,        mm7                 ; put the two halves together

    psubsb      mm3,        mm0                 ; q0 -= (filter+4)>>3
    pxor        mm3,        [GLOBAL(t80)]       ; unoffset back to unsigned

    ; now do the +3 side: p0 += (filter + 3) >> 3
    psubsb      mm5,        [GLOBAL(t1s)]       ; +3 instead of +4

    movq        mm0,        mm5                 ; get a copy of filters
    psllw       mm0,        8                   ; low bytes to high half
    psraw       mm0,        3                   ; arithmetic shift right 3 (signed)
    psrlw       mm0,        8

    psraw       mm5,        11                  ; high byte >>3, sign-extended
    psllw       mm5,        8                   ; shift left 8 to put it back
    por         mm0,        mm5                 ; put the two halves together

    paddsb      mm6,        mm0                 ; p0 += (filter+3)>>3
    pxor        mm6,        [GLOBAL(t80)]       ; unoffset back to unsigned


    movq        mm0,        t0                  ; reload unfiltered p1
    movq        mm4,        t1                  ; reload unfiltered q1

    ; mm0 = 70 60 50 40 30 20 10 00  (p1)
    ; mm6 = 71 61 51 41 31 21 11 01  (p0, filtered)
    ; mm3 = 72 62 52 42 32 22 12 02  (q0, filtered)
    ; mm4 = 73 63 53 43 33 23 13 03  (q1)
    ; transpose back to rows and write out 4 bytes per row

    movq        mm1,        mm0
    punpcklbw   mm0,        mm6                 ; 31 30 21 20 11 10 01 00

    punpckhbw   mm1,        mm6                 ; 71 70 61 60 51 50 41 40
    movq        mm2,        mm3

    punpcklbw   mm2,        mm4                 ; 33 32 23 22 13 12 03 02
    movq        mm5,        mm1                 ; 71 70 61 60 51 50 41 40

    punpckhbw   mm3,        mm4                 ; 73 72 63 62 53 52 43 42
    movq        mm6,        mm0                 ; 31 30 21 20 11 10 01 00

    punpcklwd   mm0,        mm2                 ; 13 12 11 10 03 02 01 00
    punpckhwd   mm6,        mm2                 ; 33 32 31 30 23 22 21 20

    movd        [rsi+rax*4], mm0                ; write 03 02 01 00
    punpcklwd   mm1,        mm3                 ; 53 52 51 50 43 42 41 40

    psrlq       mm0,        32                  ; xx xx xx xx 13 12 11 10
    punpckhwd   mm5,        mm3                 ; 73 72 71 70 63 62 61 60

    movd        [rdi+rax*4], mm0                ; write 13 12 11 10
    movd        [rsi+rax*2], mm6                ; write 23 22 21 20

    psrlq       mm6,        32                  ; 33 32 31 30
    movd        [rsi],      mm1                 ; write 43 42 41 40

    movd        [rsi + rax], mm6                ; write 33 32 31 30
    neg         rax                             ; back to positive pitch for rows 6/7

    movd        [rsi + rax*2], mm5              ; write 63 62 61 60
    psrlq       mm1,        32                  ; 53 52 51 50

    movd        [rdi],      mm1                 ; write out 53 52 51 50
    psrlq       mm5,        32                  ; 73 72 71 70

    movd        [rdi + rax*2], mm5              ; write 73 72 71 70

    lea         rsi,        [rsi+rax*8]         ; next 8 rows

    dec         rcx
    jnz         nexts8_v

    add         rsp, 32                         ; release the 32-byte scratch area
    pop         rsp                             ; restore pre-alignment rsp saved by ALIGN_STACK
                                                ; NOTE(review): pairing assumed from x86_abi_support.asm -- confirm
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



; Reference-only sketch of how the simple vertical filter is driven for the
; internal edges of a macroblock (kept as documentation, never assembled):
;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
;                  int y_stride,
;                  loop_filter_info *lfi)
;{
;
;
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;}

SECTION_RODATA
align 16
tfe:                        ; 0xfe per byte: clears each byte's lsb before psrlw
    times 8 db 0xfe         ; so no bit leaks into the neighbouring byte
align 16
t80:                        ; 0x80 per byte: bias for unsigned <-> signed conversion
    times 8 db 0x80
align 16
t1s:                        ; 1 per byte: turns (filter+4) into (filter+3)
    times 8 db 0x01
align 16
t3:                         ; 3 per byte: Filter2 + 3 rounding term
    times 8 db 0x03
align 16
t4:                         ; 4 per byte: Filter1 + 4 rounding term
    times 8 db 0x04
align 16
ones:                       ; word constant 1 (not referenced in this portion of the file)
    times 4 dw 0x0001
align 16
s27:                        ; 27 << 8: pmulhw against (x << 8) yields x*27
    times 4 dw 0x1b00       ; used for the (63 + Filter2*27) >> 7 tap
align 16
s18:                        ; 18 << 8: used for the (63 + Filter2*18) >> 7 tap
    times 4 dw 0x1200
align 16
s9:                         ; 9 << 8: used for the (63 + Filter2*9) >> 7 tap
    times 4 dw 0x0900
align 16
s63:                        ; 63: rounding bias added before the >> 7
    times 4 dw 0x003f