;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; FIRST_2_ROWS
;   First half of the 5-tap deblock kernel used by the deblock functions.
;   In:    xmm0 = current 16 pixels
;          xmm1 = first neighbour, xmm3 = second neighbour
;          flimit = 16 per-pixel threshold bytes (aligned memory operand)
;   Out:   xmm5 = pavgb(neighbour1, neighbour2)
;          xmm7 = 0xFF in every byte where |cur - neighbour1| >= flimit
;                 or |cur - neighbour2| >= flimit, else 0x00
;   Clobb: xmm1 (left zeroed), xmm2, xmm3, xmm4, xmm6
;-----------------------------------------------------------------------
%macro FIRST_2_ROWS 0
        ; rounded average of the two neighbours, taken before they are
        ; overwritten by the difference computation below
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3

        ; xmm4 = |cur - neighbour1| (sum of the two saturated differences)
        movdqa      xmm4,       xmm0
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        paddusb     xmm4,       xmm1

        ; xmm6 = |cur - neighbour2|
        movdqa      xmm6,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm3

        ; flimit -sat |diff| == 0  <=>  |diff| >= flimit
        pxor        xmm1,       xmm1
        movdqa      xmm2,       flimit
        movdqa      xmm7,       xmm2

        psubusb     xmm2,       xmm4
        pcmpeqb     xmm2,       xmm1

        psubusb     xmm7,       xmm6
        pcmpeqb     xmm7,       xmm1

        ; xmm7 = "do not filter" mask accumulated so far
        por         xmm7,       xmm2
%endmacro

;-----------------------------------------------------------------------
; SECOND_2_ROWS
;   Second half of the kernel; must follow FIRST_2_ROWS.
;   In:    xmm0 = current 16 pixels
;          xmm1 = third neighbour, xmm3 = fourth neighbour
;          xmm5, xmm7 = results left by FIRST_2_ROWS
;   Out:   xmm0 = per byte: original pixel where any of the four
;                 differences reached flimit, otherwise the pavgb-chained
;                 average of the four neighbours and the pixel itself
;   Clobb: xmm1 (left zeroed), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
;-----------------------------------------------------------------------
%macro SECOND_2_ROWS 0
        ; keep a copy of neighbour3, then fold both neighbours and the
        ; current pixel into the running average held in xmm5
        movdqa      xmm2,       xmm1
        pavgb       xmm1,       xmm3
        pavgb       xmm5,       xmm1
        pavgb       xmm5,       xmm0

        ; xmm6 = |cur - neighbour3|
        movdqa      xmm6,       xmm0
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        paddusb     xmm6,       xmm2

        ; xmm4 = |cur - neighbour4|
        movdqa      xmm4,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm3

        ; extend the "do not filter" mask with the two new differences
        pxor        xmm1,       xmm1
        movdqa      xmm2,       flimit
        movdqa      xmm3,       xmm2

        psubusb     xmm2,       xmm6
        pcmpeqb     xmm2,       xmm1
        por         xmm7,       xmm2

        psubusb     xmm3,       xmm4
        pcmpeqb     xmm3,       xmm1
        por         xmm7,       xmm3

        ; select: original pixel under the mask, filtered value elsewhere.
        ; The two operands are byte-wise disjoint, so the add acts as an OR.
        pand        xmm0,       xmm7
        pandn       xmm7,       xmm5
        paddusb     xmm0,       xmm7
%endmacro

;-----------------------------------------------------------------------
; UPDATE_FLIMIT
;   Copies the next 16 threshold bytes from [rbx] to the stack slot at
;   [rsp] and advances rbx by 16.
;   Clobb: xmm2, rbx, flags
;-----------------------------------------------------------------------
%macro UPDATE_FLIMIT 0
        movdqu      xmm2,       XMMWORD PTR [rbx]
        add         rbx,        16
        movdqu      [rsp],      xmm2
%endmacro

SECTION .text

;void vpx_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;-----------------------------------------------------------------------
; vpx_post_proc_down_and_across_mb_row_sse2
;   For each of `size` rows: filters 16 pixels at a time vertically from
;   src into dst, then filters the freshly written dst row horizontally
;   in place.
;   Args (via the arg(n) shadow slots):
;     arg(0) src_ptr   arg(1) dst_ptr   arg(2) src_pixels_per_line
;     arg(3) dst_pixels_per_line   arg(4) cols   arg(5) flimits
;     arg(6) size (row count)
;   Registers: rsi = src, rdi = dst, rax = src stride, rcx = rows left,
;              rdx = column offset, rbx = flimits cursor
;   Stack:     16 aligned bytes at [rsp] hold the active thresholds
;   NOTE(review): mm0/mm1 are used and no emms is issued in this
;   function - presumably the caller clears MMX state; confirm.
;-----------------------------------------------------------------------
global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vpx_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16

        ; put flimit on stack
        mov         rbx,        arg(5)              ;flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)              ;src_ptr
        mov         rdi,        arg(1)              ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2)    ;src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6)    ;rows in a macroblock
.nextrow:
        xor         rdx,        rdx                 ;col
.nextcol:
        ;load current and next 2 rows
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows (rax temporarily holds -stride)
        neg         rax
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                             ; positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)        ;cols
        jge         .downdone
        UPDATE_FLIMIT
        jmp         .nextcol

.downdone:
        ; done with all the cols, start the across filtering in place
        sub         rsi,        rdx                 ; rewind to row start
        sub         rdi,        rdx

        mov         rbx,        arg(5)              ; restart flimits
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,        [rdi]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        mov         rdx,        -8
        movq        [rdi+rdx],  mm1

        ; dup the last byte into the right border
        movsxd      rdx,        dword arg(4)
        movq        mm1,        [rdi + rdx + -1]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        movq        [rdi+rdx],  mm1

        ; mm0/mm1 carry the 16 bytes left of the current column so the
        ; in-place stores can lag one iteration behind the loads
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16]
        movq        mm1,        QWORD PTR [rdi-8]

.acrossnextcol:
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        movq        QWORD PTR [rdi+rdx-16], mm0     ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8],  mm1     ; store previous 8 bytes
        movdq2q     mm0,        xmm0                ; hold the new low 8 bytes
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0                ; hold the new high 8 bytes

        add         rdx,        16
        cmp         edx,        dword arg(4)        ;cols
        jge         .acrossdone
        UPDATE_FLIMIT
        jmp         .acrossnextcol

.acrossdone:
        ; last 16 pixels
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the upper 8 bytes are written only when rdx landed exactly on cols
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
        ; done with this row
        add         rsi,        rax                 ;next src line
        ; Strides are sign-extended here exactly as at function entry so
        ; every row advances by the same pointer-width value.
        movsxd      rax,        dword arg(3)        ;dst_pixels_per_line
        add         rdi,        rax                 ;next destination
        movsxd      rax,        dword arg(2)        ;src_pixels_per_line

        mov         rbx,        arg(5)              ;flimits
        UPDATE_FLIMIT

        dec         rcx                             ;decrement count
        jnz         .nextrow                        ;next row

    add         rsp, 16
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit

;-----------------------------------------------------------------------
;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
;                               int pitch, int rows, int cols,int flimit)
;
;   Processes the image in strips of 8 columns. For each strip a sliding
;   window of 15 rows supplies a running sum (xmm5, 8 words) and a
;   running sum of squares (xmm6/xmm7, 8 dwords). Where
;   15*sumsq - sum*sum < flimit the pixel is replaced by
;   (pixel + sum + vpx_rv[...]) >> 4. Results are parked in a 16-entry
;   ring buffer on the stack and written back 8 rows late.
;   Stack:  [rsp .. rsp+127] = unsigned char d[16][8], [rsp+128] = flimit x4
;   NOTE(review): mm0 is used and no emms is issued in this function -
;   presumably the caller clears MMX state; confirm.
;-----------------------------------------------------------------------
extern sym(vpx_rv)
global sym(vpx_mbpost_proc_down_sse2) PRIVATE
sym(vpx_mbpost_proc_down_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4)               ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vpx_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
            mov         rsi,        arg(0)          ; s
            pxor        xmm0,       xmm0            ; zero, used for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch

            ; this copies the last row down into the border 8 rows
            mov         rdi,        rsi
            ; rows is an int: load exactly 32 bits and sign-extend instead
            ; of reading the whole pointer-width shadow slot
            movsxd      rdx,        dword arg(2)
            sub         rdx,        9               ; (rows+8)-9 = index of last row
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        xmm1,       QWORD ptr[rdi]  ; last row
            mov         rcx,        8
.init_borderd:                                      ; initialize bottom border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_borderd

            neg         rax                         ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        xmm1,       QWORD ptr[rdi]  ; first row
            mov         rcx,        8
.init_border:                                       ; initialize top border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_border



            lea         rsi,        [rsi + rax*8]   ; rsi = s[-pitch*8]
            neg         rax                         ; rax = +pitch

            pxor        xmm5,       xmm5            ; sum   (8 words)
            pxor        xmm6,       xmm6            ; sumsq (dwords 0..3)

            pxor        xmm7,       xmm7            ; sumsq (dwords 4..7)
            mov         rdi,        rsi

            mov         rcx,        15              ; prime the window with 15 rows

.loop_initvar:
            movq        xmm1,       QWORD PTR [rdi]
            punpcklbw   xmm1,       xmm0

            paddw       xmm5,       xmm1
            pmullw      xmm1,       xmm1

            movdqa      xmm2,       xmm1
            punpcklwd   xmm1,       xmm0

            punpckhwd   xmm2,       xmm0
            paddd       xmm6,       xmm1

            paddd       xmm7,       xmm2
            lea         rdi,        [rdi+rax]

            dec         rcx
            jne         .loop_initvar
            ;save the var and sum
            xor         rdx,        rdx             ; row counter
.loop_row:
            movq        xmm1,       QWORD PTR [rsi] ; [s-pitch*8], leaves the window
            movq        xmm2,       QWORD PTR [rdi] ; [s+pitch*7], enters the window

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            paddw       xmm5,       xmm2
            psubw       xmm5,       xmm1

            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            ; xmm3 = 15 * sumsq (low four), computed as (x << 4) - x
            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6
            movdqa      xmm1,       xmm5

            ; sum*sum as 32-bit values: low halves in xmm1, high in xmm4
            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1

            pmulhw      xmm4,       xmm4
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4
            punpckhwd   xmm2,       xmm4

            ; xmm4 = 15 * sumsq (high four)
            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            ; sign bit -> all-ones mask where 15*sumsq - sum*sum < flimit
            psrad       xmm3,       31
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0

            movq        xmm1,       QWORD PTR [rsi+rax*8] ; row being filtered

            movq        xmm2,       xmm1            ; keep the unfiltered bytes
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5
            mov         rcx,        rdx

            and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vpx_rv))]
            movdqu      xmm4,       [rax + rcx*2]   ;vpx_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2]    ;vpx_rv[rcx*2]
%else
            movdqu      xmm4,       [sym(vpx_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            ; blend: filtered where the mask is set, original elsewhere
            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3

            pandn       xmm3,       xmm2
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]

            ; nothing to write back until 8 rows have been buffered
            cmp         edx,        8
            jl          .skip_assignment

            mov         rcx,        rdx
            sub         rcx,        8
            and         rcx,        15
            movq        mm0,        [rsp + rcx*8]   ;d[rcx*8]
            movq        [rsi],      mm0

.skip_assignment:
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2)    ;rows
            jl          .loop_row

        ; s += 8, done at pointer width so a carry out of the low 32 bits
        ; is not lost (rax is reloaded with pitch at .loop_col)
        mov         rax,        8
        add         arg(0),     rax
        sub         dword arg(3), 8                 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------
;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
;                                    int pitch, int rows, int cols,int flimit)
;
;   Horizontal counterpart of the filter above, run in place on each
;   row. A 15-pixel window (s[i-8] .. s[i+6]) supplies a running sum and
;   sum of squares; four pixels are handled per iteration. Where
;   15*sumsq - sum*sum < flimit the pixel becomes (pixel + sum + 8) >> 4.
;   Output is held in mm0/mm1 and written 8 bytes behind the read
;   position.
;   Stack:  [rsp] = flimit replicated into four dwords
;   NOTE(review): mm0/mm1 are used and no emms is issued in this
;   function - presumably the caller clears MMX state; confirm.
;-----------------------------------------------------------------------
global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
sym(vpx_mbpost_proc_across_ip_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4)               ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,        rdx                 ;sumsq=0;
        xor         rcx,        rcx                 ;sum=0;
        mov         rsi,        arg(0)              ; s


        ; dup the first byte into the left border 8 times
        movq        mm1,        [rsi]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1

        mov         rdi,        -8
        movq        [rsi+rdi],  mm1

        ; dup the last byte into the right border.
        ; rax is the index here so that rdx keeps the zeroed sumsq for
        ; the accumulation loop below.
        movsxd      rax,        dword arg(3)
        movq        mm1,        [rsi + rax + -1]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        movq        [rsi+rax],  mm1

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;   sumsq += s[i]*s[i];
        ;   sum   += s[i];
        ;}
        movzx       eax,        byte [rsi+rdi]
        add         ecx,        eax
        mul         al                              ; ax = s[i]*s[i]
        add         edx,        eax
        add         rdi,        1
        cmp         rdi,        6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,       edx

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,       ecx

            mov         rsi,        arg(0)          ;s
            xor         rcx,        rcx             ; column offset

            movsxd      rdx,        dword arg(3)    ;cols
            add         rdx,        8               ; run 8 past the end to flush the delayed stores
            pxor        mm0,        mm0
            pxor        mm1,        mm1

            pxor        xmm0,       xmm0
.nextcol4:

            movd        xmm1,       DWORD PTR [rsi+rcx-8]   ; a = s[-8..-5], leaving the window
            movd        xmm2,       DWORD PTR [rsi+rcx+7]   ; b = s[+7..+10], entering the window

            punpcklbw   xmm1,       xmm0            ; expanding
            punpcklbw   xmm2,       xmm0            ; expanding

            punpcklwd   xmm1,       xmm0            ; expanding to dwords
            punpcklwd   xmm2,       xmm0            ; expanding to dwords

            psubd       xmm2,       xmm1            ; b - a   (sum delta per step)
            paddd       xmm1,       xmm1            ; 2a

            paddd       xmm1,       xmm2            ; a + b
            pmaddwd     xmm1,       xmm2            ; (a+b)*(b-a) = b*b - a*a (sumsq delta)

            ; lane 0 takes the first delta, then is broadcast to all lanes
            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm1

            pshufd      xmm6,       xmm6,   0
            pshufd      xmm7,       xmm7,   0

            ; shift the first delta out; lanes (low->high) = d1 d2 d3 0
            psrldq      xmm1,       4
            psrldq      xmm2,       4

            pshufd      xmm3,       xmm1,   3       ; lanes (low->high) = 0  d1 d1 d1
            pshufd      xmm4,       xmm2,   3

            paddd       xmm6,       xmm4
            paddd       xmm7,       xmm3

            pshufd      xmm3,       xmm1,   01011111b ; lanes (low->high) = 0  0  d2 d2
            pshufd      xmm4,       xmm2,   01011111b

            paddd       xmm7,       xmm3
            paddd       xmm6,       xmm4

            pshufd      xmm3,       xmm1,   10111111b ; lanes (low->high) = 0  0  0  d3
            pshufd      xmm4,       xmm2,   10111111b

            paddd       xmm7,       xmm3
            paddd       xmm6,       xmm4

            ; xmm6/xmm7 now hold sum/sumsq for each of the four pixels

            movdqa      xmm3,       xmm6
            pmaddwd     xmm3,       xmm3            ; sum*sum

            movdqa      xmm5,       xmm7
            pslld       xmm5,       4

            psubd       xmm5,       xmm7            ; 15*sumsq
            psubd       xmm5,       xmm3

            psubd       xmm5,       flimit4
            psrad       xmm5,       31              ; all-ones where below flimit

            packssdw    xmm5,       xmm0
            packsswb    xmm5,       xmm0

            movd        xmm1,       DWORD PTR [rsi+rcx]
            movq        xmm2,       xmm1            ; keep the unfiltered bytes

            punpcklbw   xmm1,       xmm0
            punpcklwd   xmm1,       xmm0

            paddd       xmm1,       xmm6
            paddd       xmm1,       [GLOBAL(four8s)]

            psrad       xmm1,       4
            packssdw    xmm1,       xmm0

            ; blend: filtered where the mask is set, original elsewhere
            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm5

            pandn       xmm5,       xmm2
            por         xmm5,       xmm1

            ; two-stage delay line: the value stored was computed two
            ; iterations (8 bytes) ago
            movd        [rsi+rcx-8], mm0
            movq        mm0,        mm1

            movdq2q     mm1,        xmm5
            psrldq      xmm7,       12              ; carry lane 3 sumsq into lane 0

            psrldq      xmm6,       12              ; carry lane 3 sum into lane 0
            add         rcx,        4

            cmp         rcx,        rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd      rax,        dword arg(1)
        add         arg(0),     rax

        sub         dword arg(2), 1                 ;rows-=1
        cmp         dword arg(2), 0
        jg          .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


SECTION_RODATA
align 16
four8s:
    times 4 dd 8