;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Syntax: NASM/YASM (Intel operand order).  Argument access, symbol
; decoration and GOT handling all go through the macros supplied by the
; include above (sym, arg, GLOBAL, SHADOW_ARGS_TO_STACK, GET_GOT, ...).
;
; Every routine in this file handles TWO 4x4 blocks per call.  Block 0
; occupies the low four words of each xmm register and block 1 the high
; four words, so each 8-byte store writes one row of both blocks.
;
; NOTE(review): these routines write xmm6 and/or xmm7 without saving them.
; Under the Microsoft x64 ABI xmm6-xmm15 are callee-saved - confirm whether
; x86_abi_support.asm provides a save/restore macro that should be used here.

;void idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   int blk_stride      - 5
; )
;
; DC-only path.  For each of the two blocks:
;     dc = (qcoeff[first] * dequant[0] + 4) >> 3
; where block 1's first coefficient lives 32 bytes after block 0's.  dc is
; added to four rows of predictor bytes (rows are blk_stride apart), the
; sums are saturated to unsigned bytes and written to dst (rows are
; dst_stride apart).  The coefficients that were read are zeroed.
;
; Clobbers: rax, rcx, rdx, xmm0-xmm5, xmm7.

global sym(idct_dequant_0_2x_sse2)
sym(idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    ; end prolog

    mov         rdx,        arg(1)      ; dequant
    mov         rax,        arg(0)      ; qcoeff

    ; Zero out xmm7, for use unpacking
    pxor        xmm7,       xmm7

    ; word 0 <- block 0 first coeff / dequant[0]
    movd        xmm4,       [rax]
    movd        xmm5,       [rdx]

    ; word 4 <- block 1 first coeff / dequant[0] again
    pinsrw      xmm4,       [rax+32],   4
    pinsrw      xmm5,       [rdx],      4

    pmullw      xmm4,       xmm5

    ; clear coeffs (4 bytes at the start of each block)
    movd        [rax],      xmm7
    movd        [rax+32],   xmm7

    ; Broadcast word 0 over the low half and word 4 over the high half.
    ; (The original note here read only "pshufb" - presumably a reminder
    ; that an SSSE3 pshufb could do both shuffles at once.)
    pshuflw     xmm4,       xmm4,       00000000b
    pshufhw     xmm4,       xmm4,       00000000b

    mov         rax,        arg(2)      ; pre
    paddw       xmm4,       [GLOBAL(fours)]

    movsxd      rcx,        dword ptr arg(5) ; blk_stride
    psraw       xmm4,       3           ; (dc + 4) >> 3

    ; Load four predictor rows, 8 bytes each
    movq        xmm0,       [rax]
    movq        xmm1,       [rax+rcx]
    movq        xmm2,       [rax+2*rcx]
    lea         rcx,        [3*rcx]
    movq        xmm3,       [rax+rcx]

    ; bytes -> words
    punpcklbw   xmm0,       xmm7
    punpcklbw   xmm1,       xmm7
    punpcklbw   xmm2,       xmm7
    punpcklbw   xmm3,       xmm7

    mov         rax,        arg(3)      ; dst
    movsxd      rdx,        dword ptr arg(4) ; dst_stride

    ; Add to predict buffer
    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm4
    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm4

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0,       xmm7
    packuswb    xmm1,       xmm7
    packuswb    xmm2,       xmm7
    packuswb    xmm3,       xmm7

    ; store blocks back out
    movq        [rax],      xmm0
    movq        [rax + rdx], xmm1

    lea         rax,        [rax + 2*rdx]

    movq        [rax],      xmm2
    movq        [rax + rdx], xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;idct_dequant_full_2x_sse2
; Arguments as referenced by the code below (the C types are assumed to
; match idct_dequant_0_2x_sse2 - TODO confirm against the C prototype):
;   qcoeff      - 0
;   dequant     - 1
;   pre         - 2
;   dst         - 3
;   dst_stride  - 4
;   blk_stride  - 5
;
; Full path: dequantizes all 16 coefficients of two blocks (block 1 starts
; 32 bytes after block 0), runs the two-pass 4x4 inverse transform on both
; blocks at once, adds the result to the predictor rows (blk_stride apart),
; saturates to bytes and stores to dst (rows dst_stride apart).  The 64
; bytes of coefficients are zeroed.
;
; qcoeff and dequant must be 16-byte aligned: they are accessed with movdqa
; and as pmullw memory operands.
;
; Clobbers: rax, rcx, rdx, xmm0-xmm7 (rsi/rdi are saved and restored).

global sym(idct_dequant_full_2x_sse2)
sym(idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6              ; only arg(0)..arg(5) are referenced
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)      ; qcoeff
    mov         rsi,        arg(2)      ; pre
    mov         rdi,        arg(3)      ; dst
    movsxd      rcx,        dword ptr arg(5) ; blk_stride

    ; Zero out xmm7, for use unpacking
    pxor        xmm7,       xmm7

    mov         rdx,        arg(1)      ; dequant

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    movdqa      xmm0,       [rax]       ; block 0, coeffs 0-7
    movdqa      xmm2,       [rax+16]    ; block 0, coeffs 8-15
    movdqa      xmm1,       [rax+32]    ; block 1, coeffs 0-7
    movdqa      xmm3,       [rax+48]    ; block 1, coeffs 8-15

    ; Clear out coeffs
    movdqa      [rax],      xmm7
    movdqa      [rax+16],   xmm7
    movdqa      [rax+32],   xmm7
    movdqa      [rax+48],   xmm7

    ; dequantize qcoeff buffer
    pmullw      xmm0,       [rdx]
    pmullw      xmm2,       [rdx+16]
    pmullw      xmm1,       [rdx]
    pmullw      xmm3,       [rdx+16]

    ; repack so block 0 row x and block 1 row x are together
    ; (afterwards xmm0..xmm3 hold rows 0..3: block 0 low, block 1 high)
    movdqa      xmm4,       xmm0
    punpckldq   xmm0,       xmm1
    punpckhdq   xmm4,       xmm1

    pshufd      xmm0,       xmm0,       11011000b
    pshufd      xmm1,       xmm4,       11011000b

    movdqa      xmm4,       xmm2
    punpckldq   xmm2,       xmm3
    punpckhdq   xmm4,       xmm3

    pshufd      xmm2,       xmm2,       11011000b
    pshufd      xmm3,       xmm4,       11011000b

    ; first pass
    psubw       xmm0,       xmm2        ; b1 = 0-2
    paddw       xmm2,       xmm2        ; 2 * ip2

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0        ; a1 = 0+2

    pmulhw      xmm5,       [GLOBAL(x_s1sqr2)]
    paddw       xmm5,       xmm1        ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,       xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5        ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,       xmm1        ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [GLOBAL(x_s1sqr2)]
    paddw       xmm3,       xmm4        ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5        ; d1
    movdqa      xmm6,       xmm2        ; a1

    movdqa      xmm4,       xmm0        ; b1
    paddw       xmm2,       xmm3        ; a1 + d1 -> row 0

    paddw       xmm4,       xmm7        ; b1 + c1 -> used below as row 2
    psubw       xmm0,       xmm7        ; b1 - c1 -> used below as row 1

    psubw       xmm6,       xmm3        ; a1 - d1 -> row 3

    ; transpose for the second pass
    movdqa      xmm7,       xmm2        ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0        ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0        ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4        ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6        ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6        ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,       xmm2        ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4        ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4        ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7        ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5        ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5        ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,       xmm2        ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7        ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7        ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1        ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6        ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6        ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b

    ; second pass
    psubw       xmm0,       xmm2        ; b1 = 0-2
    paddw       xmm2,       xmm2

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0        ; a1 = 0+2

    pmulhw      xmm5,       [GLOBAL(x_s1sqr2)]
    paddw       xmm5,       xmm1        ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,       xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5        ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,       xmm1        ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [GLOBAL(x_s1sqr2)]
    paddw       xmm3,       xmm4        ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5        ; d1

    ; The rounding term goes into a1 and b1 so that all four outputs
    ; below carry the +4 before the >> 3.
    paddw       xmm0,       [GLOBAL(fours)]

    paddw       xmm2,       [GLOBAL(fours)]
    movdqa      xmm6,       xmm2        ; a1

    movdqa      xmm4,       xmm0        ; b1
    paddw       xmm2,       xmm3        ; a1 + d1 -> row 0

    paddw       xmm4,       xmm7        ; b1 + c1 -> used below as row 2
    psubw       xmm0,       xmm7        ; b1 - c1 -> used below as row 1

    psubw       xmm6,       xmm3        ; a1 - d1 -> row 3
    psraw       xmm2,       3

    psraw       xmm0,       3
    psraw       xmm4,       3

    psraw       xmm6,       3

    ; transpose to save
    movdqa      xmm7,       xmm2        ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0        ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0        ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4        ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6        ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6        ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,       xmm2        ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4        ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4        ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7        ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5        ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5        ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,       xmm2        ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7        ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7        ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1        ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6        ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6        ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b

    ; xmm7 was used as scratch above; zero it again for unpacking
    pxor        xmm7,       xmm7

    ; Load up predict blocks
    movq        xmm4,       [rsi]
    movq        xmm5,       [rsi+rcx]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm5

    movq        xmm4,       [rsi+2*rcx]
    lea         rcx,        [3*rcx]
    movq        xmm5,       [rsi+rcx]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm5

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0,       xmm7
    packuswb    xmm1,       xmm7
    packuswb    xmm2,       xmm7
    packuswb    xmm3,       xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx,        dword ptr arg(4) ; dst_stride

    ; store blocks back out
    movq        [rdi],      xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *pre  - 2
;   unsigned char *dst  - 3
;   int dst_stride      - 4
;   short *dc           - 5
; )
;
; Special case when 2 blocks have 0 or 1 coeffs: the DC values are read
; from dc[0] (block 0) and dc[1] (block 1), so neither qcoeff nor dequant
; is read.  Computes (dc + 4) >> 3 per block, adds it to four predictor
; rows (fixed 16 bytes apart), saturates to bytes and stores to dst (rows
; dst_stride apart).
;
; Clobbers: rdx, xmm0-xmm4, xmm7 (rsi/rdi are saved and restored).

global sym(idct_dequant_dc_0_2x_sse2)
sym(idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6              ; only arg(2)..arg(5) are referenced
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(2)      ; pre
    mov         rdi,        arg(3)      ; dst
    mov         rdx,        arg(5)      ; dc

    ; Zero out xmm7, for use unpacking
    pxor        xmm7,       xmm7

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4,       [rdx]

    ; Load up predict blocks
    movq        xmm0,       [rsi]
    movq        xmm1,       [rsi+16]
    movq        xmm2,       [rsi+32]
    movq        xmm3,       [rsi+48]

    ; Duplicate and expand dc across:
    ; low four words = dc[0], high four words = dc[1]
    punpcklwd   xmm4,       xmm4
    punpckldq   xmm4,       xmm4

    ; Rounding to dequant and downshift
    paddw       xmm4,       [GLOBAL(fours)]
    psraw       xmm4,       3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0,       xmm7
    punpcklbw   xmm1,       xmm7
    punpcklbw   xmm2,       xmm7
    punpcklbw   xmm3,       xmm7

    ; Add to predict buffer
    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm4
    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm4

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0,       xmm7
    packuswb    xmm1,       xmm7
    packuswb    xmm2,       xmm7
    packuswb    xmm3,       xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx,        dword ptr arg(4) ; dst_stride

    ; store blocks back out
    movq        [rdi],      xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm2
    movq        [rdi + rdx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;idct_dequant_dc_full_2x_sse2
; Arguments as referenced by the code below (the C types are assumed to
; match idct_dequant_dc_0_2x_sse2 - TODO confirm against the C prototype):
;   qcoeff      - 0
;   dequant     - 1
;   pre         - 2
;   dst         - 3
;   dst_stride  - 4
;   dc          - 5
;
; Full path with externally supplied DC: dequantizes all 16 coefficients
; of two blocks, then overwrites each block's first coefficient with dc[0]
; (block 0) / dc[1] (block 1) before running the two-pass 4x4 inverse
; transform.  The result is added to four predictor rows (fixed 16 bytes
; apart), saturated to bytes and stored to dst (rows dst_stride apart).
; The 64 bytes of coefficients are zeroed.
;
; qcoeff and dequant must be 16-byte aligned: they are accessed with movdqa
; and as pmullw memory operands.
;
; Clobbers: rax, rdx, xmm0-xmm7 (rsi/rdi are saved and restored).

global sym(idct_dequant_dc_full_2x_sse2)
sym(idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6              ; only arg(0)..arg(5) are referenced
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)      ; qcoeff
    mov         rsi,        arg(2)      ; pre
    mov         rdi,        arg(3)      ; dst

    ; Zero out xmm7, for use unpacking
    pxor        xmm7,       xmm7

    mov         rdx,        arg(1)      ; dequant

    ; note the transpose of xmm1 and xmm2, necessary for shuffle
    ;   to spit out sensible data
    movdqa      xmm0,       [rax]       ; block 0, coeffs 0-7
    movdqa      xmm2,       [rax+16]    ; block 0, coeffs 8-15
    movdqa      xmm1,       [rax+32]    ; block 1, coeffs 0-7
    movdqa      xmm3,       [rax+48]    ; block 1, coeffs 8-15

    ; Clear out coeffs
    movdqa      [rax],      xmm7
    movdqa      [rax+16],   xmm7
    movdqa      [rax+32],   xmm7
    movdqa      [rax+48],   xmm7

    ; dequantize qcoeff buffer
    pmullw      xmm0,       [rdx]
    pmullw      xmm2,       [rdx+16]
    pmullw      xmm1,       [rdx]
    pmullw      xmm3,       [rdx+16]

    ; DC component
    mov         rdx,        arg(5)      ; dc

    ; repack so block 0 row x and block 1 row x are together
    ; (afterwards xmm0..xmm3 hold rows 0..3: block 0 low, block 1 high)
    movdqa      xmm4,       xmm0
    punpckldq   xmm0,       xmm1
    punpckhdq   xmm4,       xmm1

    pshufd      xmm0,       xmm0,       11011000b
    pshufd      xmm1,       xmm4,       11011000b

    movdqa      xmm4,       xmm2
    punpckldq   xmm2,       xmm3
    punpckhdq   xmm4,       xmm3

    pshufd      xmm2,       xmm2,       11011000b
    pshufd      xmm3,       xmm4,       11011000b

    ; insert DC component (replaces the first coefficient of each block)
    pinsrw      xmm0,       [rdx],      0
    pinsrw      xmm0,       [rdx+2],    4

    ; first pass
    psubw       xmm0,       xmm2        ; b1 = 0-2
    paddw       xmm2,       xmm2        ; 2 * ip2

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0        ; a1 = 0+2

    pmulhw      xmm5,       [GLOBAL(x_s1sqr2)]
    paddw       xmm5,       xmm1        ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,       xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5        ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,       xmm1        ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [GLOBAL(x_s1sqr2)]
    paddw       xmm3,       xmm4        ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5        ; d1
    movdqa      xmm6,       xmm2        ; a1

    movdqa      xmm4,       xmm0        ; b1
    paddw       xmm2,       xmm3        ; a1 + d1 -> row 0

    paddw       xmm4,       xmm7        ; b1 + c1 -> used below as row 2
    psubw       xmm0,       xmm7        ; b1 - c1 -> used below as row 1

    psubw       xmm6,       xmm3        ; a1 - d1 -> row 3

    ; transpose for the second pass
    movdqa      xmm7,       xmm2        ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0        ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0        ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4        ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6        ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6        ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,       xmm2        ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4        ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4        ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7        ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5        ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5        ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,       xmm2        ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7        ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7        ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1        ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6        ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6        ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b

    ; second pass
    psubw       xmm0,       xmm2        ; b1 = 0-2
    paddw       xmm2,       xmm2

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0        ; a1 = 0+2

    pmulhw      xmm5,       [GLOBAL(x_s1sqr2)]
    paddw       xmm5,       xmm1        ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,       xmm3        ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5        ; c1 = ip3*cos - ip1*sin

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,       xmm1        ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [GLOBAL(x_s1sqr2)]
    paddw       xmm3,       xmm4        ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5        ; d1

    ; The rounding term goes into a1 and b1 so that all four outputs
    ; below carry the +4 before the >> 3.
    paddw       xmm0,       [GLOBAL(fours)]

    paddw       xmm2,       [GLOBAL(fours)]
    movdqa      xmm6,       xmm2        ; a1

    movdqa      xmm4,       xmm0        ; b1
    paddw       xmm2,       xmm3        ; a1 + d1 -> row 0

    paddw       xmm4,       xmm7        ; b1 + c1 -> used below as row 2
    psubw       xmm0,       xmm7        ; b1 - c1 -> used below as row 1

    psubw       xmm6,       xmm3        ; a1 - d1 -> row 3
    psraw       xmm2,       3

    psraw       xmm0,       3
    psraw       xmm4,       3

    psraw       xmm6,       3

    ; transpose to save
    movdqa      xmm7,       xmm2        ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0        ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0        ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4        ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6        ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6        ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,       xmm2        ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4        ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4        ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7        ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5        ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5        ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,       xmm2        ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7        ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7        ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1        ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6        ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6        ; 115 111 015 011 107 103 007 003

    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b

    ; xmm7 was used as scratch above; zero it again for unpacking
    pxor        xmm7,       xmm7

    ; Load up predict blocks
    movq        xmm4,       [rsi]
    movq        xmm5,       [rsi+16]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm5

    movq        xmm4,       [rsi+32]
    movq        xmm5,       [rsi+48]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm5

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0,       xmm7
    packuswb    xmm1,       xmm7
    packuswb    xmm2,       xmm7
    packuswb    xmm3,       xmm7

    ; Load destination stride before writing out,
    ;   doesn't need to persist
    movsxd      rdx,        dword ptr arg(4) ; dst_stride

    ; store blocks back out
    movq        [rdi],      xmm0
    movq        [rdi + rdx], xmm1

    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm2
    movq        [rdi + rdx], xmm3


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Rounding term added before the final arithmetic shift right by 3.
align 16
fours:
    times 8 dw 0x0004
; sin(pi/8) * sqrt(2) in 1/65536 units.  0x8A8C is negative when read as a
; signed word, so pmulhw yields (x * 0x8A8C >> 16) - x; every use is
; followed by a paddw of x that adds the missing x back.
align 16
x_s1sqr2:
    times 8 dw 0x8A8C
; cos(pi/8) * sqrt(2) - 1 in 1/65536 units; the paddw of x after each
; pmulhw supplies the "+ 1".
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B