;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

;------------------------------------------------------------------------------
; Intra-prediction kernels (dc, dc_top, dc_left, dc_128, v, tm) for square
; blocks of 4, 8, 16 and 32 pixels.
;
; Syntax : NASM/Yasm, Intel operand order, x86inc.asm macro layer.
; ABI    : argument loading, register naming (dstq, strideq, ...) and the
;          prologue/epilogue are generated by cglobal / RET / REP_RET.
;
; Every function takes its arguments in the order
;     dst, stride, above [, left]
; NOTE(review): presumably (uint8_t *dst, ptrdiff_t stride,
; const uint8_t *above, const uint8_t *left) -- confirm against the C
; prototypes that declare these symbols.
;
; Alignment: the XMM variants access memory with `mova`, so the above/left
; buffers and every dst row they touch are assumed to be 16-byte aligned.
;
; The MMX variants do not execute `emms`.
; NOTE(review): callers are presumably responsible for clearing the MMX
; state before using x87 code -- confirm.
;------------------------------------------------------------------------------

SECTION_RODATA

; Rounding terms added before the right shift in the dc predictors.
;   pw_N  = N   : both edges are summed (2*N samples, shift by log2(2*N)).
;   pw2_N = N/2 : a single edge is summed (N samples, shift by log2(N)).
pw_4:   times 8 dw 4
pw_8:   times 8 dw 8
pw_16:  times 8 dw 16
pw_32:  times 8 dw 32
; Constant fill value used by the dc_128 predictors.
dc_128: times 16 db 128
pw2_4:  times 8 dw 2
pw2_8:  times 8 dw 4
pw2_16: times 8 dw 8
pw2_32: times 8 dw 16

SECTION .text

;------------------------------------------------------------------------------
; dc_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1                    ; m1 = 0, reference for psadbw
  movd        m0, [aboveq]              ; low dword  = above[0..3]
  punpckldq   m0, [leftq]               ; high dword = left[0..3]
  psadbw      m0, m1                    ; word 0 = sum of the 8 bytes
  paddw       m0, [GLOBAL(pw_4)]        ; + rounding
  psraw       m0, 3                     ; / 8
  pshufw      m0, m0, 0x0               ; broadcast word 0
  packuswb    m0, m0                    ; words -> bytes
  movd        [dstq], m0
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  movd        [dstq], m0
  movd        [dstq+strideq], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_left_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with (sum(left[0..3]) + 2) >> 2.  `above` is not read.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movd        m0, [leftq]               ; left[0..3], upper dword zeroed
  psadbw      m0, m1                    ; word 0 = sum of the 4 bytes
  paddw       m0, [GLOBAL(pw2_4)]
  psraw       m0, 2
  pshufw      m0, m0, 0x0
  packuswb    m0, m0
  movd        [dstq], m0
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  movd        [dstq], m0
  movd        [dstq+strideq], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_top_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with (sum(above[0..3]) + 2) >> 2.  `left` is not read.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movd        m0, [aboveq]              ; above[0..3], upper dword zeroed
  psadbw      m0, m1
  paddw       m0, [GLOBAL(pw2_4)]
  psraw       m0, 2
  pshufw      m0, m0, 0x0
  packuswb    m0, m0
  movd        [dstq], m0
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  movd        [dstq], m0
  movd        [dstq+strideq], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movq        m0, [aboveq]
  movq        m2, [leftq]
  ; above/left are consumed; their registers are renamed from here on.
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  psadbw      m0, m1                    ; sum(above)
  psadbw      m2, m1                    ; sum(left)
  paddw       m0, m2
  paddw       m0, [GLOBAL(pw_8)]
  psraw       m0, 4
  pshufw      m0, m0, 0x0
  packuswb    m0, m0
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_top_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with (sum(above[0..7]) + 4) >> 3.  `left` is not read.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movq        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  psadbw      m0, m1
  paddw       m0, [GLOBAL(pw2_8)]
  psraw       m0, 3
  pshufw      m0, m0, 0x0
  packuswb    m0, m0
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_left_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with (sum(left[0..7]) + 4) >> 3.  `above` is not read.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  movq        m0, [leftq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  psadbw      m0, m1
  paddw       m0, [GLOBAL(pw2_8)]
  psraw       m0, 3
  pshufw      m0, m0, 0x0
  packuswb    m0, m0
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0

  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_128_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with the byte value 128.  above/left are not read.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  movd        m0, [GLOBAL(dc_128)]
  movd        [dstq], m0
  movd        [dstq+strideq], m0
  movd        [dstq+strideq*2], m0
  movd        [dstq+stride3q], m0
  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_128_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with the byte value 128.  above/left are not read.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  movq        m0, [GLOBAL(dc_128)]
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0
  RESTORE_GOT
  RET

;------------------------------------------------------------------------------
; dc_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  mova        m2, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4                ; 4 iterations x 4 rows
  psadbw      m0, m1                    ; two partial sums, one per qword
  psadbw      m2, m1
  paddw       m0, m2
  movhlps     m2, m0                    ; bring the high partial sum down
  paddw       m0, m2                    ; word 0 = total of all 32 bytes
  paddw       m0, [GLOBAL(pw_16)]
  psraw       m0, 5
  pshuflw     m0, m0, 0x0               ; broadcast word 0 in the low qword
  punpcklqdq  m0, m0                    ; ... and into the high qword
  packuswb    m0, m0
.loop:
  mova        [dstq], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_top_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with (sum(above[0..15]) + 8) >> 4.  `left` is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4
  psadbw      m0, m1                    ; two partial sums, one per qword
  ; Only word 0 of m0 is consumed below (pshuflw + punpcklqdq), so the
  ; stale high qword of m2 that movhlps leaves untouched is irrelevant.
  movhlps     m2, m0
  paddw       m0, m2                    ; word 0 = total of the 16 bytes
  paddw       m0, [GLOBAL(pw2_16)]
  psraw       m0, 4
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_left_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with (sum(left[0..15]) + 8) >> 4.  `above` is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [leftq]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4
  psadbw      m0, m1                    ; two partial sums, one per qword
  ; Only word 0 of m0 is consumed below (pshuflw + punpcklqdq), so the
  ; stale high qword of m2 that movhlps leaves untouched is irrelevant.
  movhlps     m2, m0
  paddw       m0, m2                    ; word 0 = total of the 16 bytes
  paddw       m0, [GLOBAL(pw2_16)]
  psraw       m0, 4
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_128_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with the byte value 128.  above/left are not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 4
  mova        m0, [GLOBAL(dc_128)]
.loop:
  mova        [dstq], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop
  RESTORE_GOT
  ; REP_RET, as in the other looped functions of this file: the return can
  ; directly follow the conditional branch.
  REP_RET

;------------------------------------------------------------------------------
; dc_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  mova        m2, [aboveq+16]
  mova        m3, [leftq]
  mova        m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8                ; 8 iterations x 4 rows
  psadbw      m0, m1
  psadbw      m2, m1
  psadbw      m3, m1
  psadbw      m4, m1
  paddw       m0, m2
  paddw       m0, m3
  paddw       m0, m4
  movhlps     m2, m0                    ; fold high partial sum into low
  paddw       m0, m2                    ; word 0 = total of all 64 bytes
  paddw       m0, [GLOBAL(pw_32)]
  psraw       m0, 6
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq], m0
  mova        [dstq+16], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m0
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_top_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with (sum(above[0..31]) + 16) >> 5.  `left` is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [aboveq]
  mova        m2, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8
  psadbw      m0, m1
  psadbw      m2, m1
  paddw       m0, m2
  movhlps     m2, m0
  paddw       m0, m2                    ; word 0 = total of the 32 bytes
  paddw       m0, [GLOBAL(pw2_32)]
  psraw       m0, 5
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq], m0
  mova        [dstq+16], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m0
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_left_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with (sum(left[0..31]) + 16) >> 5.  `above` is not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  pxor        m1, m1
  mova        m0, [leftq]
  mova        m2, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8
  psadbw      m0, m1
  psadbw      m2, m1
  paddw       m0, m2
  movhlps     m2, m0
  paddw       m0, m2                    ; word 0 = total of the 32 bytes
  paddw       m0, [GLOBAL(pw2_32)]
  psraw       m0, 5
  pshuflw     m0, m0, 0x0
  punpcklqdq  m0, m0
  packuswb    m0, m0
.loop:
  mova        [dstq], m0
  mova        [dstq+16], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m0
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET

;------------------------------------------------------------------------------
; dc_128_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with the byte value 128.  above/left are not read.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  DEFINE_ARGS dst, stride, stride3, lines4
  lea         stride3q, [strideq*3]
  mov         lines4d, 8
  mova        m0, [GLOBAL(dc_128)]
.loop:
  mova        [dstq], m0
  mova        [dstq+16], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m0
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m0
  lea         dstq, [dstq+strideq*4]
  dec         lines4d
  jnz .loop
  RESTORE_GOT
  ; REP_RET, as in the other looped functions of this file: the return can
  ; directly follow the conditional branch.
  REP_RET

;------------------------------------------------------------------------------
; v_predictor_4x4(dst, stride, above)
; Copies above[0..3] into each of the 4 rows of dst.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
  movd        m0, [aboveq]
  movd        [dstq], m0
  movd        [dstq+strideq], m0
  lea         dstq, [dstq+strideq*2]
  movd        [dstq], m0
  movd        [dstq+strideq], m0
  RET

;------------------------------------------------------------------------------
; v_predictor_8x8(dst, stride, above)
; Copies above[0..7] into each of the 8 rows of dst.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
  movq        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq], m0
  movq        [dstq+strideq], m0
  movq        [dstq+strideq*2], m0
  movq        [dstq+stride3q], m0
  RET

;------------------------------------------------------------------------------
; v_predictor_16x16(dst, stride, above)
; Copies above[0..15] into each of the 16 rows of dst.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea         stride3q, [strideq*3]
  mov         nlines4d, 4               ; 4 iterations x 4 rows
.loop:
  mova        [dstq], m0
  mova        [dstq+strideq], m0
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  dec         nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; v_predictor_32x32(dst, stride, above)
; Copies above[0..31] into each of the 32 rows of dst.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
  mova        m0, [aboveq]
  mova        m1, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, nlines4
  lea         stride3q, [strideq*3]
  mov         nlines4d, 8               ; 8 iterations x 4 rows
.loop:
  mova        [dstq], m0
  mova        [dstq+16], m1
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m1
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m1
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m1
  lea         dstq, [dstq+strideq*4]
  dec         nlines4d
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; tm_predictor_4x4(dst, stride, above, left)
; dst[r][c] = clamp_u8(left[r] + above[c] - above[-1]) for r, c in 0..3.
;
; Two rows are produced per iteration.  `line` counts from -2 up to 0 and
; indexes left relative to its end (left += 4).
; NOTE(review): each left byte is fetched with a 4-byte movd, so the last
; iteration reads up to 3 bytes beyond left[3]; the buffer needs readable
; padding.  above[-1] must be readable as well.
;------------------------------------------------------------------------------
INIT_MMX sse
cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
  pxor        m1, m1
  movd        m2, [aboveq-1]            ; byte 0 = above[-1] (top-left)
  movd        m0, [aboveq]
  punpcklbw   m2, m1                    ; bytes -> words
  punpcklbw   m0, m1
  pshufw      m2, m2, 0x0               ; broadcast top-left
  DEFINE_ARGS dst, stride, line, left
  mov         lineq, -2
  add         leftq, 4
  psubw       m0, m2                    ; m0 = above[c] - top-left
.loop:
  movd        m2, [leftq+lineq*2]       ; byte 0 = left[even row]
  movd        m3, [leftq+lineq*2+1]     ; byte 0 = left[odd row]
  punpcklbw   m2, m1
  punpcklbw   m3, m1
  pshufw      m2, m2, 0x0
  pshufw      m3, m3, 0x0
  paddw       m2, m0
  paddw       m3, m0
  packuswb    m2, m2                    ; saturate to 0..255
  packuswb    m3, m3
  movd        [dstq], m2
  movd        [dstq+strideq], m3
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; tm_predictor_8x8(dst, stride, above, left)
; dst[r][c] = clamp_u8(left[r] + above[c] - above[-1]) for r, c in 0..7.
;
; Two rows per iteration; `line` runs from -4 up to 0 (left += 8).
; NOTE(review): the 4-byte movd loads read up to 3 bytes beyond left[7] on
; the last iteration; above[-1] must be readable.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
  pxor        m1, m1
  movd        m2, [aboveq-1]            ; byte 0 = above[-1] (top-left)
  movq        m0, [aboveq]
  punpcklbw   m2, m1
  punpcklbw   m0, m1
  pshuflw     m2, m2, 0x0
  DEFINE_ARGS dst, stride, line, left
  mov         lineq, -4
  punpcklqdq  m2, m2                    ; top-left broadcast to 8 words
  add         leftq, 8
  psubw       m0, m2                    ; m0 = above[c] - top-left
.loop:
  movd        m2, [leftq+lineq*2]
  movd        m3, [leftq+lineq*2+1]
  punpcklbw   m2, m1
  punpcklbw   m3, m1
  pshuflw     m2, m2, 0x0
  pshuflw     m3, m3, 0x0
  punpcklqdq  m2, m2
  punpcklqdq  m3, m3
  paddw       m2, m0
  paddw       m3, m0
  packuswb    m2, m3                    ; low qword = even row, high = odd row
  movq        [dstq], m2
  movhps      [dstq+strideq], m2
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; tm_predictor_16x16(dst, stride, above, left)
; dst[r][c] = clamp_u8(left[r] + above[c] - above[-1]) for r, c in 0..15.
;
; Two rows per iteration; `line` runs from -8 up to 0 (left += 16).
; m0 / m4 hold (above - top-left) for columns 0..7 / 8..15.
; NOTE(review): the 4-byte movd loads read up to 3 bytes beyond left[15] on
; the last iteration; above[-1] must be readable.
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
  pxor        m1, m1
  movd        m2, [aboveq-1]            ; byte 0 = above[-1] (top-left)
  mova        m0, [aboveq]
  punpcklbw   m2, m1
  punpckhbw   m4, m0, m1                ; columns 8..15 as words
  punpcklbw   m0, m1                    ; columns 0..7 as words
  pshuflw     m2, m2, 0x0
  DEFINE_ARGS dst, stride, line, left
  mov         lineq, -8
  punpcklqdq  m2, m2
  add         leftq, 16
  psubw       m0, m2
  psubw       m4, m2
.loop:
  movd        m2, [leftq+lineq*2]
  movd        m3, [leftq+lineq*2+1]
  punpcklbw   m2, m1
  punpcklbw   m3, m1
  pshuflw     m2, m2, 0x0
  pshuflw     m3, m3, 0x0
  punpcklqdq  m2, m2
  punpcklqdq  m3, m3
  paddw       m5, m2, m0                ; even row, columns 0..7
  paddw       m6, m3, m0                ; odd row,  columns 0..7
  paddw       m2, m4                    ; even row, columns 8..15
  paddw       m3, m4                    ; odd row,  columns 8..15
  packuswb    m5, m2
  packuswb    m6, m3
  mova        [dstq], m5
  mova        [dstq+strideq], m6
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET

;------------------------------------------------------------------------------
; tm_predictor_32x32(dst, stride, above, left)
; dst[r][c] = clamp_u8(left[r] + above[c] - above[-1]) for r, c in 0..31.
;
; Built only when ARCH_X86_64 is set: the function uses registers m0..m9.
; Two rows per iteration; `line` runs from -16 up to 0 (left += 32).
; m0 / m3 / m4 / m5 hold (above - top-left) for columns 0..7 / 8..15 /
; 16..23 / 24..31.
; NOTE(review): the 4-byte movd loads read up to 3 bytes beyond left[31] on
; the last iteration; above[-1] must be readable.
;------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
  pxor        m1, m1
  movd        m2, [aboveq-1]            ; byte 0 = above[-1] (top-left)
  mova        m0, [aboveq]
  mova        m4, [aboveq+16]
  punpcklbw   m2, m1
  punpckhbw   m3, m0, m1
  punpckhbw   m5, m4, m1
  punpcklbw   m0, m1
  punpcklbw   m4, m1
  pshuflw     m2, m2, 0x0
  DEFINE_ARGS dst, stride, line, left
  mov         lineq, -16
  punpcklqdq  m2, m2
  add         leftq, 32
  psubw       m0, m2
  psubw       m3, m2
  psubw       m4, m2
  psubw       m5, m2
.loop:
  movd        m2, [leftq+lineq*2]
  movd        m6, [leftq+lineq*2+1]
  punpcklbw   m2, m1
  punpcklbw   m6, m1
  pshuflw     m2, m2, 0x0
  pshuflw     m6, m6, 0x0
  punpcklqdq  m2, m2                    ; m2 = left[even row] broadcast
  punpcklqdq  m6, m6                    ; m6 = left[odd row] broadcast
  ; Even row.
  paddw       m7, m2, m0
  paddw       m8, m2, m3
  paddw       m9, m2, m4
  paddw       m2, m5
  packuswb    m7, m8                    ; columns 0..15
  packuswb    m9, m2                    ; columns 16..31
  ; Odd row; its arithmetic is interleaved with the even-row stores.
  paddw       m2, m6, m0
  paddw       m8, m6, m3
  mova        [dstq], m7
  paddw       m7, m6, m4
  paddw       m6, m5
  mova        [dstq+16], m9
  packuswb    m2, m8                    ; columns 0..15
  packuswb    m7, m6                    ; columns 16..31
  mova        [dstq+strideq], m2
  mova        [dstq+strideq+16], m7
  lea         dstq, [dstq+strideq*2]
  inc         lineq
  jnz .loop
  REP_RET
%endif