1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12#include "vpx_config.h" 13#include "vp8_rtcd.h" 14#include "vpx_ports/mem.h" 15#include "filter_x86.h" 16 17extern const short vp8_six_tap_mmx[8][6*8]; 18 19extern void vp8_filter_block1d_h6_mmx 20( 21 unsigned char *src_ptr, 22 unsigned short *output_ptr, 23 unsigned int src_pixels_per_line, 24 unsigned int pixel_step, 25 unsigned int output_height, 26 unsigned int output_width, 27 const short *vp8_filter 28); 29extern void vp8_filter_block1dc_v6_mmx 30( 31 unsigned short *src_ptr, 32 unsigned char *output_ptr, 33 int output_pitch, 34 unsigned int pixels_per_line, 35 unsigned int pixel_step, 36 unsigned int output_height, 37 unsigned int output_width, 38 const short *vp8_filter 39); 40extern void vp8_filter_block1d8_h6_sse2 41( 42 unsigned char *src_ptr, 43 unsigned short *output_ptr, 44 unsigned int src_pixels_per_line, 45 unsigned int pixel_step, 46 unsigned int output_height, 47 unsigned int output_width, 48 const short *vp8_filter 49); 50extern void vp8_filter_block1d16_h6_sse2 51( 52 unsigned char *src_ptr, 53 unsigned short *output_ptr, 54 unsigned int src_pixels_per_line, 55 unsigned int pixel_step, 56 unsigned int output_height, 57 unsigned int output_width, 58 const short *vp8_filter 59); 60extern void vp8_filter_block1d8_v6_sse2 61( 62 unsigned short *src_ptr, 63 unsigned char *output_ptr, 64 int dst_ptich, 65 unsigned int pixels_per_line, 66 unsigned int pixel_step, 67 unsigned int output_height, 68 unsigned int output_width, 69 const short *vp8_filter 70); 71extern void vp8_filter_block1d16_v6_sse2 72( 73 unsigned short *src_ptr, 74 unsigned char *output_ptr, 75 int dst_ptich, 76 unsigned int pixels_per_line, 77 unsigned int pixel_step, 78 unsigned int output_height, 79 unsigned int output_width, 80 const short *vp8_filter 81); 82extern void vp8_unpack_block1d16_h6_sse2 83( 84 unsigned char *src_ptr, 85 unsigned short *output_ptr, 86 unsigned int src_pixels_per_line, 87 unsigned int output_height, 88 unsigned int output_width 89); 90extern void vp8_filter_block1d8_h6_only_sse2 91( 92 unsigned char *src_ptr, 93 unsigned int src_pixels_per_line, 94 unsigned char *output_ptr, 95 int dst_ptich, 96 unsigned int output_height, 97 const short *vp8_filter 98); 99extern void vp8_filter_block1d16_h6_only_sse2 100( 101 unsigned char *src_ptr, 102 unsigned int src_pixels_per_line, 103 unsigned char *output_ptr, 104 int dst_ptich, 105 unsigned int output_height, 106 const short *vp8_filter 107); 108extern void vp8_filter_block1d8_v6_only_sse2 109( 110 unsigned char *src_ptr, 111 unsigned int src_pixels_per_line, 112 unsigned char *output_ptr, 113 int dst_ptich, 114 unsigned int output_height, 115 const short *vp8_filter 116); 117 118 119#if HAVE_MMX 120void vp8_sixtap_predict4x4_mmx 121( 122 unsigned char *src_ptr, 123 int src_pixels_per_line, 124 int xoffset, 125 int yoffset, 126 unsigned char *dst_ptr, 127 int dst_pitch 128) 129{ 130 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data bufffer used in filtering */ 131 const short *HFilter, *VFilter; 132 HFilter = vp8_six_tap_mmx[xoffset]; 133 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); 134 VFilter = vp8_six_tap_mmx[yoffset]; 135 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); 136 137} 138 139 140void vp8_sixtap_predict16x16_mmx 141( 142 unsigned char *src_ptr, 143 int src_pixels_per_line, 144 int xoffset, 145 int yoffset, 146 unsigned char *dst_ptr, 147 int dst_pitch 148) 149{ 150 151 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ 152 153 const short *HFilter, *VFilter; 154 155 156 HFilter = vp8_six_tap_mmx[xoffset]; 157 158 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); 159 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); 160 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); 161 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); 162 163 VFilter = vp8_six_tap_mmx[yoffset]; 164 vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); 165 vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); 166 vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); 167 vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); 168 169} 170 171 172void vp8_sixtap_predict8x8_mmx 173( 174 unsigned char *src_ptr, 175 int src_pixels_per_line, 176 int xoffset, 177 int yoffset, 178 unsigned char *dst_ptr, 179 int dst_pitch 180) 181{ 182 183 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 184 185 const short *HFilter, *VFilter; 186 187 HFilter = vp8_six_tap_mmx[xoffset]; 188 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); 189 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); 190 191 VFilter = vp8_six_tap_mmx[yoffset]; 192 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); 193 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); 194 195} 196 197 198void vp8_sixtap_predict8x4_mmx 199( 200 unsigned char *src_ptr, 201 int src_pixels_per_line, 202 int xoffset, 203 int yoffset, 204 unsigned char *dst_ptr, 205 int dst_pitch 206) 207{ 208 209 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 210 211 const short *HFilter, *VFilter; 212 213 HFilter = vp8_six_tap_mmx[xoffset]; 214 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); 215 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); 216 217 VFilter = vp8_six_tap_mmx[yoffset]; 218 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); 219 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); 220 221} 222 223 224 225void vp8_bilinear_predict16x16_mmx 226( 227 unsigned char *src_ptr, 228 int src_pixels_per_line, 229 int xoffset, 230 int yoffset, 231 unsigned char *dst_ptr, 232 int dst_pitch 233) 234{ 235 vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch); 236 vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch); 237 vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch); 238 vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); 239} 240#endif 241 242 243#if HAVE_SSE2 244void vp8_sixtap_predict16x16_sse2 245( 246 unsigned char *src_ptr, 247 int src_pixels_per_line, 248 int xoffset, 249 int yoffset, 250 unsigned char *dst_ptr, 251 int dst_pitch 252 253) 254{ 255 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ 256 257 const short *HFilter, *VFilter; 258 259 if (xoffset) 260 { 261 if (yoffset) 262 { 263 HFilter = vp8_six_tap_mmx[xoffset]; 264 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); 265 VFilter = vp8_six_tap_mmx[yoffset]; 266 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); 267 } 268 else 269 { 270 /* First-pass only */ 271 HFilter = vp8_six_tap_mmx[xoffset]; 272 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); 273 } 274 } 275 else 276 { 277 /* Second-pass only */ 278 VFilter = vp8_six_tap_mmx[yoffset]; 279 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); 280 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); 281 } 282} 283 284 285void vp8_sixtap_predict8x8_sse2 286( 287 unsigned char *src_ptr, 288 int src_pixels_per_line, 289 int xoffset, 290 int yoffset, 291 unsigned char *dst_ptr, 292 int dst_pitch 293) 294{ 295 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 296 const short *HFilter, *VFilter; 297 298 if (xoffset) 299 { 300 if (yoffset) 301 { 302 HFilter = vp8_six_tap_mmx[xoffset]; 303 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); 304 VFilter = vp8_six_tap_mmx[yoffset]; 305 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); 306 } 307 else 308 { 309 /* First-pass only */ 310 HFilter = vp8_six_tap_mmx[xoffset]; 311 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); 312 } 313 } 314 else 315 { 316 /* Second-pass only */ 317 VFilter = vp8_six_tap_mmx[yoffset]; 318 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); 319 } 320} 321 322 323void vp8_sixtap_predict8x4_sse2 324( 325 unsigned char *src_ptr, 326 int src_pixels_per_line, 327 int xoffset, 328 int yoffset, 329 unsigned char *dst_ptr, 330 int dst_pitch 331) 332{ 333 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 334 const short *HFilter, *VFilter; 335 336 if (xoffset) 337 { 338 if (yoffset) 339 { 340 HFilter = vp8_six_tap_mmx[xoffset]; 341 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); 342 VFilter = vp8_six_tap_mmx[yoffset]; 343 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); 344 } 345 else 346 { 347 /* First-pass only */ 348 HFilter = vp8_six_tap_mmx[xoffset]; 349 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); 350 } 351 } 352 else 353 { 354 /* Second-pass only */ 355 VFilter = vp8_six_tap_mmx[yoffset]; 356 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); 357 } 358} 359 360#endif 361 362#if HAVE_SSSE3 363 364extern void vp8_filter_block1d8_h6_ssse3 365( 366 unsigned char *src_ptr, 367 unsigned int src_pixels_per_line, 368 unsigned char *output_ptr, 369 unsigned int output_pitch, 370 unsigned int output_height, 371 unsigned int vp8_filter_index 372); 373 374extern void vp8_filter_block1d16_h6_ssse3 375( 376 unsigned char *src_ptr, 377 unsigned int src_pixels_per_line, 378 unsigned char *output_ptr, 379 unsigned int output_pitch, 380 unsigned int output_height, 381 unsigned int vp8_filter_index 382); 383 384extern void vp8_filter_block1d16_v6_ssse3 385( 386 unsigned char *src_ptr, 387 unsigned int src_pitch, 388 unsigned char *output_ptr, 389 unsigned int out_pitch, 390 unsigned int output_height, 391 unsigned int vp8_filter_index 392); 393 394extern void vp8_filter_block1d8_v6_ssse3 395( 396 unsigned char *src_ptr, 397 unsigned int src_pitch, 398 unsigned char *output_ptr, 399 unsigned int out_pitch, 400 unsigned int output_height, 401 unsigned int vp8_filter_index 402); 403 404extern void vp8_filter_block1d4_h6_ssse3 405( 406 unsigned char *src_ptr, 407 unsigned int src_pixels_per_line, 408 unsigned char *output_ptr, 409 unsigned int output_pitch, 410 unsigned int output_height, 411 unsigned int vp8_filter_index 412); 413 414extern void vp8_filter_block1d4_v6_ssse3 415( 416 unsigned char *src_ptr, 417 unsigned int src_pitch, 418 unsigned char *output_ptr, 419 unsigned int out_pitch, 420 unsigned int output_height, 421 unsigned int vp8_filter_index 422); 423 424void vp8_sixtap_predict16x16_ssse3 425( 426 unsigned char *src_ptr, 427 int src_pixels_per_line, 428 int xoffset, 429 int yoffset, 430 unsigned char *dst_ptr, 431 int dst_pitch 432 433) 434{ 435 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24); 436 437 if (xoffset) 438 { 439 if (yoffset) 440 { 441 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 442 src_pixels_per_line, FData2, 443 16, 21, xoffset); 444 vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 445 16, yoffset); 446 } 447 else 448 { 449 /* First-pass only */ 450 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, 451 dst_ptr, dst_pitch, 16, xoffset); 452 } 453 } 454 else 455 { 456 if (yoffset) 457 { 458 /* Second-pass only */ 459 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 460 src_pixels_per_line, 461 dst_ptr, dst_pitch, 16, yoffset); 462 } 463 else 464 { 465 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 466 * yoffset==0) case correctly. Add copy function here to guarantee 467 * six-tap function handles all possible offsets. */ 468 vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 469 } 470 } 471} 472 473void vp8_sixtap_predict8x8_ssse3 474( 475 unsigned char *src_ptr, 476 int src_pixels_per_line, 477 int xoffset, 478 int yoffset, 479 unsigned char *dst_ptr, 480 int dst_pitch 481) 482{ 483 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); 484 485 if (xoffset) 486 { 487 if (yoffset) 488 { 489 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 490 src_pixels_per_line, FData2, 491 8, 13, xoffset); 492 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 493 8, yoffset); 494 } 495 else 496 { 497 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, 498 dst_ptr, dst_pitch, 8, xoffset); 499 } 500 } 501 else 502 { 503 if (yoffset) 504 { 505 /* Second-pass only */ 506 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 507 src_pixels_per_line, 508 dst_ptr, dst_pitch, 8, yoffset); 509 } 510 else 511 { 512 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 513 * yoffset==0) case correctly. Add copy function here to guarantee 514 * six-tap function handles all possible offsets. */ 515 vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 516 } 517 } 518} 519 520 521void vp8_sixtap_predict8x4_ssse3 522( 523 unsigned char *src_ptr, 524 int src_pixels_per_line, 525 int xoffset, 526 int yoffset, 527 unsigned char *dst_ptr, 528 int dst_pitch 529) 530{ 531 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); 532 533 if (xoffset) 534 { 535 if (yoffset) 536 { 537 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 538 src_pixels_per_line, FData2, 539 8, 9, xoffset); 540 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 541 4, yoffset); 542 } 543 else 544 { 545 /* First-pass only */ 546 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, 547 dst_ptr, dst_pitch, 4, xoffset); 548 } 549 } 550 else 551 { 552 if (yoffset) 553 { 554 /* Second-pass only */ 555 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 556 src_pixels_per_line, 557 dst_ptr, dst_pitch, 4, yoffset); 558 } 559 else 560 { 561 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 562 * yoffset==0) case correctly. Add copy function here to guarantee 563 * six-tap function handles all possible offsets. */ 564 vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 565 } 566 } 567} 568 569void vp8_sixtap_predict4x4_ssse3 570( 571 unsigned char *src_ptr, 572 int src_pixels_per_line, 573 int xoffset, 574 int yoffset, 575 unsigned char *dst_ptr, 576 int dst_pitch 577) 578{ 579 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9); 580 581 if (xoffset) 582 { 583 if (yoffset) 584 { 585 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 586 src_pixels_per_line, 587 FData2, 4, 9, xoffset); 588 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 589 4, yoffset); 590 } 591 else 592 { 593 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, 594 dst_ptr, dst_pitch, 4, xoffset); 595 } 596 } 597 else 598 { 599 if (yoffset) 600 { 601 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 602 src_pixels_per_line, 603 dst_ptr, dst_pitch, 4, yoffset); 604 } 605 else 606 { 607 /* ssse3 second-pass only function couldn't handle (xoffset==0 && 608 * yoffset==0) case correctly. Add copy function here to guarantee 609 * six-tap function handles all possible offsets. */ 610 int r; 611 612 for (r = 0; r < 4; r++) 613 { 614 dst_ptr[0] = src_ptr[0]; 615 dst_ptr[1] = src_ptr[1]; 616 dst_ptr[2] = src_ptr[2]; 617 dst_ptr[3] = src_ptr[3]; 618 dst_ptr += dst_pitch; 619 src_ptr += src_pixels_per_line; 620 } 621 } 622 } 623} 624 625#endif 626