;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;# VP8 bilinear sub-pixel prediction, AltiVec (VMX) implementation.
;# All entry points share the C signature:
;#   void bilinear_predict_NxN_ppc(unsigned char *src, int src_pitch,
;#                                 int x_offset, int y_offset,
;#                                 unsigned char *dst, int dst_pitch);
;# Args arrive per the 32-bit PowerPC ELF ABI in r3..r8.  Each routine
;# saves VRSAVE in r11, enables the vector registers it uses, and
;# restores VRSAVE before returning.

    .globl bilinear_predict4x4_ppc
    .globl bilinear_predict8x4_ppc
    .globl bilinear_predict8x8_ppc
    .globl bilinear_predict16x16_ppc

;# Load vector register V from LABEL+OFF.  R0 and R1 are scratch GPRs;
;# R1 is left holding the address of LABEL (load_vfilter relies on this).
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

;# Load the two vertical filter tap vectors selected by r6 (already
;# scaled by 32) into V0/V1.  NOTE: clobbers r9 and r10 — after this,
;# r10 no longer holds the constant 16 set up by HProlog.
.macro load_vfilter V0, V1
    load_c \V0, vfilter_b, r6, r9, r10

    addi    r6,  r6, 16
    lvx     \V1, r6, r10
.endm

;# Common first-pass prologue.  Scales r5 (x_offset) into a byte index
;# into hfilter_b and branches to \jump_label when x_offset == 0 (no
;# horizontal filtering).  Otherwise loads the horizontal taps and
;# constants, then scales r6 (y_offset) with a record-form shift so the
;# caller can later test CR0 (beq) to skip vertical filtering; the
;# intervening hfilter_8/hfilter_16 expansions do not touch CR0.
.macro HProlog jump_label
    ;# load up horizontal filter
    slwi.   r5, r5, 4           ;# index into horizontal filter array

    ;# index to the next set of vectors in the row.
    li      r10, 16
    li      r12, 32

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq     \jump_label

    load_c v20, hfilter_b, r5, r9, r0

    ;# setup constants
    ;# v14 permutation value for alignment
    load_c v28, b_hperm_b, 0, r9, r0

    ;# rounding added in on the multiply
    vspltisw v21, 8
    vspltisw v18, 3
    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040

    slwi.   r6, r6, 5           ;# index into vertical filter array
.endm

;# Filters a horizontal line
;# expects:
;#      r3  src_ptr
;#      r4  pitch
;#      r10 16
;#      r12 32
;#      v17 perm input
;#      v18 rounding
;#      v19 shift
;#      v20 filter taps
;#      v21 tmp
;#      v22 tmp
;#      v23 tmp
;#      v24 tmp
;#      v25 tmp
;#      v26 tmp
;#      v27 tmp
;#      v28 perm output
;#
.macro HFilter V
    vperm   v24, v21, v21, v10  ;# v20 = 0123 1234 2345 3456
    vperm   v25, v21, v21, v11  ;# v21 = 4567 5678 6789 789A

    vmsummbm v24, v20, v24, v18
    vmsummbm v25, v20, v25, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)

    vsrh    v24, v24, v19       ;# divide v0, v1 by 128

    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
.endm

;# Horizontally filter one 8-wide row into \V, advancing r3 by the
;# source pitch when increment_counter is non-zero.
.macro hfilter_8 V, increment_counter
    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 9 bytes wide, output is 8 bytes.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17

    HFilter \V
.endm

;# Load one unfiltered 8-wide row (left-justified in \V), advancing r3
;# when increment_counter is non-zero.  Used when x_offset == 0.
.macro load_and_align_8 V, increment_counter
    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input can span two vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif

    vperm   \V, v21, v22, v17
.endm

.macro write_aligned_8 V, increment_counter
    stvx    \V,  0, r7

.if \increment_counter
    add     r7, r7, r8
.endif
.endm

;# Second-pass vertical filter: P0 = (P0*v20 + P1*v21 + 64) >> 7,
;# computed in 16 bits via even/odd byte multiplies.  Expects v18 to
;# hold the 0x0040 rounding halfwords and v19 the shift count 7.
;# Result overwrites \P0; clobbers v22-v25.
.macro vfilter_16 P0 P1
    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
    vadduhm v22, v18, v22
    vmuloub v23, \P0, v20
    vadduhm v23, v18, v23

    vmuleub v24, \P1, v21
    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
    vmuloub v25, \P1, v21
    vadduhm v23, v23, v25       ;# Ro = odds

    vsrh    v22, v22, v19       ;# divide by 128
    vsrh    v23, v23, v19       ;# v16 v17 = evens, odds
    vmrghh  \P0, v22, v23       ;# v18 v19 = 16-bit result in order
    vmrglh  v23, v22, v23
    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
.endm

;# Store the low 8 bytes of \V to the (possibly unaligned) destination
;# by bouncing through the stack scratch area at 0(r1), then advance
;# the destination pointer \D by the pitch \P.  \R is a GPR scratch.
.macro w_8x8 V, D, R, P
    stvx    \V, 0, r1
    lwz     \R, 0(r1)
    stw     \R, 0(r7)
    lwz     \R, 4(r1)
    stw     \R, 4(r7)
    add     \D, \D, \P
.endm


    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict4x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_4x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     store_out_4x4_b

    hfilter_8 v4, 0

    b   second_pass_4x4_b

second_pass_4x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1

second_pass_4x4_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4

store_out_4x4_b:
    ;# Each output row is only 4 bytes; spill the vector to the stack
    ;#  scratch slot and copy one word per row.
    stvx    v0, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v1, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v2, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v3, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)

exit_4x4:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict8x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     store_out_8x4_b

    hfilter_8 v4, 0

    b   second_pass_8x4_b

second_pass_8x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1

second_pass_8x4_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4

store_out_8x4_b:
    ;# With an 8-byte pitch two rows pack into one vector store;
    ;#  otherwise fall back to word-at-a-time stores via the stack.
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x4_b

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8

    b       exit_8x4

store_aligned_8x4_b:
    load_c v10, b_hilo_b, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7

exit_8x4:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff0
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x8_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1
    hfilter_8 v4, 1
    hfilter_8 v5, 1
    hfilter_8 v6, 1
    hfilter_8 v7, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     store_out_8x8_b

    hfilter_8 v8, 0

    b   second_pass_8x8_b

second_pass_8x8_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1
    load_and_align_8  v5, 1
    load_and_align_8  v6, 1
    load_and_align_8  v7, 1
    load_and_align_8  v8, 0

second_pass_8x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8

store_out_8x8_b:
    ;# With an 8-byte pitch two rows pack into one vector store;
    ;#  otherwise fall back to word-at-a-time stores via the stack.
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x8_b

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8

    b       exit_8x8

store_aligned_8x8_b:
    load_c v10, b_hilo_b, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7

exit_8x8:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

;# Filters a horizontal line
;# expects:
;#      r3  src_ptr
;#      r4  pitch
;#      r10 16
;#      r12 32
;#      v17 perm input
;#      v18 rounding
;#      v19 shift
;#      v20 filter taps
;#      v21 tmp
;#      v22 tmp
;#      v23 tmp
;#      v24 tmp
;#      v25 tmp
;#      v26 tmp
;#      v27 tmp
;#      v28 perm output
;#
.macro hfilter_16 V, increment_counter

    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3
    lvx     v23, r12, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17
    vperm   v22, v22, v23, v17  ;# v8 v9 = 21 input pixels left-justified

    ;# set 0
    vmsummbm v24, v20, v21, v18 ;# taps times elements

    ;# set 1
    vsldoi  v23, v21, v22, 1
    vmsummbm v25, v20, v23, v18

    ;# set 2
    vsldoi  v23, v21, v22, 2
    vmsummbm v26, v20, v23, v18

    ;# set 3
    vsldoi  v23, v21, v22, 3
    vmsummbm v27, v20, v23, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F

    vsrh    v24, v24, v19       ;# divide v0, v1 by 128
    vsrh    v25, v25, v19

    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
.endm

;# Load one unfiltered 16-wide row (left-justified in \V), advancing r3
;# when increment_counter is non-zero.  Used when x_offset == 0.
.macro load_and_align_16 V, increment_counter
    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input can span two vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif

    vperm   \V, v21, v22, v17
.endm

.macro write_16 V, increment_counter
    stvx    \V,  0, r7

.if \increment_counter
    add     r7, r7, r8
.endif
.endm

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    ;# no stack frame needed: output rows are stored with stvx directly.
    HProlog second_pass_16x16_pre_copy_b

    hfilter_16 v0,  1
    hfilter_16 v1,  1
    hfilter_16 v2,  1
    hfilter_16 v3,  1
    hfilter_16 v4,  1
    hfilter_16 v5,  1
    hfilter_16 v6,  1
    hfilter_16 v7,  1
    hfilter_16 v8,  1
    hfilter_16 v9,  1
    hfilter_16 v10, 1
    hfilter_16 v11, 1
    hfilter_16 v12, 1
    hfilter_16 v13, 1
    hfilter_16 v14, 1
    hfilter_16 v15, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     store_out_16x16_b

    hfilter_16 v16, 0

    b   second_pass_16x16_b

second_pass_16x16_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0,  1
    load_and_align_16  v1,  1
    load_and_align_16  v2,  1
    load_and_align_16  v3,  1
    load_and_align_16  v4,  1
    load_and_align_16  v5,  1
    load_and_align_16  v6,  1
    load_and_align_16  v7,  1
    load_and_align_16  v8,  1
    load_and_align_16  v9,  1
    load_and_align_16  v10, 1
    load_and_align_16  v11, 1
    load_and_align_16  v12, 1
    load_and_align_16  v13, 1
    load_and_align_16  v14, 1
    load_and_align_16  v15, 1
    load_and_align_16  v16, 0

second_pass_16x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

store_out_16x16_b:

    write_16 v0,  1
    write_16 v1,  1
    write_16 v2,  1
    write_16 v3,  1
    write_16 v4,  1
    write_16 v5,  1
    write_16 v6,  1
    write_16 v7,  1
    write_16 v8,  1
    write_16 v9,  1
    write_16 v10, 1
    write_16 v11, 1
    write_16 v12, 1
    write_16 v13, 1
    write_16 v14, 1
    write_16 v15, 0

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

;# Horizontal taps: 8 entries of 16 bytes, indexed by x_offset*16.
;#  Each entry repeats the pair (128 - 16*x, 16*x) padded for vmsummbm.
    .align 4
hfilter_b:
    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0

;# Vertical taps: 8 entries of two splatted vectors each,
;#  indexed by y_offset*32 (pairs sum to 128).
    .align 4
vfilter_b:
    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112

    .align 4
b_hperm_b:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
b_0123_b:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
b_4567_b:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

;# FIX: alignment directive was missing.  b_hilo_b is read with lvx,
;#  which ignores the low 4 address bits, so it must be 16-byte aligned
;#  rather than relying on b_4567_b happening to be exactly 16 bytes.
    .align 4
b_hilo_b:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23